xmlregexp.c revision 4f82c8a161404d1fe6ca9fe6edb404df00e14f11
1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22/* #define DEBUG_ERR */ 23 24#include <stdio.h> 25#include <string.h> 26#ifdef HAVE_LIMITS_H 27#include <limits.h> 28#endif 29 30#include <libxml/tree.h> 31#include <libxml/parserInternals.h> 32#include <libxml/xmlregexp.h> 33#include <libxml/xmlautomata.h> 34#include <libxml/xmlunicode.h> 35 36#ifndef INT_MAX 37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38#endif 39 40/* #define DEBUG_REGEXP_GRAPH */ 41/* #define DEBUG_REGEXP_EXEC */ 42/* #define DEBUG_PUSH */ 43/* #define DEBUG_COMPACTION */ 44 45#define ERROR(str) \ 46 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 47 xmlRegexpErrCompile(ctxt, str); 48#define NEXT ctxt->cur++ 49#define CUR (*(ctxt->cur)) 50#define NXT(index) (ctxt->cur[index]) 51 52#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 53#define NEXTL(l) ctxt->cur += l; 54#define XML_REG_STRING_SEPARATOR '|' 55 56/** 57 * TODO: 58 * 59 * macro to flag unimplemented blocks 60 */ 61#define TODO \ 62 xmlGenericError(xmlGenericErrorContext, \ 63 "Unimplemented block at %s:%d\n", \ 64 __FILE__, __LINE__); 65 66/************************************************************************ 67 * * 68 * Datatypes and structures * 69 * * 70 ************************************************************************/ 71 72typedef enum { 73 XML_REGEXP_EPSILON = 1, 74 XML_REGEXP_CHARVAL, 75 XML_REGEXP_RANGES, 76 XML_REGEXP_SUBREG, 77 XML_REGEXP_STRING, 78 XML_REGEXP_ANYCHAR, /* . */ 79 XML_REGEXP_ANYSPACE, /* \s */ 80 XML_REGEXP_NOTSPACE, /* \S */ 81 XML_REGEXP_INITNAME, /* \l */ 82 XML_REGEXP_NOTINITNAME, /* \l */ 83 XML_REGEXP_NAMECHAR, /* \c */ 84 XML_REGEXP_NOTNAMECHAR, /* \C */ 85 XML_REGEXP_DECIMAL, /* \d */ 86 XML_REGEXP_NOTDECIMAL, /* \d */ 87 XML_REGEXP_REALCHAR, /* \w */ 88 XML_REGEXP_NOTREALCHAR, /* \w */ 89 XML_REGEXP_LETTER, 90 XML_REGEXP_LETTER_UPPERCASE, 91 XML_REGEXP_LETTER_LOWERCASE, 92 XML_REGEXP_LETTER_TITLECASE, 93 XML_REGEXP_LETTER_MODIFIER, 94 XML_REGEXP_LETTER_OTHERS, 95 XML_REGEXP_MARK, 96 XML_REGEXP_MARK_NONSPACING, 97 XML_REGEXP_MARK_SPACECOMBINING, 98 XML_REGEXP_MARK_ENCLOSING, 99 XML_REGEXP_NUMBER, 100 XML_REGEXP_NUMBER_DECIMAL, 101 XML_REGEXP_NUMBER_LETTER, 102 XML_REGEXP_NUMBER_OTHERS, 103 XML_REGEXP_PUNCT, 104 XML_REGEXP_PUNCT_CONNECTOR, 105 XML_REGEXP_PUNCT_DASH, 106 XML_REGEXP_PUNCT_OPEN, 107 XML_REGEXP_PUNCT_CLOSE, 108 XML_REGEXP_PUNCT_INITQUOTE, 109 XML_REGEXP_PUNCT_FINQUOTE, 110 XML_REGEXP_PUNCT_OTHERS, 111 XML_REGEXP_SEPAR, 112 XML_REGEXP_SEPAR_SPACE, 113 XML_REGEXP_SEPAR_LINE, 114 XML_REGEXP_SEPAR_PARA, 115 XML_REGEXP_SYMBOL, 116 XML_REGEXP_SYMBOL_MATH, 117 XML_REGEXP_SYMBOL_CURRENCY, 118 XML_REGEXP_SYMBOL_MODIFIER, 119 XML_REGEXP_SYMBOL_OTHERS, 120 XML_REGEXP_OTHER, 121 XML_REGEXP_OTHER_CONTROL, 122 XML_REGEXP_OTHER_FORMAT, 123 XML_REGEXP_OTHER_PRIVATE, 124 XML_REGEXP_OTHER_NA, 125 XML_REGEXP_BLOCK_NAME 126} xmlRegAtomType; 127 128typedef enum { 129 XML_REGEXP_QUANT_EPSILON = 1, 130 XML_REGEXP_QUANT_ONCE, 131 XML_REGEXP_QUANT_OPT, 132 XML_REGEXP_QUANT_MULT, 133 XML_REGEXP_QUANT_PLUS, 134 XML_REGEXP_QUANT_ONCEONLY, 135 XML_REGEXP_QUANT_ALL, 136 XML_REGEXP_QUANT_RANGE 137} xmlRegQuantType; 138 139typedef enum { 140 XML_REGEXP_START_STATE = 1, 141 XML_REGEXP_FINAL_STATE, 142 XML_REGEXP_TRANS_STATE, 143 XML_REGEXP_SINK_STATE 144} xmlRegStateType; 145 146typedef enum { 147 XML_REGEXP_MARK_NORMAL = 0, 148 XML_REGEXP_MARK_START, 149 XML_REGEXP_MARK_VISITED 150} xmlRegMarkedType; 151 152typedef struct _xmlRegRange xmlRegRange; 153typedef xmlRegRange *xmlRegRangePtr; 154 155struct _xmlRegRange { 156 int neg; /* 0 normal, 1 not, 2 exclude */ 157 xmlRegAtomType type; 158 int start; 159 int end; 160 xmlChar *blockName; 161}; 162 163typedef struct _xmlRegAtom xmlRegAtom; 164typedef xmlRegAtom *xmlRegAtomPtr; 165 166typedef struct _xmlAutomataState xmlRegState; 167typedef xmlRegState *xmlRegStatePtr; 168 169struct _xmlRegAtom { 170 int no; 171 xmlRegAtomType type; 172 xmlRegQuantType quant; 173 int min; 174 int max; 175 176 void *valuep; 177 void *valuep2; 178 int neg; 179 int codepoint; 180 xmlRegStatePtr start; 181 xmlRegStatePtr stop; 182 int maxRanges; 183 int nbRanges; 184 xmlRegRangePtr *ranges; 185 void *data; 186}; 187 188typedef struct _xmlRegCounter xmlRegCounter; 189typedef xmlRegCounter *xmlRegCounterPtr; 190 191struct _xmlRegCounter { 192 int min; 193 int max; 194}; 195 196typedef struct _xmlRegTrans xmlRegTrans; 197typedef xmlRegTrans *xmlRegTransPtr; 198 199struct _xmlRegTrans { 200 xmlRegAtomPtr atom; 201 int to; 202 int counter; 203 int count; 204}; 205 206struct _xmlAutomataState { 207 xmlRegStateType type; 208 xmlRegMarkedType mark; 209 xmlRegMarkedType reached; 210 int no; 211 int maxTrans; 212 int nbTrans; 213 xmlRegTrans *trans; 214 /* knowing states ponting to us can speed things up */ 215 int maxTransTo; 216 int nbTransTo; 217 int *transTo; 218}; 219 220typedef struct _xmlAutomata xmlRegParserCtxt; 221typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 222 223struct _xmlAutomata { 224 xmlChar *string; 225 xmlChar *cur; 226 227 int error; 228 int neg; 229 230 xmlRegStatePtr start; 231 xmlRegStatePtr end; 232 xmlRegStatePtr state; 233 234 xmlRegAtomPtr atom; 235 236 int maxAtoms; 237 int nbAtoms; 238 xmlRegAtomPtr *atoms; 239 240 int maxStates; 241 int nbStates; 242 xmlRegStatePtr *states; 243 244 int maxCounters; 245 int nbCounters; 246 xmlRegCounter *counters; 247 248 int determinist; 249 int negs; 250}; 251 252struct _xmlRegexp { 253 xmlChar *string; 254 int nbStates; 255 xmlRegStatePtr *states; 256 int nbAtoms; 257 xmlRegAtomPtr *atoms; 258 int nbCounters; 259 xmlRegCounter *counters; 260 int determinist; 261 /* 262 * That's the compact form for determinists automatas 263 */ 264 int nbstates; 265 int *compact; 266 void **transdata; 267 int nbstrings; 268 xmlChar **stringMap; 269}; 270 271typedef struct _xmlRegExecRollback xmlRegExecRollback; 272typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 273 274struct _xmlRegExecRollback { 275 xmlRegStatePtr state;/* the current state */ 276 int index; /* the index in the input stack */ 277 int nextbranch; /* the next transition to explore in that state */ 278 int *counts; /* save the automata state if it has some */ 279}; 280 281typedef struct _xmlRegInputToken xmlRegInputToken; 282typedef xmlRegInputToken *xmlRegInputTokenPtr; 283 284struct _xmlRegInputToken { 285 xmlChar *value; 286 void *data; 287}; 288 289struct _xmlRegExecCtxt { 290 int status; /* execution status != 0 indicate an error */ 291 int determinist; /* did we find an indeterministic behaviour */ 292 xmlRegexpPtr comp; /* the compiled regexp */ 293 xmlRegExecCallbacks callback; 294 void *data; 295 296 xmlRegStatePtr state;/* the current state */ 297 int transno; /* the current transition on that state */ 298 int transcount; /* the number of chars in char counted transitions */ 299 300 /* 301 * A stack of rollback states 302 */ 303 int maxRollbacks; 304 int nbRollbacks; 305 xmlRegExecRollback *rollbacks; 306 307 /* 308 * The state of the automata if any 309 */ 310 int *counts; 311 312 /* 313 * The input stack 314 */ 315 int inputStackMax; 316 int inputStackNr; 317 int index; 318 int *charStack; 319 const xmlChar *inputString; /* when operating on characters */ 320 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 321 322 /* 323 * error handling 324 */ 325 int errStateNo; /* the error state number */ 326 xmlRegStatePtr errState; /* the error state */ 327 xmlChar *errString; /* the string raising the error */ 328 int *errCounts; /* counters at the error state */ 329}; 330 331#define REGEXP_ALL_COUNTER 0x123456 332#define REGEXP_ALL_LAX_COUNTER 0x123457 333 334static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 335static void xmlRegFreeState(xmlRegStatePtr state); 336static void xmlRegFreeAtom(xmlRegAtomPtr atom); 337static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 338 339/************************************************************************ 340 * * 341 * Regexp memory error handler * 342 * * 343 ************************************************************************/ 344/** 345 * xmlRegexpErrMemory: 346 * @extra: extra information 347 * 348 * Handle an out of memory condition 349 */ 350static void 351xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 352{ 353 const char *regexp = NULL; 354 if (ctxt != NULL) { 355 regexp = (const char *) ctxt->string; 356 ctxt->error = XML_ERR_NO_MEMORY; 357 } 358 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 359 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 360 regexp, NULL, 0, 0, 361 "Memory allocation failed : %s\n", extra); 362} 363 364/** 365 * xmlRegexpErrCompile: 366 * @extra: extra information 367 * 368 * Handle a compilation failure 369 */ 370static void 371xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 372{ 373 const char *regexp = NULL; 374 int idx = 0; 375 376 if (ctxt != NULL) { 377 regexp = (const char *) ctxt->string; 378 idx = ctxt->cur - ctxt->string; 379 ctxt->error = XML_REGEXP_COMPILE_ERROR; 380 } 381 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 382 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 383 regexp, NULL, idx, 0, 384 "failed to compile: %s\n", extra); 385} 386 387/************************************************************************ 388 * * 389 * Allocation/Deallocation * 390 * * 391 ************************************************************************/ 392 393static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 394/** 395 * xmlRegEpxFromParse: 396 * @ctxt: the parser context used to build it 397 * 398 * Allocate a new regexp and fill it with the result from the parser 399 * 400 * Returns the new regexp or NULL in case of error 401 */ 402static xmlRegexpPtr 403xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 404 xmlRegexpPtr ret; 405 406 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 407 if (ret == NULL) { 408 xmlRegexpErrMemory(ctxt, "compiling regexp"); 409 return(NULL); 410 } 411 memset(ret, 0, sizeof(xmlRegexp)); 412 ret->string = ctxt->string; 413 ret->nbStates = ctxt->nbStates; 414 ret->states = ctxt->states; 415 ret->nbAtoms = ctxt->nbAtoms; 416 ret->atoms = ctxt->atoms; 417 ret->nbCounters = ctxt->nbCounters; 418 ret->counters = ctxt->counters; 419 ret->determinist = ctxt->determinist; 420 421 if ((ret->determinist != 0) && 422 (ret->nbCounters == 0) && 423 (ctxt->negs == 0) && 424 (ret->atoms != NULL) && 425 (ret->atoms[0] != NULL) && 426 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 427 int i, j, nbstates = 0, nbatoms = 0; 428 int *stateRemap; 429 int *stringRemap; 430 int *transitions; 431 void **transdata; 432 xmlChar **stringMap; 433 xmlChar *value; 434 435 /* 436 * Switch to a compact representation 437 * 1/ counting the effective number of states left 438 * 2/ counting the unique number of atoms, and check that 439 * they are all of the string type 440 * 3/ build a table state x atom for the transitions 441 */ 442 443 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 444 if (stateRemap == NULL) { 445 xmlRegexpErrMemory(ctxt, "compiling regexp"); 446 xmlFree(ret); 447 return(NULL); 448 } 449 for (i = 0;i < ret->nbStates;i++) { 450 if (ret->states[i] != NULL) { 451 stateRemap[i] = nbstates; 452 nbstates++; 453 } else { 454 stateRemap[i] = -1; 455 } 456 } 457#ifdef DEBUG_COMPACTION 458 printf("Final: %d states\n", nbstates); 459#endif 460 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 461 if (stringMap == NULL) { 462 xmlRegexpErrMemory(ctxt, "compiling regexp"); 463 xmlFree(stateRemap); 464 xmlFree(ret); 465 return(NULL); 466 } 467 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 468 if (stringRemap == NULL) { 469 xmlRegexpErrMemory(ctxt, "compiling regexp"); 470 xmlFree(stringMap); 471 xmlFree(stateRemap); 472 xmlFree(ret); 473 return(NULL); 474 } 475 for (i = 0;i < ret->nbAtoms;i++) { 476 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 477 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 478 value = ret->atoms[i]->valuep; 479 for (j = 0;j < nbatoms;j++) { 480 if (xmlStrEqual(stringMap[j], value)) { 481 stringRemap[i] = j; 482 break; 483 } 484 } 485 if (j >= nbatoms) { 486 stringRemap[i] = nbatoms; 487 stringMap[nbatoms] = xmlStrdup(value); 488 if (stringMap[nbatoms] == NULL) { 489 for (i = 0;i < nbatoms;i++) 490 xmlFree(stringMap[i]); 491 xmlFree(stringRemap); 492 xmlFree(stringMap); 493 xmlFree(stateRemap); 494 xmlFree(ret); 495 return(NULL); 496 } 497 nbatoms++; 498 } 499 } else { 500 xmlFree(stateRemap); 501 xmlFree(stringRemap); 502 for (i = 0;i < nbatoms;i++) 503 xmlFree(stringMap[i]); 504 xmlFree(stringMap); 505 xmlFree(ret); 506 return(NULL); 507 } 508 } 509#ifdef DEBUG_COMPACTION 510 printf("Final: %d atoms\n", nbatoms); 511#endif 512 transitions = (int *) xmlMalloc((nbstates + 1) * 513 (nbatoms + 1) * sizeof(int)); 514 if (transitions == NULL) { 515 xmlFree(stateRemap); 516 xmlFree(stringRemap); 517 xmlFree(stringMap); 518 xmlFree(ret); 519 return(NULL); 520 } 521 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 522 523 /* 524 * Allocate the transition table. The first entry for each 525 * state corresponds to the state type. 526 */ 527 transdata = NULL; 528 529 for (i = 0;i < ret->nbStates;i++) { 530 int stateno, atomno, targetno, prev; 531 xmlRegStatePtr state; 532 xmlRegTransPtr trans; 533 534 stateno = stateRemap[i]; 535 if (stateno == -1) 536 continue; 537 state = ret->states[i]; 538 539 transitions[stateno * (nbatoms + 1)] = state->type; 540 541 for (j = 0;j < state->nbTrans;j++) { 542 trans = &(state->trans[j]); 543 if ((trans->to == -1) || (trans->atom == NULL)) 544 continue; 545 atomno = stringRemap[trans->atom->no]; 546 if ((trans->atom->data != NULL) && (transdata == NULL)) { 547 transdata = (void **) xmlMalloc(nbstates * nbatoms * 548 sizeof(void *)); 549 if (transdata != NULL) 550 memset(transdata, 0, 551 nbstates * nbatoms * sizeof(void *)); 552 else { 553 xmlRegexpErrMemory(ctxt, "compiling regexp"); 554 break; 555 } 556 } 557 targetno = stateRemap[trans->to]; 558 /* 559 * if the same atom can generate transitions to 2 different 560 * states then it means the automata is not determinist and 561 * the compact form can't be used ! 562 */ 563 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 564 if (prev != 0) { 565 if (prev != targetno + 1) { 566 ret->determinist = 0; 567#ifdef DEBUG_COMPACTION 568 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 569 i, j, trans->atom->no, trans->to, atomno, targetno); 570 printf(" previous to is %d\n", prev); 571#endif 572 ret->determinist = 0; 573 if (transdata != NULL) 574 xmlFree(transdata); 575 xmlFree(transitions); 576 xmlFree(stateRemap); 577 xmlFree(stringRemap); 578 for (i = 0;i < nbatoms;i++) 579 xmlFree(stringMap[i]); 580 xmlFree(stringMap); 581 goto not_determ; 582 } 583 } else { 584#if 0 585 printf("State %d trans %d: atom %d to %d : %d to %d\n", 586 i, j, trans->atom->no, trans->to, atomno, targetno); 587#endif 588 transitions[stateno * (nbatoms + 1) + atomno + 1] = 589 targetno + 1; /* to avoid 0 */ 590 if (transdata != NULL) 591 transdata[stateno * nbatoms + atomno] = 592 trans->atom->data; 593 } 594 } 595 } 596 ret->determinist = 1; 597#ifdef DEBUG_COMPACTION 598 /* 599 * Debug 600 */ 601 for (i = 0;i < nbstates;i++) { 602 for (j = 0;j < nbatoms + 1;j++) { 603 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 604 } 605 printf("\n"); 606 } 607 printf("\n"); 608#endif 609 /* 610 * Cleanup of the old data 611 */ 612 if (ret->states != NULL) { 613 for (i = 0;i < ret->nbStates;i++) 614 xmlRegFreeState(ret->states[i]); 615 xmlFree(ret->states); 616 } 617 ret->states = NULL; 618 ret->nbStates = 0; 619 if (ret->atoms != NULL) { 620 for (i = 0;i < ret->nbAtoms;i++) 621 xmlRegFreeAtom(ret->atoms[i]); 622 xmlFree(ret->atoms); 623 } 624 ret->atoms = NULL; 625 ret->nbAtoms = 0; 626 627 ret->compact = transitions; 628 ret->transdata = transdata; 629 ret->stringMap = stringMap; 630 ret->nbstrings = nbatoms; 631 ret->nbstates = nbstates; 632 xmlFree(stateRemap); 633 xmlFree(stringRemap); 634 } 635not_determ: 636 ctxt->string = NULL; 637 ctxt->nbStates = 0; 638 ctxt->states = NULL; 639 ctxt->nbAtoms = 0; 640 ctxt->atoms = NULL; 641 ctxt->nbCounters = 0; 642 ctxt->counters = NULL; 643 return(ret); 644} 645 646/** 647 * xmlRegNewParserCtxt: 648 * @string: the string to parse 649 * 650 * Allocate a new regexp parser context 651 * 652 * Returns the new context or NULL in case of error 653 */ 654static xmlRegParserCtxtPtr 655xmlRegNewParserCtxt(const xmlChar *string) { 656 xmlRegParserCtxtPtr ret; 657 658 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 659 if (ret == NULL) 660 return(NULL); 661 memset(ret, 0, sizeof(xmlRegParserCtxt)); 662 if (string != NULL) 663 ret->string = xmlStrdup(string); 664 ret->cur = ret->string; 665 ret->neg = 0; 666 ret->negs = 0; 667 ret->error = 0; 668 ret->determinist = -1; 669 return(ret); 670} 671 672/** 673 * xmlRegNewRange: 674 * @ctxt: the regexp parser context 675 * @neg: is that negative 676 * @type: the type of range 677 * @start: the start codepoint 678 * @end: the end codepoint 679 * 680 * Allocate a new regexp range 681 * 682 * Returns the new range or NULL in case of error 683 */ 684static xmlRegRangePtr 685xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 686 int neg, xmlRegAtomType type, int start, int end) { 687 xmlRegRangePtr ret; 688 689 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 690 if (ret == NULL) { 691 xmlRegexpErrMemory(ctxt, "allocating range"); 692 return(NULL); 693 } 694 ret->neg = neg; 695 ret->type = type; 696 ret->start = start; 697 ret->end = end; 698 return(ret); 699} 700 701/** 702 * xmlRegFreeRange: 703 * @range: the regexp range 704 * 705 * Free a regexp range 706 */ 707static void 708xmlRegFreeRange(xmlRegRangePtr range) { 709 if (range == NULL) 710 return; 711 712 if (range->blockName != NULL) 713 xmlFree(range->blockName); 714 xmlFree(range); 715} 716 717/** 718 * xmlRegNewAtom: 719 * @ctxt: the regexp parser context 720 * @type: the type of atom 721 * 722 * Allocate a new regexp range 723 * 724 * Returns the new atom or NULL in case of error 725 */ 726static xmlRegAtomPtr 727xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 728 xmlRegAtomPtr ret; 729 730 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 731 if (ret == NULL) { 732 xmlRegexpErrMemory(ctxt, "allocating atom"); 733 return(NULL); 734 } 735 memset(ret, 0, sizeof(xmlRegAtom)); 736 ret->type = type; 737 ret->quant = XML_REGEXP_QUANT_ONCE; 738 ret->min = 0; 739 ret->max = 0; 740 return(ret); 741} 742 743/** 744 * xmlRegFreeAtom: 745 * @atom: the regexp atom 746 * 747 * Free a regexp atom 748 */ 749static void 750xmlRegFreeAtom(xmlRegAtomPtr atom) { 751 int i; 752 753 if (atom == NULL) 754 return; 755 756 for (i = 0;i < atom->nbRanges;i++) 757 xmlRegFreeRange(atom->ranges[i]); 758 if (atom->ranges != NULL) 759 xmlFree(atom->ranges); 760 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 761 xmlFree(atom->valuep); 762 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 763 xmlFree(atom->valuep2); 764 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 765 xmlFree(atom->valuep); 766 xmlFree(atom); 767} 768 769static xmlRegStatePtr 770xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 771 xmlRegStatePtr ret; 772 773 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 774 if (ret == NULL) { 775 xmlRegexpErrMemory(ctxt, "allocating state"); 776 return(NULL); 777 } 778 memset(ret, 0, sizeof(xmlRegState)); 779 ret->type = XML_REGEXP_TRANS_STATE; 780 ret->mark = XML_REGEXP_MARK_NORMAL; 781 return(ret); 782} 783 784/** 785 * xmlRegFreeState: 786 * @state: the regexp state 787 * 788 * Free a regexp state 789 */ 790static void 791xmlRegFreeState(xmlRegStatePtr state) { 792 if (state == NULL) 793 return; 794 795 if (state->trans != NULL) 796 xmlFree(state->trans); 797 if (state->transTo != NULL) 798 xmlFree(state->transTo); 799 xmlFree(state); 800} 801 802/** 803 * xmlRegFreeParserCtxt: 804 * @ctxt: the regexp parser context 805 * 806 * Free a regexp parser context 807 */ 808static void 809xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 810 int i; 811 if (ctxt == NULL) 812 return; 813 814 if (ctxt->string != NULL) 815 xmlFree(ctxt->string); 816 if (ctxt->states != NULL) { 817 for (i = 0;i < ctxt->nbStates;i++) 818 xmlRegFreeState(ctxt->states[i]); 819 xmlFree(ctxt->states); 820 } 821 if (ctxt->atoms != NULL) { 822 for (i = 0;i < ctxt->nbAtoms;i++) 823 xmlRegFreeAtom(ctxt->atoms[i]); 824 xmlFree(ctxt->atoms); 825 } 826 if (ctxt->counters != NULL) 827 xmlFree(ctxt->counters); 828 xmlFree(ctxt); 829} 830 831/************************************************************************ 832 * * 833 * Display of Data structures * 834 * * 835 ************************************************************************/ 836 837static void 838xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 839 switch (type) { 840 case XML_REGEXP_EPSILON: 841 fprintf(output, "epsilon "); break; 842 case XML_REGEXP_CHARVAL: 843 fprintf(output, "charval "); break; 844 case XML_REGEXP_RANGES: 845 fprintf(output, "ranges "); break; 846 case XML_REGEXP_SUBREG: 847 fprintf(output, "subexpr "); break; 848 case XML_REGEXP_STRING: 849 fprintf(output, "string "); break; 850 case XML_REGEXP_ANYCHAR: 851 fprintf(output, "anychar "); break; 852 case XML_REGEXP_ANYSPACE: 853 fprintf(output, "anyspace "); break; 854 case XML_REGEXP_NOTSPACE: 855 fprintf(output, "notspace "); break; 856 case XML_REGEXP_INITNAME: 857 fprintf(output, "initname "); break; 858 case XML_REGEXP_NOTINITNAME: 859 fprintf(output, "notinitname "); break; 860 case XML_REGEXP_NAMECHAR: 861 fprintf(output, "namechar "); break; 862 case XML_REGEXP_NOTNAMECHAR: 863 fprintf(output, "notnamechar "); break; 864 case XML_REGEXP_DECIMAL: 865 fprintf(output, "decimal "); break; 866 case XML_REGEXP_NOTDECIMAL: 867 fprintf(output, "notdecimal "); break; 868 case XML_REGEXP_REALCHAR: 869 fprintf(output, "realchar "); break; 870 case XML_REGEXP_NOTREALCHAR: 871 fprintf(output, "notrealchar "); break; 872 case XML_REGEXP_LETTER: 873 fprintf(output, "LETTER "); break; 874 case XML_REGEXP_LETTER_UPPERCASE: 875 fprintf(output, "LETTER_UPPERCASE "); break; 876 case XML_REGEXP_LETTER_LOWERCASE: 877 fprintf(output, "LETTER_LOWERCASE "); break; 878 case XML_REGEXP_LETTER_TITLECASE: 879 fprintf(output, "LETTER_TITLECASE "); break; 880 case XML_REGEXP_LETTER_MODIFIER: 881 fprintf(output, "LETTER_MODIFIER "); break; 882 case XML_REGEXP_LETTER_OTHERS: 883 fprintf(output, "LETTER_OTHERS "); break; 884 case XML_REGEXP_MARK: 885 fprintf(output, "MARK "); break; 886 case XML_REGEXP_MARK_NONSPACING: 887 fprintf(output, "MARK_NONSPACING "); break; 888 case XML_REGEXP_MARK_SPACECOMBINING: 889 fprintf(output, "MARK_SPACECOMBINING "); break; 890 case XML_REGEXP_MARK_ENCLOSING: 891 fprintf(output, "MARK_ENCLOSING "); break; 892 case XML_REGEXP_NUMBER: 893 fprintf(output, "NUMBER "); break; 894 case XML_REGEXP_NUMBER_DECIMAL: 895 fprintf(output, "NUMBER_DECIMAL "); break; 896 case XML_REGEXP_NUMBER_LETTER: 897 fprintf(output, "NUMBER_LETTER "); break; 898 case XML_REGEXP_NUMBER_OTHERS: 899 fprintf(output, "NUMBER_OTHERS "); break; 900 case XML_REGEXP_PUNCT: 901 fprintf(output, "PUNCT "); break; 902 case XML_REGEXP_PUNCT_CONNECTOR: 903 fprintf(output, "PUNCT_CONNECTOR "); break; 904 case XML_REGEXP_PUNCT_DASH: 905 fprintf(output, "PUNCT_DASH "); break; 906 case XML_REGEXP_PUNCT_OPEN: 907 fprintf(output, "PUNCT_OPEN "); break; 908 case XML_REGEXP_PUNCT_CLOSE: 909 fprintf(output, "PUNCT_CLOSE "); break; 910 case XML_REGEXP_PUNCT_INITQUOTE: 911 fprintf(output, "PUNCT_INITQUOTE "); break; 912 case XML_REGEXP_PUNCT_FINQUOTE: 913 fprintf(output, "PUNCT_FINQUOTE "); break; 914 case XML_REGEXP_PUNCT_OTHERS: 915 fprintf(output, "PUNCT_OTHERS "); break; 916 case XML_REGEXP_SEPAR: 917 fprintf(output, "SEPAR "); break; 918 case XML_REGEXP_SEPAR_SPACE: 919 fprintf(output, "SEPAR_SPACE "); break; 920 case XML_REGEXP_SEPAR_LINE: 921 fprintf(output, "SEPAR_LINE "); break; 922 case XML_REGEXP_SEPAR_PARA: 923 fprintf(output, "SEPAR_PARA "); break; 924 case XML_REGEXP_SYMBOL: 925 fprintf(output, "SYMBOL "); break; 926 case XML_REGEXP_SYMBOL_MATH: 927 fprintf(output, "SYMBOL_MATH "); break; 928 case XML_REGEXP_SYMBOL_CURRENCY: 929 fprintf(output, "SYMBOL_CURRENCY "); break; 930 case XML_REGEXP_SYMBOL_MODIFIER: 931 fprintf(output, "SYMBOL_MODIFIER "); break; 932 case XML_REGEXP_SYMBOL_OTHERS: 933 fprintf(output, "SYMBOL_OTHERS "); break; 934 case XML_REGEXP_OTHER: 935 fprintf(output, "OTHER "); break; 936 case XML_REGEXP_OTHER_CONTROL: 937 fprintf(output, "OTHER_CONTROL "); break; 938 case XML_REGEXP_OTHER_FORMAT: 939 fprintf(output, "OTHER_FORMAT "); break; 940 case XML_REGEXP_OTHER_PRIVATE: 941 fprintf(output, "OTHER_PRIVATE "); break; 942 case XML_REGEXP_OTHER_NA: 943 fprintf(output, "OTHER_NA "); break; 944 case XML_REGEXP_BLOCK_NAME: 945 fprintf(output, "BLOCK "); break; 946 } 947} 948 949static void 950xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 951 switch (type) { 952 case XML_REGEXP_QUANT_EPSILON: 953 fprintf(output, "epsilon "); break; 954 case XML_REGEXP_QUANT_ONCE: 955 fprintf(output, "once "); break; 956 case XML_REGEXP_QUANT_OPT: 957 fprintf(output, "? "); break; 958 case XML_REGEXP_QUANT_MULT: 959 fprintf(output, "* "); break; 960 case XML_REGEXP_QUANT_PLUS: 961 fprintf(output, "+ "); break; 962 case XML_REGEXP_QUANT_RANGE: 963 fprintf(output, "range "); break; 964 case XML_REGEXP_QUANT_ONCEONLY: 965 fprintf(output, "onceonly "); break; 966 case XML_REGEXP_QUANT_ALL: 967 fprintf(output, "all "); break; 968 } 969} 970static void 971xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 972 fprintf(output, " range: "); 973 if (range->neg) 974 fprintf(output, "negative "); 975 xmlRegPrintAtomType(output, range->type); 976 fprintf(output, "%c - %c\n", range->start, range->end); 977} 978 979static void 980xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 981 fprintf(output, " atom: "); 982 if (atom == NULL) { 983 fprintf(output, "NULL\n"); 984 return; 985 } 986 if (atom->neg) 987 fprintf(output, "not "); 988 xmlRegPrintAtomType(output, atom->type); 989 xmlRegPrintQuantType(output, atom->quant); 990 if (atom->quant == XML_REGEXP_QUANT_RANGE) 991 fprintf(output, "%d-%d ", atom->min, atom->max); 992 if (atom->type == XML_REGEXP_STRING) 993 fprintf(output, "'%s' ", (char *) atom->valuep); 994 if (atom->type == XML_REGEXP_CHARVAL) 995 fprintf(output, "char %c\n", atom->codepoint); 996 else if (atom->type == XML_REGEXP_RANGES) { 997 int i; 998 fprintf(output, "%d entries\n", atom->nbRanges); 999 for (i = 0; i < atom->nbRanges;i++) 1000 xmlRegPrintRange(output, atom->ranges[i]); 1001 } else if (atom->type == XML_REGEXP_SUBREG) { 1002 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1003 } else { 1004 fprintf(output, "\n"); 1005 } 1006} 1007 1008static void 1009xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1010 fprintf(output, " trans: "); 1011 if (trans == NULL) { 1012 fprintf(output, "NULL\n"); 1013 return; 1014 } 1015 if (trans->to < 0) { 1016 fprintf(output, "removed\n"); 1017 return; 1018 } 1019 if (trans->counter >= 0) { 1020 fprintf(output, "counted %d, ", trans->counter); 1021 } 1022 if (trans->count == REGEXP_ALL_COUNTER) { 1023 fprintf(output, "all transition, "); 1024 } else if (trans->count >= 0) { 1025 fprintf(output, "count based %d, ", trans->count); 1026 } 1027 if (trans->atom == NULL) { 1028 fprintf(output, "epsilon to %d\n", trans->to); 1029 return; 1030 } 1031 if (trans->atom->type == XML_REGEXP_CHARVAL) 1032 fprintf(output, "char %c ", trans->atom->codepoint); 1033 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1034} 1035 1036static void 1037xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1038 int i; 1039 1040 fprintf(output, " state: "); 1041 if (state == NULL) { 1042 fprintf(output, "NULL\n"); 1043 return; 1044 } 1045 if (state->type == XML_REGEXP_START_STATE) 1046 fprintf(output, "START "); 1047 if (state->type == XML_REGEXP_FINAL_STATE) 1048 fprintf(output, "FINAL "); 1049 1050 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1051 for (i = 0;i < state->nbTrans; i++) { 1052 xmlRegPrintTrans(output, &(state->trans[i])); 1053 } 1054} 1055 1056#ifdef DEBUG_REGEXP_GRAPH 1057static void 1058xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1059 int i; 1060 1061 fprintf(output, " ctxt: "); 1062 if (ctxt == NULL) { 1063 fprintf(output, "NULL\n"); 1064 return; 1065 } 1066 fprintf(output, "'%s' ", ctxt->string); 1067 if (ctxt->error) 1068 fprintf(output, "error "); 1069 if (ctxt->neg) 1070 fprintf(output, "neg "); 1071 fprintf(output, "\n"); 1072 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1073 for (i = 0;i < ctxt->nbAtoms; i++) { 1074 fprintf(output, " %02d ", i); 1075 xmlRegPrintAtom(output, ctxt->atoms[i]); 1076 } 1077 if (ctxt->atom != NULL) { 1078 fprintf(output, "current atom:\n"); 1079 xmlRegPrintAtom(output, ctxt->atom); 1080 } 1081 fprintf(output, "%d states:", ctxt->nbStates); 1082 if (ctxt->start != NULL) 1083 fprintf(output, " start: %d", ctxt->start->no); 1084 if (ctxt->end != NULL) 1085 fprintf(output, " end: %d", ctxt->end->no); 1086 fprintf(output, "\n"); 1087 for (i = 0;i < ctxt->nbStates; i++) { 1088 xmlRegPrintState(output, ctxt->states[i]); 1089 } 1090 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1091 for (i = 0;i < ctxt->nbCounters; i++) { 1092 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1093 ctxt->counters[i].max); 1094 } 1095} 1096#endif 1097 1098/************************************************************************ 1099 * * 1100 * Finite Automata structures manipulations * 1101 * * 1102 ************************************************************************/ 1103 1104static void 1105xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1106 int neg, xmlRegAtomType type, int start, int end, 1107 xmlChar *blockName) { 1108 xmlRegRangePtr range; 1109 1110 if (atom == NULL) { 1111 ERROR("add range: atom is NULL"); 1112 return; 1113 } 1114 if (atom->type != XML_REGEXP_RANGES) { 1115 ERROR("add range: atom is not ranges"); 1116 return; 1117 } 1118 if (atom->maxRanges == 0) { 1119 atom->maxRanges = 4; 1120 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1121 sizeof(xmlRegRangePtr)); 1122 if (atom->ranges == NULL) { 1123 xmlRegexpErrMemory(ctxt, "adding ranges"); 1124 atom->maxRanges = 0; 1125 return; 1126 } 1127 } else if (atom->nbRanges >= atom->maxRanges) { 1128 xmlRegRangePtr *tmp; 1129 atom->maxRanges *= 2; 1130 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1131 sizeof(xmlRegRangePtr)); 1132 if (tmp == NULL) { 1133 xmlRegexpErrMemory(ctxt, "adding ranges"); 1134 atom->maxRanges /= 2; 1135 return; 1136 } 1137 atom->ranges = tmp; 1138 } 1139 range = xmlRegNewRange(ctxt, neg, type, start, end); 1140 if (range == NULL) 1141 return; 1142 range->blockName = blockName; 1143 atom->ranges[atom->nbRanges++] = range; 1144 1145} 1146 1147static int 1148xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1149 if (ctxt->maxCounters == 0) { 1150 ctxt->maxCounters = 4; 1151 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1152 sizeof(xmlRegCounter)); 1153 if (ctxt->counters == NULL) { 1154 xmlRegexpErrMemory(ctxt, "allocating counter"); 1155 ctxt->maxCounters = 0; 1156 return(-1); 1157 } 1158 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1159 xmlRegCounter *tmp; 1160 ctxt->maxCounters *= 2; 1161 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1162 sizeof(xmlRegCounter)); 1163 if (tmp == NULL) { 1164 xmlRegexpErrMemory(ctxt, "allocating counter"); 1165 ctxt->maxCounters /= 2; 1166 return(-1); 1167 } 1168 ctxt->counters = tmp; 1169 } 1170 ctxt->counters[ctxt->nbCounters].min = -1; 1171 ctxt->counters[ctxt->nbCounters].max = -1; 1172 return(ctxt->nbCounters++); 1173} 1174 1175static int 1176xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1177 if (atom == NULL) { 1178 ERROR("atom push: atom is NULL"); 1179 return(-1); 1180 } 1181 if (ctxt->maxAtoms == 0) { 1182 ctxt->maxAtoms = 4; 1183 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1184 sizeof(xmlRegAtomPtr)); 1185 if (ctxt->atoms == NULL) { 1186 xmlRegexpErrMemory(ctxt, "pushing atom"); 1187 ctxt->maxAtoms = 0; 1188 return(-1); 1189 } 1190 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1191 xmlRegAtomPtr *tmp; 1192 ctxt->maxAtoms *= 2; 1193 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1194 sizeof(xmlRegAtomPtr)); 1195 if (tmp == NULL) { 1196 xmlRegexpErrMemory(ctxt, "allocating counter"); 1197 ctxt->maxAtoms /= 2; 1198 return(-1); 1199 } 1200 ctxt->atoms = tmp; 1201 } 1202 atom->no = ctxt->nbAtoms; 1203 ctxt->atoms[ctxt->nbAtoms++] = atom; 1204 return(0); 1205} 1206 1207static void 1208xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1209 int from) { 1210 if (target->maxTransTo == 0) { 1211 target->maxTransTo = 8; 1212 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1213 sizeof(int)); 1214 if (target->transTo == NULL) { 1215 xmlRegexpErrMemory(ctxt, "adding transition"); 1216 target->maxTransTo = 0; 1217 return; 1218 } 1219 } else if (target->nbTransTo >= target->maxTransTo) { 1220 int *tmp; 1221 target->maxTransTo *= 2; 1222 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1223 sizeof(int)); 1224 if (tmp == NULL) { 1225 xmlRegexpErrMemory(ctxt, "adding transition"); 1226 target->maxTransTo /= 2; 1227 return; 1228 } 1229 target->transTo = tmp; 1230 } 1231 target->transTo[target->nbTransTo] = from; 1232 target->nbTransTo++; 1233} 1234 1235static void 1236xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1237 xmlRegAtomPtr atom, xmlRegStatePtr target, 1238 int counter, int count, int nchk) { 1239 1240 int nrtrans; 1241 1242 if (state == NULL) { 1243 ERROR("add state: state is NULL"); 1244 return; 1245 } 1246 if (target == NULL) { 1247 ERROR("add state: target is NULL"); 1248 return; 1249 } 1250 /* 1251 * Other routines follow the philosophy 'When in doubt, add a transition' 1252 * so we check here whether such a transition is already present and, if 1253 * so, silently ignore this request. 1254 */ 1255 1256 if (nchk == 0) { 1257 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1258 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1259 if ((trans->atom == atom) && 1260 (trans->to == target->no) && 1261 (trans->counter == counter) && 1262 (trans->count == count)) { 1263#ifdef DEBUG_REGEXP_GRAPH 1264 printf("Ignoring duplicate transition from %d to %d\n", 1265 state->no, target->no); 1266#endif 1267 return; 1268 } 1269 } 1270 } 1271 1272 if (state->maxTrans == 0) { 1273 state->maxTrans = 8; 1274 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1275 sizeof(xmlRegTrans)); 1276 if (state->trans == NULL) { 1277 xmlRegexpErrMemory(ctxt, "adding transition"); 1278 state->maxTrans = 0; 1279 return; 1280 } 1281 } else if (state->nbTrans >= state->maxTrans) { 1282 xmlRegTrans *tmp; 1283 state->maxTrans *= 2; 1284 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1285 sizeof(xmlRegTrans)); 1286 if (tmp == NULL) { 1287 xmlRegexpErrMemory(ctxt, "adding transition"); 1288 state->maxTrans /= 2; 1289 return; 1290 } 1291 state->trans = tmp; 1292 } 1293#ifdef DEBUG_REGEXP_GRAPH 1294 printf("Add trans from %d to %d ", state->no, target->no); 1295 if (count == REGEXP_ALL_COUNTER) 1296 printf("all transition\n"); 1297 else if (count >= 0) 1298 printf("count based %d\n", count); 1299 else if (counter >= 0) 1300 printf("counted %d\n", counter); 1301 else if (atom == NULL) 1302 printf("epsilon transition\n"); 1303 else if (atom != NULL) 1304 xmlRegPrintAtom(stdout, atom); 1305#endif 1306 1307 state->trans[state->nbTrans].atom = atom; 1308 state->trans[state->nbTrans].to = target->no; 1309 state->trans[state->nbTrans].counter = counter; 1310 state->trans[state->nbTrans].count = count; 1311 state->nbTrans++; 1312 xmlRegStateAddTransTo(ctxt, target, state->no); 1313} 1314 1315static int 1316xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1317 if (state == NULL) return(-1); 1318 if (ctxt->maxStates == 0) { 1319 ctxt->maxStates = 4; 1320 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1321 sizeof(xmlRegStatePtr)); 1322 if (ctxt->states == NULL) { 1323 xmlRegexpErrMemory(ctxt, "adding state"); 1324 ctxt->maxStates = 0; 1325 return(-1); 1326 } 1327 } else if (ctxt->nbStates >= ctxt->maxStates) { 1328 xmlRegStatePtr *tmp; 1329 ctxt->maxStates *= 2; 1330 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1331 sizeof(xmlRegStatePtr)); 1332 if (tmp == NULL) { 1333 xmlRegexpErrMemory(ctxt, "adding state"); 1334 ctxt->maxStates /= 2; 1335 return(-1); 1336 } 1337 ctxt->states = tmp; 1338 } 1339 state->no = ctxt->nbStates; 1340 ctxt->states[ctxt->nbStates++] = state; 1341 return(0); 1342} 1343 1344/** 1345 * xmlFAGenerateAllTransition: 1346 * @ctxt: a regexp parser context 1347 * @from: the from state 1348 * @to: the target state or NULL for building a new one 1349 * @lax: 1350 * 1351 */ 1352static void 1353xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1354 xmlRegStatePtr from, xmlRegStatePtr to, 1355 int lax) { 1356 if (to == NULL) { 1357 to = xmlRegNewState(ctxt); 1358 xmlRegStatePush(ctxt, to); 1359 ctxt->state = to; 1360 } 1361 if (lax) 1362 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER, 0); 1363 else 1364 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER, 0); 1365} 1366 1367/** 1368 * xmlFAGenerateEpsilonTransition: 1369 * @ctxt: a regexp parser context 1370 * @from: the from state 1371 * @to: the target state or NULL for building a new one 1372 * 1373 */ 1374static void 1375xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1376 xmlRegStatePtr from, xmlRegStatePtr to) { 1377 if (to == NULL) { 1378 to = xmlRegNewState(ctxt); 1379 xmlRegStatePush(ctxt, to); 1380 ctxt->state = to; 1381 } 1382 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1, 0); 1383} 1384 1385/** 1386 * xmlFAGenerateCountedEpsilonTransition: 1387 * @ctxt: a regexp parser context 1388 * @from: the from state 1389 * @to: the target state or NULL for building a new one 1390 * counter: the counter for that transition 1391 * 1392 */ 1393static void 1394xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1395 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1396 if (to == NULL) { 1397 to = xmlRegNewState(ctxt); 1398 xmlRegStatePush(ctxt, to); 1399 ctxt->state = to; 1400 } 1401 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1, 0); 1402} 1403 1404/** 1405 * xmlFAGenerateCountedTransition: 1406 * @ctxt: a regexp parser context 1407 * @from: the from state 1408 * @to: the target state or NULL for building a new one 1409 * counter: the counter for that transition 1410 * 1411 */ 1412static void 1413xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1414 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1415 if (to == NULL) { 1416 to = xmlRegNewState(ctxt); 1417 xmlRegStatePush(ctxt, to); 1418 ctxt->state = to; 1419 } 1420 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter, 0); 1421} 1422 1423/** 1424 * xmlFAGenerateTransitions: 1425 * @ctxt: a regexp parser context 1426 * @from: the from state 1427 * @to: the target state or NULL for building a new one 1428 * @atom: the atom generating the transition 1429 * 1430 * Returns 0 if success and -1 in case of error. 1431 */ 1432static int 1433xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1434 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1435 if (atom == NULL) { 1436 ERROR("genrate transition: atom == NULL"); 1437 return(-1); 1438 } 1439 if (atom->type == XML_REGEXP_SUBREG) { 1440 /* 1441 * this is a subexpression handling one should not need to 1442 * create a new node except for XML_REGEXP_QUANT_RANGE. 1443 */ 1444 if (xmlRegAtomPush(ctxt, atom) < 0) { 1445 return(-1); 1446 } 1447 if ((to != NULL) && (atom->stop != to) && 1448 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1449 /* 1450 * Generate an epsilon transition to link to the target 1451 */ 1452 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1453 } 1454 switch (atom->quant) { 1455 case XML_REGEXP_QUANT_OPT: 1456 atom->quant = XML_REGEXP_QUANT_ONCE; 1457 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1458 break; 1459 case XML_REGEXP_QUANT_MULT: 1460 atom->quant = XML_REGEXP_QUANT_ONCE; 1461 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1462 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1463 break; 1464 case XML_REGEXP_QUANT_PLUS: 1465 atom->quant = XML_REGEXP_QUANT_ONCE; 1466 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1467 break; 1468 case XML_REGEXP_QUANT_RANGE: { 1469 int counter; 1470 xmlRegStatePtr newstate; 1471 1472 /* 1473 * This one is nasty: 1474 * 1/ if range has minOccurs == 0, create a new state 1475 * and create epsilon transitions from atom->start 1476 * to atom->stop, as well as atom->start to the new 1477 * state 1478 * 2/ register a new counter 1479 * 3/ register an epsilon transition associated to 1480 * this counter going from atom->stop to atom->start 1481 * 4/ create a new state 1482 * 5/ generate a counted transition from atom->stop to 1483 * that state 1484 */ 1485 if (atom->min == 0) { 1486 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1487 atom->stop); 1488 newstate = xmlRegNewState(ctxt); 1489 xmlRegStatePush(ctxt, newstate); 1490 ctxt->state = newstate; 1491 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1492 newstate); 1493 } 1494 counter = xmlRegGetCounter(ctxt); 1495 ctxt->counters[counter].min = atom->min - 1; 1496 ctxt->counters[counter].max = atom->max - 1; 1497 atom->min = 0; 1498 atom->max = 0; 1499 atom->quant = XML_REGEXP_QUANT_ONCE; 1500 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1501 atom->start, counter); 1502 if (to != NULL) { 1503 newstate = to; 1504 } else { 1505 newstate = xmlRegNewState(ctxt); 1506 xmlRegStatePush(ctxt, newstate); 1507 ctxt->state = newstate; 1508 } 1509 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1510 newstate, counter); 1511 } 1512 default: 1513 break; 1514 } 1515 return(0); 1516 } else if ((atom->min == 0) && (atom->max == 0) && 1517 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1518 /* 1519 * we can discard the atom and generate an epsilon transition instead 1520 */ 1521 if (to == NULL) { 1522 to = xmlRegNewState(ctxt); 1523 if (to != NULL) 1524 xmlRegStatePush(ctxt, to); 1525 else { 1526 return(-1); 1527 } 1528 } 1529 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1530 ctxt->state = to; 1531 xmlRegFreeAtom(atom); 1532 return(0); 1533 } else { 1534 if (to == NULL) { 1535 to = xmlRegNewState(ctxt); 1536 if (to != NULL) 1537 xmlRegStatePush(ctxt, to); 1538 else { 1539 return(-1); 1540 } 1541 } 1542 if (xmlRegAtomPush(ctxt, atom) < 0) { 1543 return(-1); 1544 } 1545 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1, 0); 1546 ctxt->state = to; 1547 } 1548 switch (atom->quant) { 1549 case XML_REGEXP_QUANT_OPT: 1550 atom->quant = XML_REGEXP_QUANT_ONCE; 1551 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1552 break; 1553 case XML_REGEXP_QUANT_MULT: 1554 atom->quant = XML_REGEXP_QUANT_ONCE; 1555 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1556 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0); 1557 break; 1558 case XML_REGEXP_QUANT_PLUS: 1559 atom->quant = XML_REGEXP_QUANT_ONCE; 1560 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0); 1561 break; 1562 default: 1563 break; 1564 } 1565 return(0); 1566} 1567 1568/** 1569 * xmlFAReduceEpsilonTransitions: 1570 * @ctxt: a regexp parser context 1571 * @fromnr: the from state 1572 * @tonr: the to state 1573 * @counter: should that transition be associated to a counted 1574 * 1575 */ 1576static void 1577xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1578 int tonr, int counter) { 1579 int transnr; 1580 xmlRegStatePtr from; 1581 xmlRegStatePtr to; 1582 1583#ifdef DEBUG_REGEXP_GRAPH 1584 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1585#endif 1586 from = ctxt->states[fromnr]; 1587 if (from == NULL) 1588 return; 1589 to = ctxt->states[tonr]; 1590 if (to == NULL) 1591 return; 1592 if ((to->mark == XML_REGEXP_MARK_START) || 1593 (to->mark == XML_REGEXP_MARK_VISITED)) 1594 return; 1595 1596 to->mark = XML_REGEXP_MARK_VISITED; 1597 if (to->type == XML_REGEXP_FINAL_STATE) { 1598#ifdef DEBUG_REGEXP_GRAPH 1599 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1600#endif 1601 from->type = XML_REGEXP_FINAL_STATE; 1602 } 1603 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1604 if (to->trans[transnr].to < 0) 1605 continue; 1606 if (to->trans[transnr].atom == NULL) { 1607 /* 1608 * Don't remove counted transitions 1609 * Don't loop either 1610 */ 1611 if (to->trans[transnr].to != fromnr) { 1612 if (to->trans[transnr].count >= 0) { 1613 int newto = to->trans[transnr].to; 1614 1615 xmlRegStateAddTrans(ctxt, from, NULL, 1616 ctxt->states[newto], 1617 -1, to->trans[transnr].count, 0); 1618 } else { 1619#ifdef DEBUG_REGEXP_GRAPH 1620 printf("Found epsilon trans %d from %d to %d\n", 1621 transnr, tonr, to->trans[transnr].to); 1622#endif 1623 if (to->trans[transnr].counter >= 0) { 1624 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1625 to->trans[transnr].to, 1626 to->trans[transnr].counter); 1627 } else { 1628 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1629 to->trans[transnr].to, 1630 counter); 1631 } 1632 } 1633 } 1634 } else { 1635 int newto = to->trans[transnr].to; 1636 1637 if (to->trans[transnr].counter >= 0) { 1638 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1639 ctxt->states[newto], 1640 to->trans[transnr].counter, -1, 1); 1641 } else { 1642 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1643 ctxt->states[newto], counter, -1, 1); 1644 } 1645 } 1646 } 1647 to->mark = XML_REGEXP_MARK_NORMAL; 1648} 1649 1650/** 1651 * xmlFAEliminateSimpleEpsilonTransitions: 1652 * @ctxt: a regexp parser context 1653 * 1654 * Eliminating general epsilon transitions can get costly in the general 1655 * algorithm due to the large amount of generated new transitions and 1656 * associated comparisons. However for simple epsilon transition used just 1657 * to separate building blocks when generating the automata this can be 1658 * reduced to state elimination: 1659 * - if there exists an epsilon from X to Y 1660 * - if there is no other transition from X 1661 * then X and Y are semantically equivalent and X can be eliminated 1662 * If X is the start state then make Y the start state, else replace the 1663 * target of all transitions to X by transitions to Y. 1664 */ 1665static void 1666xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1667 int statenr, i, j, newto; 1668 xmlRegStatePtr state, tmp; 1669 1670 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1671 state = ctxt->states[statenr]; 1672 if (state == NULL) 1673 continue; 1674 if (state->nbTrans != 1) 1675 continue; 1676 /* is the only transition out a basic transition */ 1677 if ((state->trans[0].atom == NULL) && 1678 (state->trans[0].to >= 0) && 1679 (state->trans[0].to != statenr) && 1680 (state->trans[0].counter < 0) && 1681 (state->trans[0].count < 0)) { 1682 newto = state->trans[0].to; 1683 1684 if (state->type == XML_REGEXP_START_STATE) { 1685#ifdef DEBUG_REGEXP_GRAPH 1686 printf("Found simple epsilon trans from start %d to %d\n", 1687 statenr, newto); 1688#endif 1689 } else { 1690#ifdef DEBUG_REGEXP_GRAPH 1691 printf("Found simple epsilon trans from %d to %d\n", 1692 statenr, newto); 1693#endif 1694 for (i = 0;i < state->nbTransTo;i++) { 1695 tmp = ctxt->states[state->transTo[i]]; 1696 for (j = 0;j < tmp->nbTrans;j++) { 1697 if (tmp->trans[j].to == statenr) { 1698 tmp->trans[j].to = newto; 1699#ifdef DEBUG_REGEXP_GRAPH 1700 printf("Changed transition %d on %d to go to %d\n", 1701 j, tmp->no, newto); 1702#endif 1703 xmlRegStateAddTransTo(ctxt, ctxt->states[newto], 1704 tmp->no); 1705 } 1706 } 1707 } 1708#if 0 1709 for (i = 0;i < ctxt->nbStates;i++) { 1710 tmp = ctxt->states[i]; 1711 for (j = 0;j < tmp->nbTrans;j++) { 1712 if (tmp->trans[j].to == statenr) { 1713 tmp->trans[j].to = newto; 1714#ifdef DEBUG_REGEXP_GRAPH 1715 printf("Changed transition %d on %d to go to %d\n", 1716 j, tmp->no, newto); 1717#endif 1718 } 1719 } 1720 } 1721#endif 1722 if (state->type == XML_REGEXP_FINAL_STATE) 1723 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1724 /* eliminate the transition completely */ 1725 state->nbTrans = 0; 1726 1727 1728 } 1729 1730 } 1731 } 1732} 1733/** 1734 * xmlFAEliminateEpsilonTransitions: 1735 * @ctxt: a regexp parser context 1736 * 1737 */ 1738static void 1739xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1740 int statenr, transnr; 1741 xmlRegStatePtr state; 1742 int has_epsilon; 1743 1744 if (ctxt->states == NULL) return; 1745 1746 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1747 1748 has_epsilon = 0; 1749 1750 /* 1751 * build the completed transitions bypassing the epsilons 1752 * Use a marking algorithm to avoid loops 1753 * mark sink states too. 1754 */ 1755 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1756 state = ctxt->states[statenr]; 1757 if (state == NULL) 1758 continue; 1759 if ((state->nbTrans == 0) && 1760 (state->type != XML_REGEXP_FINAL_STATE)) { 1761 state->type = XML_REGEXP_SINK_STATE; 1762 } 1763 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1764 if ((state->trans[transnr].atom == NULL) && 1765 (state->trans[transnr].to >= 0)) { 1766 if (state->trans[transnr].to == statenr) { 1767 state->trans[transnr].to = -1; 1768#ifdef DEBUG_REGEXP_GRAPH 1769 printf("Removed loopback epsilon trans %d on %d\n", 1770 transnr, statenr); 1771#endif 1772 } else if (state->trans[transnr].count < 0) { 1773 int newto = state->trans[transnr].to; 1774 1775#ifdef DEBUG_REGEXP_GRAPH 1776 printf("Found epsilon trans %d from %d to %d\n", 1777 transnr, statenr, newto); 1778#endif 1779 state->mark = XML_REGEXP_MARK_START; 1780 has_epsilon = 1; 1781 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1782 newto, state->trans[transnr].counter); 1783 state->mark = XML_REGEXP_MARK_NORMAL; 1784#ifdef DEBUG_REGEXP_GRAPH 1785 } else { 1786 printf("Found counted transition %d on %d\n", 1787 transnr, statenr); 1788#endif 1789 } 1790 } 1791 } 1792 } 1793 /* 1794 * Eliminate the epsilon transitions 1795 */ 1796 if (has_epsilon) { 1797 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1798 state = ctxt->states[statenr]; 1799 if (state == NULL) 1800 continue; 1801 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1802 xmlRegTransPtr trans = &(state->trans[transnr]); 1803 if ((trans->atom == NULL) && 1804 (trans->count < 0) && 1805 (trans->to >= 0)) { 1806 trans->to = -1; 1807 } 1808 } 1809 } 1810 } 1811 1812 /* 1813 * Use this pass to detect unreachable states too 1814 */ 1815 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1816 state = ctxt->states[statenr]; 1817 if (state != NULL) 1818 state->reached = XML_REGEXP_MARK_NORMAL; 1819 } 1820 state = ctxt->states[0]; 1821 if (state != NULL) 1822 state->reached = XML_REGEXP_MARK_START; 1823 while (state != NULL) { 1824 xmlRegStatePtr target = NULL; 1825 state->reached = XML_REGEXP_MARK_VISITED; 1826 /* 1827 * Mark all states reachable from the current reachable state 1828 */ 1829 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1830 if ((state->trans[transnr].to >= 0) && 1831 ((state->trans[transnr].atom != NULL) || 1832 (state->trans[transnr].count >= 0))) { 1833 int newto = state->trans[transnr].to; 1834 1835 if (ctxt->states[newto] == NULL) 1836 continue; 1837 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 1838 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 1839 target = ctxt->states[newto]; 1840 } 1841 } 1842 } 1843 1844 /* 1845 * find the next accessible state not explored 1846 */ 1847 if (target == NULL) { 1848 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1849 state = ctxt->states[statenr]; 1850 if ((state != NULL) && (state->reached == 1851 XML_REGEXP_MARK_START)) { 1852 target = state; 1853 break; 1854 } 1855 } 1856 } 1857 state = target; 1858 } 1859 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1860 state = ctxt->states[statenr]; 1861 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 1862#ifdef DEBUG_REGEXP_GRAPH 1863 printf("Removed unreachable state %d\n", statenr); 1864#endif 1865 xmlRegFreeState(state); 1866 ctxt->states[statenr] = NULL; 1867 } 1868 } 1869 1870} 1871 1872/** 1873 * xmlFACompareAtoms: 1874 * @atom1: an atom 1875 * @atom2: an atom 1876 * 1877 * Compares two atoms to check whether they are equivalents 1878 * 1879 * Returns 1 if yes and 0 otherwise 1880 */ 1881static int 1882xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 1883 int ret; 1884 1885 if (atom1 == atom2) 1886 return(1); 1887 if ((atom1 == NULL) || (atom2 == NULL)) 1888 return(0); 1889 1890 if (atom1->type != atom2->type) 1891 return(0); 1892 switch (atom1->type) { 1893 case XML_REGEXP_STRING: 1894 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 1895 (xmlChar *)atom2->valuep); 1896 break; 1897 case XML_REGEXP_EPSILON: 1898 return(1); 1899 case XML_REGEXP_CHARVAL: 1900 ret = atom1->codepoint == atom2->codepoint; 1901 break; 1902 case XML_REGEXP_RANGES: 1903 TODO; 1904 return(0); 1905 default: 1906 return(1); 1907 } 1908 if (atom1->neg != atom2->neg) { 1909 ret = !ret; 1910 } 1911 return(ret); 1912} 1913 1914/** 1915 * xmlFARecurseDeterminism: 1916 * @ctxt: a regexp parser context 1917 * 1918 * Check whether the associated regexp is determinist, 1919 * should be called after xmlFAEliminateEpsilonTransitions() 1920 * 1921 */ 1922static int 1923xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1924 int to, xmlRegAtomPtr atom) { 1925 int ret = 1; 1926 int transnr; 1927 xmlRegTransPtr t1; 1928 1929 if (state == NULL) 1930 return(ret); 1931 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1932 t1 = &(state->trans[transnr]); 1933 /* 1934 * check transitions conflicting with the one looked at 1935 */ 1936 if (t1->atom == NULL) { 1937 if (t1->to == -1) 1938 continue; 1939 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1940 to, atom); 1941 if (ret == 0) 1942 return(0); 1943 continue; 1944 } 1945 if (t1->to != to) 1946 continue; 1947 if (xmlFACompareAtoms(t1->atom, atom)) 1948 return(0); 1949 } 1950 return(ret); 1951} 1952 1953/** 1954 * xmlFAComputesDeterminism: 1955 * @ctxt: a regexp parser context 1956 * 1957 * Check whether the associated regexp is determinist, 1958 * should be called after xmlFAEliminateEpsilonTransitions() 1959 * 1960 */ 1961static int 1962xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 1963 int statenr, transnr; 1964 xmlRegStatePtr state; 1965 xmlRegTransPtr t1, t2; 1966 int i; 1967 int ret = 1; 1968 1969#ifdef DEBUG_REGEXP_GRAPH 1970 printf("xmlFAComputesDeterminism\n"); 1971 xmlRegPrintCtxt(stdout, ctxt); 1972#endif 1973 if (ctxt->determinist != -1) 1974 return(ctxt->determinist); 1975 1976 /* 1977 * Check for all states that there aren't 2 transitions 1978 * with the same atom and a different target. 1979 */ 1980 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1981 state = ctxt->states[statenr]; 1982 if (state == NULL) 1983 continue; 1984 if (state->nbTrans < 2) 1985 continue; 1986 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1987 t1 = &(state->trans[transnr]); 1988 /* 1989 * Determinism checks in case of counted or all transitions 1990 * will have to be handled separately 1991 */ 1992 if (t1->atom == NULL) 1993 continue; 1994 if (t1->to == -1) /* eliminated */ 1995 continue; 1996 for (i = 0;i < transnr;i++) { 1997 t2 = &(state->trans[i]); 1998 if (t2->to == -1) /* eliminated */ 1999 continue; 2000 if (t2->atom != NULL) { 2001 if (t1->to == t2->to) { 2002 if (xmlFACompareAtoms(t1->atom, t2->atom)) 2003 t2->to = -1; /* eliminated */ 2004 } else { 2005 /* not determinist ! */ 2006 if (xmlFACompareAtoms(t1->atom, t2->atom)) 2007 ret = 0; 2008 } 2009 } else if (t1->to != -1) { 2010 /* 2011 * do the closure in case of remaining specific 2012 * epsilon transitions like choices or all 2013 */ 2014 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2015 t2->to, t2->atom); 2016 if (ret == 0) 2017 return(0); 2018 } 2019 } 2020 if (ret == 0) 2021 break; 2022 } 2023 if (ret == 0) 2024 break; 2025 } 2026 ctxt->determinist = ret; 2027 return(ret); 2028} 2029 2030/************************************************************************ 2031 * * 2032 * Routines to check input against transition atoms * 2033 * * 2034 ************************************************************************/ 2035 2036static int 2037xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2038 int start, int end, const xmlChar *blockName) { 2039 int ret = 0; 2040 2041 switch (type) { 2042 case XML_REGEXP_STRING: 2043 case XML_REGEXP_SUBREG: 2044 case XML_REGEXP_RANGES: 2045 case XML_REGEXP_EPSILON: 2046 return(-1); 2047 case XML_REGEXP_ANYCHAR: 2048 ret = ((codepoint != '\n') && (codepoint != '\r')); 2049 break; 2050 case XML_REGEXP_CHARVAL: 2051 ret = ((codepoint >= start) && (codepoint <= end)); 2052 break; 2053 case XML_REGEXP_NOTSPACE: 2054 neg = !neg; 2055 case XML_REGEXP_ANYSPACE: 2056 ret = ((codepoint == '\n') || (codepoint == '\r') || 2057 (codepoint == '\t') || (codepoint == ' ')); 2058 break; 2059 case XML_REGEXP_NOTINITNAME: 2060 neg = !neg; 2061 case XML_REGEXP_INITNAME: 2062 ret = (IS_LETTER(codepoint) || 2063 (codepoint == '_') || (codepoint == ':')); 2064 break; 2065 case XML_REGEXP_NOTNAMECHAR: 2066 neg = !neg; 2067 case XML_REGEXP_NAMECHAR: 2068 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2069 (codepoint == '.') || (codepoint == '-') || 2070 (codepoint == '_') || (codepoint == ':') || 2071 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2072 break; 2073 case XML_REGEXP_NOTDECIMAL: 2074 neg = !neg; 2075 case XML_REGEXP_DECIMAL: 2076 ret = xmlUCSIsCatNd(codepoint); 2077 break; 2078 case XML_REGEXP_REALCHAR: 2079 neg = !neg; 2080 case XML_REGEXP_NOTREALCHAR: 2081 ret = xmlUCSIsCatP(codepoint); 2082 if (ret == 0) 2083 ret = xmlUCSIsCatZ(codepoint); 2084 if (ret == 0) 2085 ret = xmlUCSIsCatC(codepoint); 2086 break; 2087 case XML_REGEXP_LETTER: 2088 ret = xmlUCSIsCatL(codepoint); 2089 break; 2090 case XML_REGEXP_LETTER_UPPERCASE: 2091 ret = xmlUCSIsCatLu(codepoint); 2092 break; 2093 case XML_REGEXP_LETTER_LOWERCASE: 2094 ret = xmlUCSIsCatLl(codepoint); 2095 break; 2096 case XML_REGEXP_LETTER_TITLECASE: 2097 ret = xmlUCSIsCatLt(codepoint); 2098 break; 2099 case XML_REGEXP_LETTER_MODIFIER: 2100 ret = xmlUCSIsCatLm(codepoint); 2101 break; 2102 case XML_REGEXP_LETTER_OTHERS: 2103 ret = xmlUCSIsCatLo(codepoint); 2104 break; 2105 case XML_REGEXP_MARK: 2106 ret = xmlUCSIsCatM(codepoint); 2107 break; 2108 case XML_REGEXP_MARK_NONSPACING: 2109 ret = xmlUCSIsCatMn(codepoint); 2110 break; 2111 case XML_REGEXP_MARK_SPACECOMBINING: 2112 ret = xmlUCSIsCatMc(codepoint); 2113 break; 2114 case XML_REGEXP_MARK_ENCLOSING: 2115 ret = xmlUCSIsCatMe(codepoint); 2116 break; 2117 case XML_REGEXP_NUMBER: 2118 ret = xmlUCSIsCatN(codepoint); 2119 break; 2120 case XML_REGEXP_NUMBER_DECIMAL: 2121 ret = xmlUCSIsCatNd(codepoint); 2122 break; 2123 case XML_REGEXP_NUMBER_LETTER: 2124 ret = xmlUCSIsCatNl(codepoint); 2125 break; 2126 case XML_REGEXP_NUMBER_OTHERS: 2127 ret = xmlUCSIsCatNo(codepoint); 2128 break; 2129 case XML_REGEXP_PUNCT: 2130 ret = xmlUCSIsCatP(codepoint); 2131 break; 2132 case XML_REGEXP_PUNCT_CONNECTOR: 2133 ret = xmlUCSIsCatPc(codepoint); 2134 break; 2135 case XML_REGEXP_PUNCT_DASH: 2136 ret = xmlUCSIsCatPd(codepoint); 2137 break; 2138 case XML_REGEXP_PUNCT_OPEN: 2139 ret = xmlUCSIsCatPs(codepoint); 2140 break; 2141 case XML_REGEXP_PUNCT_CLOSE: 2142 ret = xmlUCSIsCatPe(codepoint); 2143 break; 2144 case XML_REGEXP_PUNCT_INITQUOTE: 2145 ret = xmlUCSIsCatPi(codepoint); 2146 break; 2147 case XML_REGEXP_PUNCT_FINQUOTE: 2148 ret = xmlUCSIsCatPf(codepoint); 2149 break; 2150 case XML_REGEXP_PUNCT_OTHERS: 2151 ret = xmlUCSIsCatPo(codepoint); 2152 break; 2153 case XML_REGEXP_SEPAR: 2154 ret = xmlUCSIsCatZ(codepoint); 2155 break; 2156 case XML_REGEXP_SEPAR_SPACE: 2157 ret = xmlUCSIsCatZs(codepoint); 2158 break; 2159 case XML_REGEXP_SEPAR_LINE: 2160 ret = xmlUCSIsCatZl(codepoint); 2161 break; 2162 case XML_REGEXP_SEPAR_PARA: 2163 ret = xmlUCSIsCatZp(codepoint); 2164 break; 2165 case XML_REGEXP_SYMBOL: 2166 ret = xmlUCSIsCatS(codepoint); 2167 break; 2168 case XML_REGEXP_SYMBOL_MATH: 2169 ret = xmlUCSIsCatSm(codepoint); 2170 break; 2171 case XML_REGEXP_SYMBOL_CURRENCY: 2172 ret = xmlUCSIsCatSc(codepoint); 2173 break; 2174 case XML_REGEXP_SYMBOL_MODIFIER: 2175 ret = xmlUCSIsCatSk(codepoint); 2176 break; 2177 case XML_REGEXP_SYMBOL_OTHERS: 2178 ret = xmlUCSIsCatSo(codepoint); 2179 break; 2180 case XML_REGEXP_OTHER: 2181 ret = xmlUCSIsCatC(codepoint); 2182 break; 2183 case XML_REGEXP_OTHER_CONTROL: 2184 ret = xmlUCSIsCatCc(codepoint); 2185 break; 2186 case XML_REGEXP_OTHER_FORMAT: 2187 ret = xmlUCSIsCatCf(codepoint); 2188 break; 2189 case XML_REGEXP_OTHER_PRIVATE: 2190 ret = xmlUCSIsCatCo(codepoint); 2191 break; 2192 case XML_REGEXP_OTHER_NA: 2193 /* ret = xmlUCSIsCatCn(codepoint); */ 2194 /* Seems it doesn't exist anymore in recent Unicode releases */ 2195 ret = 0; 2196 break; 2197 case XML_REGEXP_BLOCK_NAME: 2198 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2199 break; 2200 } 2201 if (neg) 2202 return(!ret); 2203 return(ret); 2204} 2205 2206static int 2207xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2208 int i, ret = 0; 2209 xmlRegRangePtr range; 2210 2211 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2212 return(-1); 2213 2214 switch (atom->type) { 2215 case XML_REGEXP_SUBREG: 2216 case XML_REGEXP_EPSILON: 2217 return(-1); 2218 case XML_REGEXP_CHARVAL: 2219 return(codepoint == atom->codepoint); 2220 case XML_REGEXP_RANGES: { 2221 int accept = 0; 2222 2223 for (i = 0;i < atom->nbRanges;i++) { 2224 range = atom->ranges[i]; 2225 if (range->neg == 2) { 2226 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2227 0, range->start, range->end, 2228 range->blockName); 2229 if (ret != 0) 2230 return(0); /* excluded char */ 2231 } else if (range->neg) { 2232 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2233 0, range->start, range->end, 2234 range->blockName); 2235 if (ret == 0) 2236 accept = 1; 2237 else 2238 return(0); 2239 } else { 2240 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2241 0, range->start, range->end, 2242 range->blockName); 2243 if (ret != 0) 2244 accept = 1; /* might still be excluded */ 2245 } 2246 } 2247 return(accept); 2248 } 2249 case XML_REGEXP_STRING: 2250 printf("TODO: XML_REGEXP_STRING\n"); 2251 return(-1); 2252 case XML_REGEXP_ANYCHAR: 2253 case XML_REGEXP_ANYSPACE: 2254 case XML_REGEXP_NOTSPACE: 2255 case XML_REGEXP_INITNAME: 2256 case XML_REGEXP_NOTINITNAME: 2257 case XML_REGEXP_NAMECHAR: 2258 case XML_REGEXP_NOTNAMECHAR: 2259 case XML_REGEXP_DECIMAL: 2260 case XML_REGEXP_NOTDECIMAL: 2261 case XML_REGEXP_REALCHAR: 2262 case XML_REGEXP_NOTREALCHAR: 2263 case XML_REGEXP_LETTER: 2264 case XML_REGEXP_LETTER_UPPERCASE: 2265 case XML_REGEXP_LETTER_LOWERCASE: 2266 case XML_REGEXP_LETTER_TITLECASE: 2267 case XML_REGEXP_LETTER_MODIFIER: 2268 case XML_REGEXP_LETTER_OTHERS: 2269 case XML_REGEXP_MARK: 2270 case XML_REGEXP_MARK_NONSPACING: 2271 case XML_REGEXP_MARK_SPACECOMBINING: 2272 case XML_REGEXP_MARK_ENCLOSING: 2273 case XML_REGEXP_NUMBER: 2274 case XML_REGEXP_NUMBER_DECIMAL: 2275 case XML_REGEXP_NUMBER_LETTER: 2276 case XML_REGEXP_NUMBER_OTHERS: 2277 case XML_REGEXP_PUNCT: 2278 case XML_REGEXP_PUNCT_CONNECTOR: 2279 case XML_REGEXP_PUNCT_DASH: 2280 case XML_REGEXP_PUNCT_OPEN: 2281 case XML_REGEXP_PUNCT_CLOSE: 2282 case XML_REGEXP_PUNCT_INITQUOTE: 2283 case XML_REGEXP_PUNCT_FINQUOTE: 2284 case XML_REGEXP_PUNCT_OTHERS: 2285 case XML_REGEXP_SEPAR: 2286 case XML_REGEXP_SEPAR_SPACE: 2287 case XML_REGEXP_SEPAR_LINE: 2288 case XML_REGEXP_SEPAR_PARA: 2289 case XML_REGEXP_SYMBOL: 2290 case XML_REGEXP_SYMBOL_MATH: 2291 case XML_REGEXP_SYMBOL_CURRENCY: 2292 case XML_REGEXP_SYMBOL_MODIFIER: 2293 case XML_REGEXP_SYMBOL_OTHERS: 2294 case XML_REGEXP_OTHER: 2295 case XML_REGEXP_OTHER_CONTROL: 2296 case XML_REGEXP_OTHER_FORMAT: 2297 case XML_REGEXP_OTHER_PRIVATE: 2298 case XML_REGEXP_OTHER_NA: 2299 case XML_REGEXP_BLOCK_NAME: 2300 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 2301 (const xmlChar *)atom->valuep); 2302 if (atom->neg) 2303 ret = !ret; 2304 break; 2305 } 2306 return(ret); 2307} 2308 2309/************************************************************************ 2310 * * 2311 * Saving and restoring state of an execution context * 2312 * * 2313 ************************************************************************/ 2314 2315#ifdef DEBUG_REGEXP_EXEC 2316static void 2317xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 2318 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 2319 if (exec->inputStack != NULL) { 2320 int i; 2321 printf(": "); 2322 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 2323 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 2324 } else { 2325 printf(": %s", &(exec->inputString[exec->index])); 2326 } 2327 printf("\n"); 2328} 2329#endif 2330 2331static void 2332xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 2333#ifdef DEBUG_REGEXP_EXEC 2334 printf("saving "); 2335 exec->transno++; 2336 xmlFARegDebugExec(exec); 2337 exec->transno--; 2338#endif 2339 2340 if (exec->maxRollbacks == 0) { 2341 exec->maxRollbacks = 4; 2342 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 2343 sizeof(xmlRegExecRollback)); 2344 if (exec->rollbacks == NULL) { 2345 xmlRegexpErrMemory(NULL, "saving regexp"); 2346 exec->maxRollbacks = 0; 2347 return; 2348 } 2349 memset(exec->rollbacks, 0, 2350 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2351 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 2352 xmlRegExecRollback *tmp; 2353 int len = exec->maxRollbacks; 2354 2355 exec->maxRollbacks *= 2; 2356 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 2357 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2358 if (tmp == NULL) { 2359 xmlRegexpErrMemory(NULL, "saving regexp"); 2360 exec->maxRollbacks /= 2; 2361 return; 2362 } 2363 exec->rollbacks = tmp; 2364 tmp = &exec->rollbacks[len]; 2365 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2366 } 2367 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2368 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2369 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2370 if (exec->comp->nbCounters > 0) { 2371 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2372 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2373 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2374 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2375 xmlRegexpErrMemory(NULL, "saving regexp"); 2376 exec->status = -5; 2377 return; 2378 } 2379 } 2380 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2381 exec->comp->nbCounters * sizeof(int)); 2382 } 2383 exec->nbRollbacks++; 2384} 2385 2386static void 2387xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2388 if (exec->nbRollbacks <= 0) { 2389 exec->status = -1; 2390#ifdef DEBUG_REGEXP_EXEC 2391 printf("rollback failed on empty stack\n"); 2392#endif 2393 return; 2394 } 2395 exec->nbRollbacks--; 2396 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2397 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2398 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2399 if (exec->comp->nbCounters > 0) { 2400 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2401 fprintf(stderr, "exec save: allocation failed"); 2402 exec->status = -6; 2403 return; 2404 } 2405 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2406 exec->comp->nbCounters * sizeof(int)); 2407 } 2408 2409#ifdef DEBUG_REGEXP_EXEC 2410 printf("restored "); 2411 xmlFARegDebugExec(exec); 2412#endif 2413} 2414 2415/************************************************************************ 2416 * * 2417 * Verifier, running an input against a compiled regexp * 2418 * * 2419 ************************************************************************/ 2420 2421static int 2422xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2423 xmlRegExecCtxt execval; 2424 xmlRegExecCtxtPtr exec = &execval; 2425 int ret, codepoint = 0, len; 2426 2427 exec->inputString = content; 2428 exec->index = 0; 2429 exec->determinist = 1; 2430 exec->maxRollbacks = 0; 2431 exec->nbRollbacks = 0; 2432 exec->rollbacks = NULL; 2433 exec->status = 0; 2434 exec->comp = comp; 2435 exec->state = comp->states[0]; 2436 exec->transno = 0; 2437 exec->transcount = 0; 2438 exec->inputStack = NULL; 2439 exec->inputStackMax = 0; 2440 if (comp->nbCounters > 0) { 2441 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2442 if (exec->counts == NULL) { 2443 xmlRegexpErrMemory(NULL, "running regexp"); 2444 return(-1); 2445 } 2446 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2447 } else 2448 exec->counts = NULL; 2449 while ((exec->status == 0) && 2450 ((exec->inputString[exec->index] != 0) || 2451 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2452 xmlRegTransPtr trans; 2453 xmlRegAtomPtr atom; 2454 2455 /* 2456 * If end of input on non-terminal state, rollback, however we may 2457 * still have epsilon like transition for counted transitions 2458 * on counters, in that case don't break too early. Additionally, 2459 * if we are working on a range like "AB{0,2}", where B is not present, 2460 * we don't want to break. 2461 */ 2462 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 2463 /* 2464 * if there is a transition, we must check if 2465 * atom allows minOccurs of 0 2466 */ 2467 if (exec->transno < exec->state->nbTrans) { 2468 trans = &exec->state->trans[exec->transno]; 2469 if (trans->to >=0) { 2470 atom = trans->atom; 2471 if (!((atom->min == 0) && (atom->max > 0))) 2472 goto rollback; 2473 } 2474 } else 2475 goto rollback; 2476 } 2477 2478 exec->transcount = 0; 2479 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2480 trans = &exec->state->trans[exec->transno]; 2481 if (trans->to < 0) 2482 continue; 2483 atom = trans->atom; 2484 ret = 0; 2485 if (trans->count >= 0) { 2486 int count; 2487 xmlRegCounterPtr counter; 2488 2489 /* 2490 * A counted transition. 2491 */ 2492 2493 count = exec->counts[trans->count]; 2494 counter = &exec->comp->counters[trans->count]; 2495#ifdef DEBUG_REGEXP_EXEC 2496 printf("testing count %d: val %d, min %d, max %d\n", 2497 trans->count, count, counter->min, counter->max); 2498#endif 2499 ret = ((count >= counter->min) && (count <= counter->max)); 2500 } else if (atom == NULL) { 2501 fprintf(stderr, "epsilon transition left at runtime\n"); 2502 exec->status = -2; 2503 break; 2504 } else if (exec->inputString[exec->index] != 0) { 2505 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2506 ret = xmlRegCheckCharacter(atom, codepoint); 2507 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 2508 xmlRegStatePtr to = comp->states[trans->to]; 2509 2510 /* 2511 * this is a multiple input sequence 2512 */ 2513 if (exec->state->nbTrans > exec->transno + 1) { 2514 xmlFARegExecSave(exec); 2515 } 2516 exec->transcount = 1; 2517 do { 2518 /* 2519 * Try to progress as much as possible on the input 2520 */ 2521 if (exec->transcount == atom->max) { 2522 break; 2523 } 2524 exec->index += len; 2525 /* 2526 * End of input: stop here 2527 */ 2528 if (exec->inputString[exec->index] == 0) { 2529 exec->index -= len; 2530 break; 2531 } 2532 if (exec->transcount >= atom->min) { 2533 int transno = exec->transno; 2534 xmlRegStatePtr state = exec->state; 2535 2536 /* 2537 * The transition is acceptable save it 2538 */ 2539 exec->transno = -1; /* trick */ 2540 exec->state = to; 2541 xmlFARegExecSave(exec); 2542 exec->transno = transno; 2543 exec->state = state; 2544 } 2545 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2546 len); 2547 ret = xmlRegCheckCharacter(atom, codepoint); 2548 exec->transcount++; 2549 } while (ret == 1); 2550 if (exec->transcount < atom->min) 2551 ret = 0; 2552 2553 /* 2554 * If the last check failed but one transition was found 2555 * possible, rollback 2556 */ 2557 if (ret < 0) 2558 ret = 0; 2559 if (ret == 0) { 2560 goto rollback; 2561 } 2562 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 2563 /* 2564 * we don't match on the codepoint, but minOccurs of 0 2565 * says that's ok. Setting len to 0 inhibits stepping 2566 * over the codepoint. 2567 */ 2568 exec->transcount = 1; 2569 len = 0; 2570 ret = 1; 2571 } 2572 } else if ((atom->min == 0) && (atom->max > 0)) { 2573 /* another spot to match when minOccurs is 0 */ 2574 exec->transcount = 1; 2575 len = 0; 2576 ret = 1; 2577 } 2578 if (ret == 1) { 2579 if (exec->state->nbTrans > exec->transno + 1) { 2580 xmlFARegExecSave(exec); 2581 } 2582 if (trans->counter >= 0) { 2583#ifdef DEBUG_REGEXP_EXEC 2584 printf("Increasing count %d\n", trans->counter); 2585#endif 2586 exec->counts[trans->counter]++; 2587 } 2588 if ((trans->count >= 0) && 2589 (trans->count < REGEXP_ALL_COUNTER)) { 2590#ifdef DEBUG_REGEXP_EXEC 2591 printf("resetting count %d on transition\n", 2592 trans->count); 2593#endif 2594 exec->counts[trans->count] = 0; 2595 } 2596#ifdef DEBUG_REGEXP_EXEC 2597 printf("entering state %d\n", trans->to); 2598#endif 2599 exec->state = comp->states[trans->to]; 2600 exec->transno = 0; 2601 if (trans->atom != NULL) { 2602 exec->index += len; 2603 } 2604 goto progress; 2605 } else if (ret < 0) { 2606 exec->status = -4; 2607 break; 2608 } 2609 } 2610 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2611rollback: 2612 /* 2613 * Failed to find a way out 2614 */ 2615 exec->determinist = 0; 2616 xmlFARegExecRollBack(exec); 2617 } 2618progress: 2619 continue; 2620 } 2621 if (exec->rollbacks != NULL) { 2622 if (exec->counts != NULL) { 2623 int i; 2624 2625 for (i = 0;i < exec->maxRollbacks;i++) 2626 if (exec->rollbacks[i].counts != NULL) 2627 xmlFree(exec->rollbacks[i].counts); 2628 } 2629 xmlFree(exec->rollbacks); 2630 } 2631 if (exec->counts != NULL) 2632 xmlFree(exec->counts); 2633 if (exec->status == 0) 2634 return(1); 2635 if (exec->status == -1) 2636 return(0); 2637 return(exec->status); 2638} 2639 2640/************************************************************************ 2641 * * 2642 * Progressive interface to the verifier one atom at a time * 2643 * * 2644 ************************************************************************/ 2645#ifdef DEBUG_ERR 2646static void testerr(xmlRegExecCtxtPtr exec); 2647#endif 2648 2649/** 2650 * xmlRegNewExecCtxt: 2651 * @comp: a precompiled regular expression 2652 * @callback: a callback function used for handling progresses in the 2653 * automata matching phase 2654 * @data: the context data associated to the callback in this context 2655 * 2656 * Build a context used for progressive evaluation of a regexp. 2657 * 2658 * Returns the new context 2659 */ 2660xmlRegExecCtxtPtr 2661xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 2662 xmlRegExecCtxtPtr exec; 2663 2664 if (comp == NULL) 2665 return(NULL); 2666 if ((comp->compact == NULL) && (comp->states == NULL)) 2667 return(NULL); 2668 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 2669 if (exec == NULL) { 2670 xmlRegexpErrMemory(NULL, "creating execution context"); 2671 return(NULL); 2672 } 2673 memset(exec, 0, sizeof(xmlRegExecCtxt)); 2674 exec->inputString = NULL; 2675 exec->index = 0; 2676 exec->determinist = 1; 2677 exec->maxRollbacks = 0; 2678 exec->nbRollbacks = 0; 2679 exec->rollbacks = NULL; 2680 exec->status = 0; 2681 exec->comp = comp; 2682 if (comp->compact == NULL) 2683 exec->state = comp->states[0]; 2684 exec->transno = 0; 2685 exec->transcount = 0; 2686 exec->callback = callback; 2687 exec->data = data; 2688 if (comp->nbCounters > 0) { 2689 /* 2690 * For error handling, exec->counts is allocated twice the size 2691 * the second half is used to store the data in case of rollback 2692 */ 2693 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 2694 * 2); 2695 if (exec->counts == NULL) { 2696 xmlRegexpErrMemory(NULL, "creating execution context"); 2697 xmlFree(exec); 2698 return(NULL); 2699 } 2700 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 2701 exec->errCounts = &exec->counts[comp->nbCounters]; 2702 } else { 2703 exec->counts = NULL; 2704 exec->errCounts = NULL; 2705 } 2706 exec->inputStackMax = 0; 2707 exec->inputStackNr = 0; 2708 exec->inputStack = NULL; 2709 exec->errStateNo = -1; 2710 exec->errString = NULL; 2711 return(exec); 2712} 2713 2714/** 2715 * xmlRegFreeExecCtxt: 2716 * @exec: a regular expression evaulation context 2717 * 2718 * Free the structures associated to a regular expression evaulation context. 2719 */ 2720void 2721xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 2722 if (exec == NULL) 2723 return; 2724 2725 if (exec->rollbacks != NULL) { 2726 if (exec->counts != NULL) { 2727 int i; 2728 2729 for (i = 0;i < exec->maxRollbacks;i++) 2730 if (exec->rollbacks[i].counts != NULL) 2731 xmlFree(exec->rollbacks[i].counts); 2732 } 2733 xmlFree(exec->rollbacks); 2734 } 2735 if (exec->counts != NULL) 2736 xmlFree(exec->counts); 2737 if (exec->inputStack != NULL) { 2738 int i; 2739 2740 for (i = 0;i < exec->inputStackNr;i++) { 2741 if (exec->inputStack[i].value != NULL) 2742 xmlFree(exec->inputStack[i].value); 2743 } 2744 xmlFree(exec->inputStack); 2745 } 2746 if (exec->errString != NULL) 2747 xmlFree(exec->errString); 2748 xmlFree(exec); 2749} 2750 2751static void 2752xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2753 void *data) { 2754#ifdef DEBUG_PUSH 2755 printf("saving value: %d:%s\n", exec->inputStackNr, value); 2756#endif 2757 if (exec->inputStackMax == 0) { 2758 exec->inputStackMax = 4; 2759 exec->inputStack = (xmlRegInputTokenPtr) 2760 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 2761 if (exec->inputStack == NULL) { 2762 xmlRegexpErrMemory(NULL, "pushing input string"); 2763 exec->inputStackMax = 0; 2764 return; 2765 } 2766 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 2767 xmlRegInputTokenPtr tmp; 2768 2769 exec->inputStackMax *= 2; 2770 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 2771 exec->inputStackMax * sizeof(xmlRegInputToken)); 2772 if (tmp == NULL) { 2773 xmlRegexpErrMemory(NULL, "pushing input string"); 2774 exec->inputStackMax /= 2; 2775 return; 2776 } 2777 exec->inputStack = tmp; 2778 } 2779 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 2780 exec->inputStack[exec->inputStackNr].data = data; 2781 exec->inputStackNr++; 2782 exec->inputStack[exec->inputStackNr].value = NULL; 2783 exec->inputStack[exec->inputStackNr].data = NULL; 2784} 2785 2786/** 2787 * xmlRegStrEqualWildcard: 2788 * @expStr: the string to be evaluated 2789 * @valStr: the validation string 2790 * 2791 * Checks if both strings are equal or have the same content. "*" 2792 * can be used as a wildcard in @valStr; "|" is used as a seperator of 2793 * substrings in both @expStr and @valStr. 2794 * 2795 * Returns 1 if the comparison is satisfied and the number of substrings 2796 * is equal, 0 otherwise. 2797 */ 2798 2799static int 2800xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 2801 if (expStr == valStr) return(1); 2802 if (expStr == NULL) return(0); 2803 if (valStr == NULL) return(0); 2804 do { 2805 /* 2806 * Eval if we have a wildcard for the current item. 2807 */ 2808 if (*expStr != *valStr) { 2809 /* if one of them starts with a wildcard make valStr be it */ 2810 if (*valStr == '*') { 2811 const xmlChar *tmp; 2812 2813 tmp = valStr; 2814 valStr = expStr; 2815 expStr = tmp; 2816 } 2817 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 2818 do { 2819 if (*valStr == XML_REG_STRING_SEPARATOR) 2820 break; 2821 valStr++; 2822 } while (*valStr != 0); 2823 continue; 2824 } else 2825 return(0); 2826 } 2827 expStr++; 2828 valStr++; 2829 } while (*valStr != 0); 2830 if (*expStr != 0) 2831 return (0); 2832 else 2833 return (1); 2834} 2835 2836/** 2837 * xmlRegCompactPushString: 2838 * @exec: a regexp execution context 2839 * @comp: the precompiled exec with a compact table 2840 * @value: a string token input 2841 * @data: data associated to the token to reuse in callbacks 2842 * 2843 * Push one input token in the execution context 2844 * 2845 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2846 * a negative value in case of error. 2847 */ 2848static int 2849xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 2850 xmlRegexpPtr comp, 2851 const xmlChar *value, 2852 void *data) { 2853 int state = exec->index; 2854 int i, target; 2855 2856 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 2857 return(-1); 2858 2859 if (value == NULL) { 2860 /* 2861 * are we at a final state ? 2862 */ 2863 if (comp->compact[state * (comp->nbstrings + 1)] == 2864 XML_REGEXP_FINAL_STATE) 2865 return(1); 2866 return(0); 2867 } 2868 2869#ifdef DEBUG_PUSH 2870 printf("value pushed: %s\n", value); 2871#endif 2872 2873 /* 2874 * Examine all outside transitions from current state 2875 */ 2876 for (i = 0;i < comp->nbstrings;i++) { 2877 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 2878 if ((target > 0) && (target <= comp->nbstates)) { 2879 target--; /* to avoid 0 */ 2880 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 2881 exec->index = target; 2882 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 2883 exec->callback(exec->data, value, 2884 comp->transdata[state * comp->nbstrings + i], data); 2885 } 2886#ifdef DEBUG_PUSH 2887 printf("entering state %d\n", target); 2888#endif 2889 if (comp->compact[target * (comp->nbstrings + 1)] == 2890 XML_REGEXP_SINK_STATE) 2891 goto error; 2892 2893 if (comp->compact[target * (comp->nbstrings + 1)] == 2894 XML_REGEXP_FINAL_STATE) 2895 return(1); 2896 return(0); 2897 } 2898 } 2899 } 2900 /* 2901 * Failed to find an exit transition out from current state for the 2902 * current token 2903 */ 2904#ifdef DEBUG_PUSH 2905 printf("failed to find a transition for %s on state %d\n", value, state); 2906#endif 2907error: 2908 if (exec->errString != NULL) 2909 xmlFree(exec->errString); 2910 exec->errString = xmlStrdup(value); 2911 exec->errStateNo = state; 2912 exec->status = -1; 2913#ifdef DEBUG_ERR 2914 testerr(exec); 2915#endif 2916 return(-1); 2917} 2918 2919/** 2920 * xmlRegExecPushStringInternal: 2921 * @exec: a regexp execution context or NULL to indicate the end 2922 * @value: a string token input 2923 * @data: data associated to the token to reuse in callbacks 2924 * @compound: value was assembled from 2 strings 2925 * 2926 * Push one input token in the execution context 2927 * 2928 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2929 * a negative value in case of error. 2930 */ 2931static int 2932xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 2933 void *data, int compound) { 2934 xmlRegTransPtr trans; 2935 xmlRegAtomPtr atom; 2936 int ret; 2937 int final = 0; 2938 int progress = 1; 2939 2940 if (exec == NULL) 2941 return(-1); 2942 if (exec->comp == NULL) 2943 return(-1); 2944 if (exec->status != 0) 2945 return(exec->status); 2946 2947 if (exec->comp->compact != NULL) 2948 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 2949 2950 if (value == NULL) { 2951 if (exec->state->type == XML_REGEXP_FINAL_STATE) 2952 return(1); 2953 final = 1; 2954 } 2955 2956#ifdef DEBUG_PUSH 2957 printf("value pushed: %s\n", value); 2958#endif 2959 /* 2960 * If we have an active rollback stack push the new value there 2961 * and get back to where we were left 2962 */ 2963 if ((value != NULL) && (exec->inputStackNr > 0)) { 2964 xmlFARegExecSaveInputString(exec, value, data); 2965 value = exec->inputStack[exec->index].value; 2966 data = exec->inputStack[exec->index].data; 2967#ifdef DEBUG_PUSH 2968 printf("value loaded: %s\n", value); 2969#endif 2970 } 2971 2972 while ((exec->status == 0) && 2973 ((value != NULL) || 2974 ((final == 1) && 2975 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 2976 2977 /* 2978 * End of input on non-terminal state, rollback, however we may 2979 * still have epsilon like transition for counted transitions 2980 * on counters, in that case don't break too early. 2981 */ 2982 if ((value == NULL) && (exec->counts == NULL)) 2983 goto rollback; 2984 2985 exec->transcount = 0; 2986 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2987 trans = &exec->state->trans[exec->transno]; 2988 if (trans->to < 0) 2989 continue; 2990 atom = trans->atom; 2991 ret = 0; 2992 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 2993 int i; 2994 int count; 2995 xmlRegTransPtr t; 2996 xmlRegCounterPtr counter; 2997 2998 ret = 0; 2999 3000#ifdef DEBUG_PUSH 3001 printf("testing all lax %d\n", trans->count); 3002#endif 3003 /* 3004 * Check all counted transitions from the current state 3005 */ 3006 if ((value == NULL) && (final)) { 3007 ret = 1; 3008 } else if (value != NULL) { 3009 for (i = 0;i < exec->state->nbTrans;i++) { 3010 t = &exec->state->trans[i]; 3011 if ((t->counter < 0) || (t == trans)) 3012 continue; 3013 counter = &exec->comp->counters[t->counter]; 3014 count = exec->counts[t->counter]; 3015 if ((count < counter->max) && 3016 (t->atom != NULL) && 3017 (xmlStrEqual(value, t->atom->valuep))) { 3018 ret = 0; 3019 break; 3020 } 3021 if ((count >= counter->min) && 3022 (count < counter->max) && 3023 (xmlStrEqual(value, t->atom->valuep))) { 3024 ret = 1; 3025 break; 3026 } 3027 } 3028 } 3029 } else if (trans->count == REGEXP_ALL_COUNTER) { 3030 int i; 3031 int count; 3032 xmlRegTransPtr t; 3033 xmlRegCounterPtr counter; 3034 3035 ret = 1; 3036 3037#ifdef DEBUG_PUSH 3038 printf("testing all %d\n", trans->count); 3039#endif 3040 /* 3041 * Check all counted transitions from the current state 3042 */ 3043 for (i = 0;i < exec->state->nbTrans;i++) { 3044 t = &exec->state->trans[i]; 3045 if ((t->counter < 0) || (t == trans)) 3046 continue; 3047 counter = &exec->comp->counters[t->counter]; 3048 count = exec->counts[t->counter]; 3049 if ((count < counter->min) || (count > counter->max)) { 3050 ret = 0; 3051 break; 3052 } 3053 } 3054 } else if (trans->count >= 0) { 3055 int count; 3056 xmlRegCounterPtr counter; 3057 3058 /* 3059 * A counted transition. 3060 */ 3061 3062 count = exec->counts[trans->count]; 3063 counter = &exec->comp->counters[trans->count]; 3064#ifdef DEBUG_PUSH 3065 printf("testing count %d: val %d, min %d, max %d\n", 3066 trans->count, count, counter->min, counter->max); 3067#endif 3068 ret = ((count >= counter->min) && (count <= counter->max)); 3069 } else if (atom == NULL) { 3070 fprintf(stderr, "epsilon transition left at runtime\n"); 3071 exec->status = -2; 3072 break; 3073 } else if (value != NULL) { 3074 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3075 if (atom->neg) { 3076 ret = !ret; 3077 if (!compound) 3078 ret = 0; 3079 } 3080 if ((ret == 1) && (trans->counter >= 0)) { 3081 xmlRegCounterPtr counter; 3082 int count; 3083 3084 count = exec->counts[trans->counter]; 3085 counter = &exec->comp->counters[trans->counter]; 3086 if (count >= counter->max) 3087 ret = 0; 3088 } 3089 3090 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3091 xmlRegStatePtr to = exec->comp->states[trans->to]; 3092 3093 /* 3094 * this is a multiple input sequence 3095 */ 3096 if (exec->state->nbTrans > exec->transno + 1) { 3097 if (exec->inputStackNr <= 0) { 3098 xmlFARegExecSaveInputString(exec, value, data); 3099 } 3100 xmlFARegExecSave(exec); 3101 } 3102 exec->transcount = 1; 3103 do { 3104 /* 3105 * Try to progress as much as possible on the input 3106 */ 3107 if (exec->transcount == atom->max) { 3108 break; 3109 } 3110 exec->index++; 3111 value = exec->inputStack[exec->index].value; 3112 data = exec->inputStack[exec->index].data; 3113#ifdef DEBUG_PUSH 3114 printf("value loaded: %s\n", value); 3115#endif 3116 3117 /* 3118 * End of input: stop here 3119 */ 3120 if (value == NULL) { 3121 exec->index --; 3122 break; 3123 } 3124 if (exec->transcount >= atom->min) { 3125 int transno = exec->transno; 3126 xmlRegStatePtr state = exec->state; 3127 3128 /* 3129 * The transition is acceptable save it 3130 */ 3131 exec->transno = -1; /* trick */ 3132 exec->state = to; 3133 if (exec->inputStackNr <= 0) { 3134 xmlFARegExecSaveInputString(exec, value, data); 3135 } 3136 xmlFARegExecSave(exec); 3137 exec->transno = transno; 3138 exec->state = state; 3139 } 3140 ret = xmlStrEqual(value, atom->valuep); 3141 exec->transcount++; 3142 } while (ret == 1); 3143 if (exec->transcount < atom->min) 3144 ret = 0; 3145 3146 /* 3147 * If the last check failed but one transition was found 3148 * possible, rollback 3149 */ 3150 if (ret < 0) 3151 ret = 0; 3152 if (ret == 0) { 3153 goto rollback; 3154 } 3155 } 3156 } 3157 if (ret == 1) { 3158 if ((exec->callback != NULL) && (atom != NULL) && 3159 (data != NULL)) { 3160 exec->callback(exec->data, atom->valuep, 3161 atom->data, data); 3162 } 3163 if (exec->state->nbTrans > exec->transno + 1) { 3164 if (exec->inputStackNr <= 0) { 3165 xmlFARegExecSaveInputString(exec, value, data); 3166 } 3167 xmlFARegExecSave(exec); 3168 } 3169 if (trans->counter >= 0) { 3170#ifdef DEBUG_PUSH 3171 printf("Increasing count %d\n", trans->counter); 3172#endif 3173 exec->counts[trans->counter]++; 3174 } 3175 if ((trans->count >= 0) && 3176 (trans->count < REGEXP_ALL_COUNTER)) { 3177#ifdef DEBUG_REGEXP_EXEC 3178 printf("resetting count %d on transition\n", 3179 trans->count); 3180#endif 3181 exec->counts[trans->count] = 0; 3182 } 3183#ifdef DEBUG_PUSH 3184 printf("entering state %d\n", trans->to); 3185#endif 3186 if ((exec->comp->states[trans->to] != NULL) && 3187 (exec->comp->states[trans->to]->type == 3188 XML_REGEXP_SINK_STATE)) { 3189 /* 3190 * entering a sink state, save the current state as error 3191 * state. 3192 */ 3193 if (exec->errString != NULL) 3194 xmlFree(exec->errString); 3195 exec->errString = xmlStrdup(value); 3196 exec->errState = exec->state; 3197 memcpy(exec->errCounts, exec->counts, 3198 exec->comp->nbCounters * sizeof(int)); 3199 } 3200 exec->state = exec->comp->states[trans->to]; 3201 exec->transno = 0; 3202 if (trans->atom != NULL) { 3203 if (exec->inputStack != NULL) { 3204 exec->index++; 3205 if (exec->index < exec->inputStackNr) { 3206 value = exec->inputStack[exec->index].value; 3207 data = exec->inputStack[exec->index].data; 3208#ifdef DEBUG_PUSH 3209 printf("value loaded: %s\n", value); 3210#endif 3211 } else { 3212 value = NULL; 3213 data = NULL; 3214#ifdef DEBUG_PUSH 3215 printf("end of input\n"); 3216#endif 3217 } 3218 } else { 3219 value = NULL; 3220 data = NULL; 3221#ifdef DEBUG_PUSH 3222 printf("end of input\n"); 3223#endif 3224 } 3225 } 3226 goto progress; 3227 } else if (ret < 0) { 3228 exec->status = -4; 3229 break; 3230 } 3231 } 3232 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3233rollback: 3234 /* 3235 * if we didn't yet rollback on the current input 3236 * store the current state as the error state. 3237 */ 3238 if ((progress) && (exec->state != NULL) && 3239 (exec->state->type != XML_REGEXP_SINK_STATE)) { 3240 progress = 0; 3241 if (exec->errString != NULL) 3242 xmlFree(exec->errString); 3243 exec->errString = xmlStrdup(value); 3244 exec->errState = exec->state; 3245 memcpy(exec->errCounts, exec->counts, 3246 exec->comp->nbCounters * sizeof(int)); 3247 } 3248 3249 /* 3250 * Failed to find a way out 3251 */ 3252 exec->determinist = 0; 3253 xmlFARegExecRollBack(exec); 3254 if (exec->status == 0) { 3255 value = exec->inputStack[exec->index].value; 3256 data = exec->inputStack[exec->index].data; 3257#ifdef DEBUG_PUSH 3258 printf("value loaded: %s\n", value); 3259#endif 3260 } 3261 } 3262 continue; 3263progress: 3264 progress = 1; 3265 continue; 3266 } 3267 if (exec->status == 0) { 3268 return(exec->state->type == XML_REGEXP_FINAL_STATE); 3269 } 3270#ifdef DEBUG_ERR 3271 if (exec->status < 0) { 3272 testerr(exec); 3273 } 3274#endif 3275 return(exec->status); 3276} 3277 3278/** 3279 * xmlRegExecPushString: 3280 * @exec: a regexp execution context or NULL to indicate the end 3281 * @value: a string token input 3282 * @data: data associated to the token to reuse in callbacks 3283 * 3284 * Push one input token in the execution context 3285 * 3286 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3287 * a negative value in case of error. 3288 */ 3289int 3290xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3291 void *data) { 3292 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 3293} 3294 3295/** 3296 * xmlRegExecPushString2: 3297 * @exec: a regexp execution context or NULL to indicate the end 3298 * @value: the first string token input 3299 * @value2: the second string token input 3300 * @data: data associated to the token to reuse in callbacks 3301 * 3302 * Push one input token in the execution context 3303 * 3304 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3305 * a negative value in case of error. 3306 */ 3307int 3308xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 3309 const xmlChar *value2, void *data) { 3310 xmlChar buf[150]; 3311 int lenn, lenp, ret; 3312 xmlChar *str; 3313 3314 if (exec == NULL) 3315 return(-1); 3316 if (exec->comp == NULL) 3317 return(-1); 3318 if (exec->status != 0) 3319 return(exec->status); 3320 3321 if (value2 == NULL) 3322 return(xmlRegExecPushString(exec, value, data)); 3323 3324 lenn = strlen((char *) value2); 3325 lenp = strlen((char *) value); 3326 3327 if (150 < lenn + lenp + 2) { 3328 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 3329 if (str == NULL) { 3330 exec->status = -1; 3331 return(-1); 3332 } 3333 } else { 3334 str = buf; 3335 } 3336 memcpy(&str[0], value, lenp); 3337 str[lenp] = XML_REG_STRING_SEPARATOR; 3338 memcpy(&str[lenp + 1], value2, lenn); 3339 str[lenn + lenp + 1] = 0; 3340 3341 if (exec->comp->compact != NULL) 3342 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 3343 else 3344 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 3345 3346 if (str != buf) 3347 xmlFree(buf); 3348 return(ret); 3349} 3350 3351/** 3352 * xmlRegExecGetValues: 3353 * @exec: a regexp execution context 3354 * @err: error extraction or normal one 3355 * @nbval: pointer to the number of accepted values IN/OUT 3356 * @nbneg: return number of negative transitions 3357 * @values: pointer to the array of acceptable values 3358 * @terminal: return value if this was a terminal state 3359 * 3360 * Extract informations from the regexp execution, internal routine to 3361 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 3362 * 3363 * Returns: 0 in case of success or -1 in case of error. 3364 */ 3365static int 3366xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 3367 int *nbval, int *nbneg, 3368 xmlChar **values, int *terminal) { 3369 int maxval; 3370 int nb = 0; 3371 3372 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 3373 (values == NULL) || (*nbval <= 0)) 3374 return(-1); 3375 3376 maxval = *nbval; 3377 *nbval = 0; 3378 *nbneg = 0; 3379 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 3380 xmlRegexpPtr comp; 3381 int target, i, state; 3382 3383 comp = exec->comp; 3384 3385 if (err) { 3386 if (exec->errStateNo == -1) return(-1); 3387 state = exec->errStateNo; 3388 } else { 3389 state = exec->index; 3390 } 3391 if (terminal != NULL) { 3392 if (comp->compact[state * (comp->nbstrings + 1)] == 3393 XML_REGEXP_FINAL_STATE) 3394 *terminal = 1; 3395 else 3396 *terminal = 0; 3397 } 3398 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 3399 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3400 if ((target > 0) && (target <= comp->nbstates) && 3401 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 3402 XML_REGEXP_SINK_STATE)) { 3403 values[nb++] = comp->stringMap[i]; 3404 (*nbval)++; 3405 } 3406 } 3407 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 3408 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3409 if ((target > 0) && (target <= comp->nbstates) && 3410 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 3411 XML_REGEXP_SINK_STATE)) { 3412 values[nb++] = comp->stringMap[i]; 3413 (*nbneg)++; 3414 } 3415 } 3416 } else { 3417 int transno; 3418 xmlRegTransPtr trans; 3419 xmlRegAtomPtr atom; 3420 xmlRegStatePtr state; 3421 3422 if (terminal != NULL) { 3423 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3424 *terminal = 1; 3425 else 3426 *terminal = 0; 3427 } 3428 3429 if (err) { 3430 if (exec->errState == NULL) return(-1); 3431 state = exec->errState; 3432 } else { 3433 if (exec->state == NULL) return(-1); 3434 state = exec->state; 3435 } 3436 for (transno = 0; 3437 (transno < state->nbTrans) && (nb < maxval); 3438 transno++) { 3439 trans = &state->trans[transno]; 3440 if (trans->to < 0) 3441 continue; 3442 atom = trans->atom; 3443 if ((atom == NULL) || (atom->valuep == NULL)) 3444 continue; 3445 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3446 /* this should not be reached but ... */ 3447 TODO; 3448 } else if (trans->count == REGEXP_ALL_COUNTER) { 3449 /* this should not be reached but ... */ 3450 TODO; 3451 } else if (trans->counter >= 0) { 3452 xmlRegCounterPtr counter; 3453 int count; 3454 3455 if (err) 3456 count = exec->errCounts[trans->counter]; 3457 else 3458 count = exec->counts[trans->counter]; 3459 counter = &exec->comp->counters[trans->counter]; 3460 if (count < counter->max) { 3461 if (atom->neg) 3462 values[nb++] = (xmlChar *) atom->valuep2; 3463 else 3464 values[nb++] = (xmlChar *) atom->valuep; 3465 (*nbval)++; 3466 } 3467 } else { 3468 if ((exec->comp->states[trans->to] != NULL) && 3469 (exec->comp->states[trans->to]->type != 3470 XML_REGEXP_SINK_STATE)) { 3471 if (atom->neg) 3472 values[nb++] = (xmlChar *) atom->valuep2; 3473 else 3474 values[nb++] = (xmlChar *) atom->valuep; 3475 (*nbval)++; 3476 } 3477 } 3478 } 3479 for (transno = 0; 3480 (transno < state->nbTrans) && (nb < maxval); 3481 transno++) { 3482 trans = &state->trans[transno]; 3483 if (trans->to < 0) 3484 continue; 3485 atom = trans->atom; 3486 if ((atom == NULL) || (atom->valuep == NULL)) 3487 continue; 3488 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3489 continue; 3490 } else if (trans->count == REGEXP_ALL_COUNTER) { 3491 continue; 3492 } else if (trans->counter >= 0) { 3493 continue; 3494 } else { 3495 if ((exec->comp->states[trans->to] != NULL) && 3496 (exec->comp->states[trans->to]->type == 3497 XML_REGEXP_SINK_STATE)) { 3498 if (atom->neg) 3499 values[nb++] = (xmlChar *) atom->valuep2; 3500 else 3501 values[nb++] = (xmlChar *) atom->valuep; 3502 (*nbneg)++; 3503 } 3504 } 3505 } 3506 } 3507 return(0); 3508} 3509 3510/** 3511 * xmlRegExecNextValues: 3512 * @exec: a regexp execution context 3513 * @nbval: pointer to the number of accepted values IN/OUT 3514 * @nbneg: return number of negative transitions 3515 * @values: pointer to the array of acceptable values 3516 * @terminal: return value if this was a terminal state 3517 * 3518 * Extract informations from the regexp execution, 3519 * the parameter @values must point to an array of @nbval string pointers 3520 * on return nbval will contain the number of possible strings in that 3521 * state and the @values array will be updated with them. The string values 3522 * returned will be freed with the @exec context and don't need to be 3523 * deallocated. 3524 * 3525 * Returns: 0 in case of success or -1 in case of error. 3526 */ 3527int 3528xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 3529 xmlChar **values, int *terminal) { 3530 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 3531} 3532 3533/** 3534 * xmlRegExecErrInfo: 3535 * @exec: a regexp execution context generating an error 3536 * @string: return value for the error string 3537 * @nbval: pointer to the number of accepted values IN/OUT 3538 * @nbneg: return number of negative transitions 3539 * @values: pointer to the array of acceptable values 3540 * @terminal: return value if this was a terminal state 3541 * 3542 * Extract error informations from the regexp execution, the parameter 3543 * @string will be updated with the value pushed and not accepted, 3544 * the parameter @values must point to an array of @nbval string pointers 3545 * on return nbval will contain the number of possible strings in that 3546 * state and the @values array will be updated with them. The string values 3547 * returned will be freed with the @exec context and don't need to be 3548 * deallocated. 3549 * 3550 * Returns: 0 in case of success or -1 in case of error. 3551 */ 3552int 3553xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 3554 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 3555 if (exec == NULL) 3556 return(-1); 3557 if (string != NULL) { 3558 if (exec->status != 0) 3559 *string = exec->errString; 3560 else 3561 *string = NULL; 3562 } 3563 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 3564} 3565 3566#ifdef DEBUG_ERR 3567static void testerr(xmlRegExecCtxtPtr exec) { 3568 const xmlChar *string; 3569 xmlChar *values[5]; 3570 int nb = 5; 3571 int nbneg; 3572 int terminal; 3573 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 3574} 3575#endif 3576 3577#if 0 3578static int 3579xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 3580 xmlRegTransPtr trans; 3581 xmlRegAtomPtr atom; 3582 int ret; 3583 int codepoint, len; 3584 3585 if (exec == NULL) 3586 return(-1); 3587 if (exec->status != 0) 3588 return(exec->status); 3589 3590 while ((exec->status == 0) && 3591 ((exec->inputString[exec->index] != 0) || 3592 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 3593 3594 /* 3595 * End of input on non-terminal state, rollback, however we may 3596 * still have epsilon like transition for counted transitions 3597 * on counters, in that case don't break too early. 3598 */ 3599 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 3600 goto rollback; 3601 3602 exec->transcount = 0; 3603 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3604 trans = &exec->state->trans[exec->transno]; 3605 if (trans->to < 0) 3606 continue; 3607 atom = trans->atom; 3608 ret = 0; 3609 if (trans->count >= 0) { 3610 int count; 3611 xmlRegCounterPtr counter; 3612 3613 /* 3614 * A counted transition. 3615 */ 3616 3617 count = exec->counts[trans->count]; 3618 counter = &exec->comp->counters[trans->count]; 3619#ifdef DEBUG_REGEXP_EXEC 3620 printf("testing count %d: val %d, min %d, max %d\n", 3621 trans->count, count, counter->min, counter->max); 3622#endif 3623 ret = ((count >= counter->min) && (count <= counter->max)); 3624 } else if (atom == NULL) { 3625 fprintf(stderr, "epsilon transition left at runtime\n"); 3626 exec->status = -2; 3627 break; 3628 } else if (exec->inputString[exec->index] != 0) { 3629 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3630 ret = xmlRegCheckCharacter(atom, codepoint); 3631 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3632 xmlRegStatePtr to = exec->comp->states[trans->to]; 3633 3634 /* 3635 * this is a multiple input sequence 3636 */ 3637 if (exec->state->nbTrans > exec->transno + 1) { 3638 xmlFARegExecSave(exec); 3639 } 3640 exec->transcount = 1; 3641 do { 3642 /* 3643 * Try to progress as much as possible on the input 3644 */ 3645 if (exec->transcount == atom->max) { 3646 break; 3647 } 3648 exec->index += len; 3649 /* 3650 * End of input: stop here 3651 */ 3652 if (exec->inputString[exec->index] == 0) { 3653 exec->index -= len; 3654 break; 3655 } 3656 if (exec->transcount >= atom->min) { 3657 int transno = exec->transno; 3658 xmlRegStatePtr state = exec->state; 3659 3660 /* 3661 * The transition is acceptable save it 3662 */ 3663 exec->transno = -1; /* trick */ 3664 exec->state = to; 3665 xmlFARegExecSave(exec); 3666 exec->transno = transno; 3667 exec->state = state; 3668 } 3669 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3670 len); 3671 ret = xmlRegCheckCharacter(atom, codepoint); 3672 exec->transcount++; 3673 } while (ret == 1); 3674 if (exec->transcount < atom->min) 3675 ret = 0; 3676 3677 /* 3678 * If the last check failed but one transition was found 3679 * possible, rollback 3680 */ 3681 if (ret < 0) 3682 ret = 0; 3683 if (ret == 0) { 3684 goto rollback; 3685 } 3686 } 3687 } 3688 if (ret == 1) { 3689 if (exec->state->nbTrans > exec->transno + 1) { 3690 xmlFARegExecSave(exec); 3691 } 3692 if (trans->counter >= 0) { 3693#ifdef DEBUG_REGEXP_EXEC 3694 printf("Increasing count %d\n", trans->counter); 3695#endif 3696 exec->counts[trans->counter]++; 3697 } 3698#ifdef DEBUG_REGEXP_EXEC 3699 printf("entering state %d\n", trans->to); 3700#endif 3701 exec->state = exec->comp->states[trans->to]; 3702 exec->transno = 0; 3703 if (trans->atom != NULL) { 3704 exec->index += len; 3705 } 3706 goto progress; 3707 } else if (ret < 0) { 3708 exec->status = -4; 3709 break; 3710 } 3711 } 3712 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3713rollback: 3714 /* 3715 * Failed to find a way out 3716 */ 3717 exec->determinist = 0; 3718 xmlFARegExecRollBack(exec); 3719 } 3720progress: 3721 continue; 3722 } 3723} 3724#endif 3725/************************************************************************ 3726 * * 3727 * Parser for the Schemas Datatype Regular Expressions * 3728 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 3729 * * 3730 ************************************************************************/ 3731 3732/** 3733 * xmlFAIsChar: 3734 * @ctxt: a regexp parser context 3735 * 3736 * [10] Char ::= [^.\?*+()|#x5B#x5D] 3737 */ 3738static int 3739xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 3740 int cur; 3741 int len; 3742 3743 cur = CUR_SCHAR(ctxt->cur, len); 3744 if ((cur == '.') || (cur == '\\') || (cur == '?') || 3745 (cur == '*') || (cur == '+') || (cur == '(') || 3746 (cur == ')') || (cur == '|') || (cur == 0x5B) || 3747 (cur == 0x5D) || (cur == 0)) 3748 return(-1); 3749 return(cur); 3750} 3751 3752/** 3753 * xmlFAParseCharProp: 3754 * @ctxt: a regexp parser context 3755 * 3756 * [27] charProp ::= IsCategory | IsBlock 3757 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 3758 * Separators | Symbols | Others 3759 * [29] Letters ::= 'L' [ultmo]? 3760 * [30] Marks ::= 'M' [nce]? 3761 * [31] Numbers ::= 'N' [dlo]? 3762 * [32] Punctuation ::= 'P' [cdseifo]? 3763 * [33] Separators ::= 'Z' [slp]? 3764 * [34] Symbols ::= 'S' [mcko]? 3765 * [35] Others ::= 'C' [cfon]? 3766 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 3767 */ 3768static void 3769xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 3770 int cur; 3771 xmlRegAtomType type = (xmlRegAtomType) 0; 3772 xmlChar *blockName = NULL; 3773 3774 cur = CUR; 3775 if (cur == 'L') { 3776 NEXT; 3777 cur = CUR; 3778 if (cur == 'u') { 3779 NEXT; 3780 type = XML_REGEXP_LETTER_UPPERCASE; 3781 } else if (cur == 'l') { 3782 NEXT; 3783 type = XML_REGEXP_LETTER_LOWERCASE; 3784 } else if (cur == 't') { 3785 NEXT; 3786 type = XML_REGEXP_LETTER_TITLECASE; 3787 } else if (cur == 'm') { 3788 NEXT; 3789 type = XML_REGEXP_LETTER_MODIFIER; 3790 } else if (cur == 'o') { 3791 NEXT; 3792 type = XML_REGEXP_LETTER_OTHERS; 3793 } else { 3794 type = XML_REGEXP_LETTER; 3795 } 3796 } else if (cur == 'M') { 3797 NEXT; 3798 cur = CUR; 3799 if (cur == 'n') { 3800 NEXT; 3801 /* nonspacing */ 3802 type = XML_REGEXP_MARK_NONSPACING; 3803 } else if (cur == 'c') { 3804 NEXT; 3805 /* spacing combining */ 3806 type = XML_REGEXP_MARK_SPACECOMBINING; 3807 } else if (cur == 'e') { 3808 NEXT; 3809 /* enclosing */ 3810 type = XML_REGEXP_MARK_ENCLOSING; 3811 } else { 3812 /* all marks */ 3813 type = XML_REGEXP_MARK; 3814 } 3815 } else if (cur == 'N') { 3816 NEXT; 3817 cur = CUR; 3818 if (cur == 'd') { 3819 NEXT; 3820 /* digital */ 3821 type = XML_REGEXP_NUMBER_DECIMAL; 3822 } else if (cur == 'l') { 3823 NEXT; 3824 /* letter */ 3825 type = XML_REGEXP_NUMBER_LETTER; 3826 } else if (cur == 'o') { 3827 NEXT; 3828 /* other */ 3829 type = XML_REGEXP_NUMBER_OTHERS; 3830 } else { 3831 /* all numbers */ 3832 type = XML_REGEXP_NUMBER; 3833 } 3834 } else if (cur == 'P') { 3835 NEXT; 3836 cur = CUR; 3837 if (cur == 'c') { 3838 NEXT; 3839 /* connector */ 3840 type = XML_REGEXP_PUNCT_CONNECTOR; 3841 } else if (cur == 'd') { 3842 NEXT; 3843 /* dash */ 3844 type = XML_REGEXP_PUNCT_DASH; 3845 } else if (cur == 's') { 3846 NEXT; 3847 /* open */ 3848 type = XML_REGEXP_PUNCT_OPEN; 3849 } else if (cur == 'e') { 3850 NEXT; 3851 /* close */ 3852 type = XML_REGEXP_PUNCT_CLOSE; 3853 } else if (cur == 'i') { 3854 NEXT; 3855 /* initial quote */ 3856 type = XML_REGEXP_PUNCT_INITQUOTE; 3857 } else if (cur == 'f') { 3858 NEXT; 3859 /* final quote */ 3860 type = XML_REGEXP_PUNCT_FINQUOTE; 3861 } else if (cur == 'o') { 3862 NEXT; 3863 /* other */ 3864 type = XML_REGEXP_PUNCT_OTHERS; 3865 } else { 3866 /* all punctuation */ 3867 type = XML_REGEXP_PUNCT; 3868 } 3869 } else if (cur == 'Z') { 3870 NEXT; 3871 cur = CUR; 3872 if (cur == 's') { 3873 NEXT; 3874 /* space */ 3875 type = XML_REGEXP_SEPAR_SPACE; 3876 } else if (cur == 'l') { 3877 NEXT; 3878 /* line */ 3879 type = XML_REGEXP_SEPAR_LINE; 3880 } else if (cur == 'p') { 3881 NEXT; 3882 /* paragraph */ 3883 type = XML_REGEXP_SEPAR_PARA; 3884 } else { 3885 /* all separators */ 3886 type = XML_REGEXP_SEPAR; 3887 } 3888 } else if (cur == 'S') { 3889 NEXT; 3890 cur = CUR; 3891 if (cur == 'm') { 3892 NEXT; 3893 type = XML_REGEXP_SYMBOL_MATH; 3894 /* math */ 3895 } else if (cur == 'c') { 3896 NEXT; 3897 type = XML_REGEXP_SYMBOL_CURRENCY; 3898 /* currency */ 3899 } else if (cur == 'k') { 3900 NEXT; 3901 type = XML_REGEXP_SYMBOL_MODIFIER; 3902 /* modifiers */ 3903 } else if (cur == 'o') { 3904 NEXT; 3905 type = XML_REGEXP_SYMBOL_OTHERS; 3906 /* other */ 3907 } else { 3908 /* all symbols */ 3909 type = XML_REGEXP_SYMBOL; 3910 } 3911 } else if (cur == 'C') { 3912 NEXT; 3913 cur = CUR; 3914 if (cur == 'c') { 3915 NEXT; 3916 /* control */ 3917 type = XML_REGEXP_OTHER_CONTROL; 3918 } else if (cur == 'f') { 3919 NEXT; 3920 /* format */ 3921 type = XML_REGEXP_OTHER_FORMAT; 3922 } else if (cur == 'o') { 3923 NEXT; 3924 /* private use */ 3925 type = XML_REGEXP_OTHER_PRIVATE; 3926 } else if (cur == 'n') { 3927 NEXT; 3928 /* not assigned */ 3929 type = XML_REGEXP_OTHER_NA; 3930 } else { 3931 /* all others */ 3932 type = XML_REGEXP_OTHER; 3933 } 3934 } else if (cur == 'I') { 3935 const xmlChar *start; 3936 NEXT; 3937 cur = CUR; 3938 if (cur != 's') { 3939 ERROR("IsXXXX expected"); 3940 return; 3941 } 3942 NEXT; 3943 start = ctxt->cur; 3944 cur = CUR; 3945 if (((cur >= 'a') && (cur <= 'z')) || 3946 ((cur >= 'A') && (cur <= 'Z')) || 3947 ((cur >= '0') && (cur <= '9')) || 3948 (cur == 0x2D)) { 3949 NEXT; 3950 cur = CUR; 3951 while (((cur >= 'a') && (cur <= 'z')) || 3952 ((cur >= 'A') && (cur <= 'Z')) || 3953 ((cur >= '0') && (cur <= '9')) || 3954 (cur == 0x2D)) { 3955 NEXT; 3956 cur = CUR; 3957 } 3958 } 3959 type = XML_REGEXP_BLOCK_NAME; 3960 blockName = xmlStrndup(start, ctxt->cur - start); 3961 } else { 3962 ERROR("Unknown char property"); 3963 return; 3964 } 3965 if (ctxt->atom == NULL) { 3966 ctxt->atom = xmlRegNewAtom(ctxt, type); 3967 if (ctxt->atom != NULL) 3968 ctxt->atom->valuep = blockName; 3969 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3970 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3971 type, 0, 0, blockName); 3972 } 3973} 3974 3975/** 3976 * xmlFAParseCharClassEsc: 3977 * @ctxt: a regexp parser context 3978 * 3979 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 3980 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 3981 * [25] catEsc ::= '\p{' charProp '}' 3982 * [26] complEsc ::= '\P{' charProp '}' 3983 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 3984 */ 3985static void 3986xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 3987 int cur; 3988 3989 if (CUR == '.') { 3990 if (ctxt->atom == NULL) { 3991 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 3992 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3993 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3994 XML_REGEXP_ANYCHAR, 0, 0, NULL); 3995 } 3996 NEXT; 3997 return; 3998 } 3999 if (CUR != '\\') { 4000 ERROR("Escaped sequence: expecting \\"); 4001 return; 4002 } 4003 NEXT; 4004 cur = CUR; 4005 if (cur == 'p') { 4006 NEXT; 4007 if (CUR != '{') { 4008 ERROR("Expecting '{'"); 4009 return; 4010 } 4011 NEXT; 4012 xmlFAParseCharProp(ctxt); 4013 if (CUR != '}') { 4014 ERROR("Expecting '}'"); 4015 return; 4016 } 4017 NEXT; 4018 } else if (cur == 'P') { 4019 NEXT; 4020 if (CUR != '{') { 4021 ERROR("Expecting '{'"); 4022 return; 4023 } 4024 NEXT; 4025 xmlFAParseCharProp(ctxt); 4026 ctxt->atom->neg = 1; 4027 if (CUR != '}') { 4028 ERROR("Expecting '}'"); 4029 return; 4030 } 4031 NEXT; 4032 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4033 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4034 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4035 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4036 (cur == 0x5E)) { 4037 if (ctxt->atom == NULL) { 4038 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4039 if (ctxt->atom != NULL) { 4040 switch (cur) { 4041 case 'n': 4042 ctxt->atom->codepoint = '\n'; 4043 break; 4044 case 'r': 4045 ctxt->atom->codepoint = '\r'; 4046 break; 4047 case 't': 4048 ctxt->atom->codepoint = '\t'; 4049 break; 4050 default: 4051 ctxt->atom->codepoint = cur; 4052 } 4053 } 4054 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4055 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4056 XML_REGEXP_CHARVAL, cur, cur, NULL); 4057 } 4058 NEXT; 4059 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4060 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4061 (cur == 'w') || (cur == 'W')) { 4062 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4063 4064 switch (cur) { 4065 case 's': 4066 type = XML_REGEXP_ANYSPACE; 4067 break; 4068 case 'S': 4069 type = XML_REGEXP_NOTSPACE; 4070 break; 4071 case 'i': 4072 type = XML_REGEXP_INITNAME; 4073 break; 4074 case 'I': 4075 type = XML_REGEXP_NOTINITNAME; 4076 break; 4077 case 'c': 4078 type = XML_REGEXP_NAMECHAR; 4079 break; 4080 case 'C': 4081 type = XML_REGEXP_NOTNAMECHAR; 4082 break; 4083 case 'd': 4084 type = XML_REGEXP_DECIMAL; 4085 break; 4086 case 'D': 4087 type = XML_REGEXP_NOTDECIMAL; 4088 break; 4089 case 'w': 4090 type = XML_REGEXP_REALCHAR; 4091 break; 4092 case 'W': 4093 type = XML_REGEXP_NOTREALCHAR; 4094 break; 4095 } 4096 NEXT; 4097 if (ctxt->atom == NULL) { 4098 ctxt->atom = xmlRegNewAtom(ctxt, type); 4099 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4100 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4101 type, 0, 0, NULL); 4102 } 4103 } 4104} 4105 4106/** 4107 * xmlFAParseCharRef: 4108 * @ctxt: a regexp parser context 4109 * 4110 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 4111 */ 4112static int 4113xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 4114 int ret = 0, cur; 4115 4116 if ((CUR != '&') || (NXT(1) != '#')) 4117 return(-1); 4118 NEXT; 4119 NEXT; 4120 cur = CUR; 4121 if (cur == 'x') { 4122 NEXT; 4123 cur = CUR; 4124 if (((cur >= '0') && (cur <= '9')) || 4125 ((cur >= 'a') && (cur <= 'f')) || 4126 ((cur >= 'A') && (cur <= 'F'))) { 4127 while (((cur >= '0') && (cur <= '9')) || 4128 ((cur >= 'A') && (cur <= 'F'))) { 4129 if ((cur >= '0') && (cur <= '9')) 4130 ret = ret * 16 + cur - '0'; 4131 else if ((cur >= 'a') && (cur <= 'f')) 4132 ret = ret * 16 + 10 + (cur - 'a'); 4133 else 4134 ret = ret * 16 + 10 + (cur - 'A'); 4135 NEXT; 4136 cur = CUR; 4137 } 4138 } else { 4139 ERROR("Char ref: expecting [0-9A-F]"); 4140 return(-1); 4141 } 4142 } else { 4143 if ((cur >= '0') && (cur <= '9')) { 4144 while ((cur >= '0') && (cur <= '9')) { 4145 ret = ret * 10 + cur - '0'; 4146 NEXT; 4147 cur = CUR; 4148 } 4149 } else { 4150 ERROR("Char ref: expecting [0-9]"); 4151 return(-1); 4152 } 4153 } 4154 if (cur != ';') { 4155 ERROR("Char ref: expecting ';'"); 4156 return(-1); 4157 } else { 4158 NEXT; 4159 } 4160 return(ret); 4161} 4162 4163/** 4164 * xmlFAParseCharRange: 4165 * @ctxt: a regexp parser context 4166 * 4167 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4168 * [18] seRange ::= charOrEsc '-' charOrEsc 4169 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4170 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4171 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4172 */ 4173static void 4174xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4175 int cur, len; 4176 int start = -1; 4177 int end = -1; 4178 4179 if ((CUR == '&') && (NXT(1) == '#')) { 4180 end = start = xmlFAParseCharRef(ctxt); 4181 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4182 XML_REGEXP_CHARVAL, start, end, NULL); 4183 return; 4184 } 4185 cur = CUR; 4186 if (cur == '\\') { 4187 NEXT; 4188 cur = CUR; 4189 switch (cur) { 4190 case 'n': start = 0xA; break; 4191 case 'r': start = 0xD; break; 4192 case 't': start = 0x9; break; 4193 case '\\': case '|': case '.': case '-': case '^': case '?': 4194 case '*': case '+': case '{': case '}': case '(': case ')': 4195 case '[': case ']': 4196 start = cur; break; 4197 default: 4198 ERROR("Invalid escape value"); 4199 return; 4200 } 4201 end = start; 4202 len = 1; 4203 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4204 end = start = CUR_SCHAR(ctxt->cur, len); 4205 } else { 4206 ERROR("Expecting a char range"); 4207 return; 4208 } 4209 NEXTL(len); 4210 if (start == '-') { 4211 return; 4212 } 4213 cur = CUR; 4214 if ((cur != '-') || (NXT(1) == ']')) { 4215 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4216 XML_REGEXP_CHARVAL, start, end, NULL); 4217 return; 4218 } 4219 NEXT; 4220 cur = CUR; 4221 if (cur == '\\') { 4222 NEXT; 4223 cur = CUR; 4224 switch (cur) { 4225 case 'n': end = 0xA; break; 4226 case 'r': end = 0xD; break; 4227 case 't': end = 0x9; break; 4228 case '\\': case '|': case '.': case '-': case '^': case '?': 4229 case '*': case '+': case '{': case '}': case '(': case ')': 4230 case '[': case ']': 4231 end = cur; break; 4232 default: 4233 ERROR("Invalid escape value"); 4234 return; 4235 } 4236 len = 1; 4237 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4238 end = CUR_SCHAR(ctxt->cur, len); 4239 } else { 4240 ERROR("Expecting the end of a char range"); 4241 return; 4242 } 4243 NEXTL(len); 4244 /* TODO check that the values are acceptable character ranges for XML */ 4245 if (end < start) { 4246 ERROR("End of range is before start of range"); 4247 } else { 4248 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4249 XML_REGEXP_CHARVAL, start, end, NULL); 4250 } 4251 return; 4252} 4253 4254/** 4255 * xmlFAParsePosCharGroup: 4256 * @ctxt: a regexp parser context 4257 * 4258 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 4259 */ 4260static void 4261xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 4262 do { 4263 if ((CUR == '\\') || (CUR == '.')) { 4264 xmlFAParseCharClassEsc(ctxt); 4265 } else { 4266 xmlFAParseCharRange(ctxt); 4267 } 4268 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 4269 (ctxt->error == 0)); 4270} 4271 4272/** 4273 * xmlFAParseCharGroup: 4274 * @ctxt: a regexp parser context 4275 * 4276 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 4277 * [15] negCharGroup ::= '^' posCharGroup 4278 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 4279 * [12] charClassExpr ::= '[' charGroup ']' 4280 */ 4281static void 4282xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 4283 int n = ctxt->neg; 4284 while ((CUR != ']') && (ctxt->error == 0)) { 4285 if (CUR == '^') { 4286 int neg = ctxt->neg; 4287 4288 NEXT; 4289 ctxt->neg = !ctxt->neg; 4290 xmlFAParsePosCharGroup(ctxt); 4291 ctxt->neg = neg; 4292 } else if ((CUR == '-') && (NXT(1) == '[')) { 4293 int neg = ctxt->neg; 4294 ctxt->neg = 2; 4295 NEXT; /* eat the '-' */ 4296 NEXT; /* eat the '[' */ 4297 xmlFAParseCharGroup(ctxt); 4298 if (CUR == ']') { 4299 NEXT; 4300 } else { 4301 ERROR("charClassExpr: ']' expected"); 4302 break; 4303 } 4304 ctxt->neg = neg; 4305 break; 4306 } else if (CUR != ']') { 4307 xmlFAParsePosCharGroup(ctxt); 4308 } 4309 } 4310 ctxt->neg = n; 4311} 4312 4313/** 4314 * xmlFAParseCharClass: 4315 * @ctxt: a regexp parser context 4316 * 4317 * [11] charClass ::= charClassEsc | charClassExpr 4318 * [12] charClassExpr ::= '[' charGroup ']' 4319 */ 4320static void 4321xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 4322 if (CUR == '[') { 4323 NEXT; 4324 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 4325 if (ctxt->atom == NULL) 4326 return; 4327 xmlFAParseCharGroup(ctxt); 4328 if (CUR == ']') { 4329 NEXT; 4330 } else { 4331 ERROR("xmlFAParseCharClass: ']' expected"); 4332 } 4333 } else { 4334 xmlFAParseCharClassEsc(ctxt); 4335 } 4336} 4337 4338/** 4339 * xmlFAParseQuantExact: 4340 * @ctxt: a regexp parser context 4341 * 4342 * [8] QuantExact ::= [0-9]+ 4343 * 4344 * Returns 0 if success or -1 in case of error 4345 */ 4346static int 4347xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 4348 int ret = 0; 4349 int ok = 0; 4350 4351 while ((CUR >= '0') && (CUR <= '9')) { 4352 ret = ret * 10 + (CUR - '0'); 4353 ok = 1; 4354 NEXT; 4355 } 4356 if (ok != 1) { 4357 return(-1); 4358 } 4359 return(ret); 4360} 4361 4362/** 4363 * xmlFAParseQuantifier: 4364 * @ctxt: a regexp parser context 4365 * 4366 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 4367 * [5] quantity ::= quantRange | quantMin | QuantExact 4368 * [6] quantRange ::= QuantExact ',' QuantExact 4369 * [7] quantMin ::= QuantExact ',' 4370 * [8] QuantExact ::= [0-9]+ 4371 */ 4372static int 4373xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 4374 int cur; 4375 4376 cur = CUR; 4377 if ((cur == '?') || (cur == '*') || (cur == '+')) { 4378 if (ctxt->atom != NULL) { 4379 if (cur == '?') 4380 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 4381 else if (cur == '*') 4382 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 4383 else if (cur == '+') 4384 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 4385 } 4386 NEXT; 4387 return(1); 4388 } 4389 if (cur == '{') { 4390 int min = 0, max = 0; 4391 4392 NEXT; 4393 cur = xmlFAParseQuantExact(ctxt); 4394 if (cur >= 0) 4395 min = cur; 4396 if (CUR == ',') { 4397 NEXT; 4398 if (CUR == '}') 4399 max = INT_MAX; 4400 else { 4401 cur = xmlFAParseQuantExact(ctxt); 4402 if (cur >= 0) 4403 max = cur; 4404 else { 4405 ERROR("Improper quantifier"); 4406 } 4407 } 4408 } 4409 if (CUR == '}') { 4410 NEXT; 4411 } else { 4412 ERROR("Unterminated quantifier"); 4413 } 4414 if (max == 0) 4415 max = min; 4416 if (ctxt->atom != NULL) { 4417 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 4418 ctxt->atom->min = min; 4419 ctxt->atom->max = max; 4420 } 4421 return(1); 4422 } 4423 return(0); 4424} 4425 4426/** 4427 * xmlFAParseAtom: 4428 * @ctxt: a regexp parser context 4429 * 4430 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 4431 */ 4432static int 4433xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 4434 int codepoint, len; 4435 4436 codepoint = xmlFAIsChar(ctxt); 4437 if (codepoint > 0) { 4438 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4439 if (ctxt->atom == NULL) 4440 return(-1); 4441 codepoint = CUR_SCHAR(ctxt->cur, len); 4442 ctxt->atom->codepoint = codepoint; 4443 NEXTL(len); 4444 return(1); 4445 } else if (CUR == '|') { 4446 return(0); 4447 } else if (CUR == 0) { 4448 return(0); 4449 } else if (CUR == ')') { 4450 return(0); 4451 } else if (CUR == '(') { 4452 xmlRegStatePtr start, oldend; 4453 4454 NEXT; 4455 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 4456 start = ctxt->state; 4457 oldend = ctxt->end; 4458 ctxt->end = NULL; 4459 ctxt->atom = NULL; 4460 xmlFAParseRegExp(ctxt, 0); 4461 if (CUR == ')') { 4462 NEXT; 4463 } else { 4464 ERROR("xmlFAParseAtom: expecting ')'"); 4465 } 4466 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 4467 if (ctxt->atom == NULL) 4468 return(-1); 4469 ctxt->atom->start = start; 4470 ctxt->atom->stop = ctxt->state; 4471 ctxt->end = oldend; 4472 return(1); 4473 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 4474 xmlFAParseCharClass(ctxt); 4475 return(1); 4476 } 4477 return(0); 4478} 4479 4480/** 4481 * xmlFAParsePiece: 4482 * @ctxt: a regexp parser context 4483 * 4484 * [3] piece ::= atom quantifier? 4485 */ 4486static int 4487xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 4488 int ret; 4489 4490 ctxt->atom = NULL; 4491 ret = xmlFAParseAtom(ctxt); 4492 if (ret == 0) 4493 return(0); 4494 if (ctxt->atom == NULL) { 4495 ERROR("internal: no atom generated"); 4496 } 4497 xmlFAParseQuantifier(ctxt); 4498 return(1); 4499} 4500 4501/** 4502 * xmlFAParseBranch: 4503 * @ctxt: a regexp parser context 4504 * 4505 * [2] branch ::= piece* 4506 8 4507 */ 4508static int 4509xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) { 4510 xmlRegStatePtr previous; 4511 int ret; 4512 4513 previous = ctxt->state; 4514 ret = xmlFAParsePiece(ctxt); 4515 if (ret != 0) { 4516 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0) 4517 return(-1); 4518 previous = ctxt->state; 4519 ctxt->atom = NULL; 4520 } 4521 while ((ret != 0) && (ctxt->error == 0)) { 4522 ret = xmlFAParsePiece(ctxt); 4523 if (ret != 0) { 4524 if (xmlFAGenerateTransitions(ctxt, previous, NULL, 4525 ctxt->atom) < 0) 4526 return(-1); 4527 previous = ctxt->state; 4528 ctxt->atom = NULL; 4529 } 4530 } 4531 return(0); 4532} 4533 4534/** 4535 * xmlFAParseRegExp: 4536 * @ctxt: a regexp parser context 4537 * @top: is this the top-level expression ? 4538 * 4539 * [1] regExp ::= branch ( '|' branch )* 4540 */ 4541static void 4542xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 4543 xmlRegStatePtr start, end; 4544 4545 /* if not top start should have been generated by an epsilon trans */ 4546 start = ctxt->state; 4547 ctxt->end = NULL; 4548 xmlFAParseBranch(ctxt); 4549 if (top) { 4550#ifdef DEBUG_REGEXP_GRAPH 4551 printf("State %d is final\n", ctxt->state->no); 4552#endif 4553 ctxt->state->type = XML_REGEXP_FINAL_STATE; 4554 } 4555 if (CUR != '|') { 4556 ctxt->end = ctxt->state; 4557 return; 4558 } 4559 end = ctxt->state; 4560 while ((CUR == '|') && (ctxt->error == 0)) { 4561 NEXT; 4562 ctxt->state = start; 4563 ctxt->end = NULL; 4564 xmlFAParseBranch(ctxt); 4565 if (top) { 4566 ctxt->state->type = XML_REGEXP_FINAL_STATE; 4567#ifdef DEBUG_REGEXP_GRAPH 4568 printf("State %d is final\n", ctxt->state->no); 4569#endif 4570 } else { 4571 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end); 4572 } 4573 } 4574 if (!top) { 4575 ctxt->state = end; 4576 ctxt->end = end; 4577 } 4578} 4579 4580/************************************************************************ 4581 * * 4582 * The basic API * 4583 * * 4584 ************************************************************************/ 4585 4586/** 4587 * xmlRegexpPrint: 4588 * @output: the file for the output debug 4589 * @regexp: the compiled regexp 4590 * 4591 * Print the content of the compiled regular expression 4592 */ 4593void 4594xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 4595 int i; 4596 4597 if (output == NULL) 4598 return; 4599 fprintf(output, " regexp: "); 4600 if (regexp == NULL) { 4601 fprintf(output, "NULL\n"); 4602 return; 4603 } 4604 fprintf(output, "'%s' ", regexp->string); 4605 fprintf(output, "\n"); 4606 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 4607 for (i = 0;i < regexp->nbAtoms; i++) { 4608 fprintf(output, " %02d ", i); 4609 xmlRegPrintAtom(output, regexp->atoms[i]); 4610 } 4611 fprintf(output, "%d states:", regexp->nbStates); 4612 fprintf(output, "\n"); 4613 for (i = 0;i < regexp->nbStates; i++) { 4614 xmlRegPrintState(output, regexp->states[i]); 4615 } 4616 fprintf(output, "%d counters:\n", regexp->nbCounters); 4617 for (i = 0;i < regexp->nbCounters; i++) { 4618 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 4619 regexp->counters[i].max); 4620 } 4621} 4622 4623/** 4624 * xmlRegexpCompile: 4625 * @regexp: a regular expression string 4626 * 4627 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 4628 * Appendix F and builds an automata suitable for testing strings against 4629 * that regular expression 4630 * 4631 * Returns the compiled expression or NULL in case of error 4632 */ 4633xmlRegexpPtr 4634xmlRegexpCompile(const xmlChar *regexp) { 4635 xmlRegexpPtr ret; 4636 xmlRegParserCtxtPtr ctxt; 4637 4638 ctxt = xmlRegNewParserCtxt(regexp); 4639 if (ctxt == NULL) 4640 return(NULL); 4641 4642 /* initialize the parser */ 4643 ctxt->end = NULL; 4644 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 4645 xmlRegStatePush(ctxt, ctxt->start); 4646 4647 /* parse the expression building an automata */ 4648 xmlFAParseRegExp(ctxt, 1); 4649 if (CUR != 0) { 4650 ERROR("xmlFAParseRegExp: extra characters"); 4651 } 4652 ctxt->end = ctxt->state; 4653 ctxt->start->type = XML_REGEXP_START_STATE; 4654 ctxt->end->type = XML_REGEXP_FINAL_STATE; 4655 4656 /* remove the Epsilon except for counted transitions */ 4657 xmlFAEliminateEpsilonTransitions(ctxt); 4658 4659 4660 if (ctxt->error != 0) { 4661 xmlRegFreeParserCtxt(ctxt); 4662 return(NULL); 4663 } 4664 ret = xmlRegEpxFromParse(ctxt); 4665 xmlRegFreeParserCtxt(ctxt); 4666 return(ret); 4667} 4668 4669/** 4670 * xmlRegexpExec: 4671 * @comp: the compiled regular expression 4672 * @content: the value to check against the regular expression 4673 * 4674 * Check if the regular expression generates the value 4675 * 4676 * Returns 1 if it matches, 0 if not and a negative value in case of error 4677 */ 4678int 4679xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 4680 if ((comp == NULL) || (content == NULL)) 4681 return(-1); 4682 return(xmlFARegExec(comp, content)); 4683} 4684 4685/** 4686 * xmlRegexpIsDeterminist: 4687 * @comp: the compiled regular expression 4688 * 4689 * Check if the regular expression is determinist 4690 * 4691 * Returns 1 if it yes, 0 if not and a negative value in case of error 4692 */ 4693int 4694xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 4695 xmlAutomataPtr am; 4696 int ret; 4697 4698 if (comp == NULL) 4699 return(-1); 4700 if (comp->determinist != -1) 4701 return(comp->determinist); 4702 4703 am = xmlNewAutomata(); 4704 if (am->states != NULL) { 4705 int i; 4706 4707 for (i = 0;i < am->nbStates;i++) 4708 xmlRegFreeState(am->states[i]); 4709 xmlFree(am->states); 4710 } 4711 am->nbAtoms = comp->nbAtoms; 4712 am->atoms = comp->atoms; 4713 am->nbStates = comp->nbStates; 4714 am->states = comp->states; 4715 am->determinist = -1; 4716 ret = xmlFAComputesDeterminism(am); 4717 am->atoms = NULL; 4718 am->states = NULL; 4719 xmlFreeAutomata(am); 4720 return(ret); 4721} 4722 4723/** 4724 * xmlRegFreeRegexp: 4725 * @regexp: the regexp 4726 * 4727 * Free a regexp 4728 */ 4729void 4730xmlRegFreeRegexp(xmlRegexpPtr regexp) { 4731 int i; 4732 if (regexp == NULL) 4733 return; 4734 4735 if (regexp->string != NULL) 4736 xmlFree(regexp->string); 4737 if (regexp->states != NULL) { 4738 for (i = 0;i < regexp->nbStates;i++) 4739 xmlRegFreeState(regexp->states[i]); 4740 xmlFree(regexp->states); 4741 } 4742 if (regexp->atoms != NULL) { 4743 for (i = 0;i < regexp->nbAtoms;i++) 4744 xmlRegFreeAtom(regexp->atoms[i]); 4745 xmlFree(regexp->atoms); 4746 } 4747 if (regexp->counters != NULL) 4748 xmlFree(regexp->counters); 4749 if (regexp->compact != NULL) 4750 xmlFree(regexp->compact); 4751 if (regexp->transdata != NULL) 4752 xmlFree(regexp->transdata); 4753 if (regexp->stringMap != NULL) { 4754 for (i = 0; i < regexp->nbstrings;i++) 4755 xmlFree(regexp->stringMap[i]); 4756 xmlFree(regexp->stringMap); 4757 } 4758 4759 xmlFree(regexp); 4760} 4761 4762#ifdef LIBXML_AUTOMATA_ENABLED 4763/************************************************************************ 4764 * * 4765 * The Automata interface * 4766 * * 4767 ************************************************************************/ 4768 4769/** 4770 * xmlNewAutomata: 4771 * 4772 * Create a new automata 4773 * 4774 * Returns the new object or NULL in case of failure 4775 */ 4776xmlAutomataPtr 4777xmlNewAutomata(void) { 4778 xmlAutomataPtr ctxt; 4779 4780 ctxt = xmlRegNewParserCtxt(NULL); 4781 if (ctxt == NULL) 4782 return(NULL); 4783 4784 /* initialize the parser */ 4785 ctxt->end = NULL; 4786 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 4787 ctxt->start->type = XML_REGEXP_START_STATE; 4788 if (ctxt->start == NULL) { 4789 xmlFreeAutomata(ctxt); 4790 return(NULL); 4791 } 4792 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 4793 xmlRegFreeState(ctxt->start); 4794 xmlFreeAutomata(ctxt); 4795 return(NULL); 4796 } 4797 4798 return(ctxt); 4799} 4800 4801/** 4802 * xmlFreeAutomata: 4803 * @am: an automata 4804 * 4805 * Free an automata 4806 */ 4807void 4808xmlFreeAutomata(xmlAutomataPtr am) { 4809 if (am == NULL) 4810 return; 4811 xmlRegFreeParserCtxt(am); 4812} 4813 4814/** 4815 * xmlAutomataGetInitState: 4816 * @am: an automata 4817 * 4818 * Initial state lookup 4819 * 4820 * Returns the initial state of the automata 4821 */ 4822xmlAutomataStatePtr 4823xmlAutomataGetInitState(xmlAutomataPtr am) { 4824 if (am == NULL) 4825 return(NULL); 4826 return(am->start); 4827} 4828 4829/** 4830 * xmlAutomataSetFinalState: 4831 * @am: an automata 4832 * @state: a state in this automata 4833 * 4834 * Makes that state a final state 4835 * 4836 * Returns 0 or -1 in case of error 4837 */ 4838int 4839xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 4840 if ((am == NULL) || (state == NULL)) 4841 return(-1); 4842 state->type = XML_REGEXP_FINAL_STATE; 4843 return(0); 4844} 4845 4846/** 4847 * xmlAutomataNewTransition: 4848 * @am: an automata 4849 * @from: the starting point of the transition 4850 * @to: the target point of the transition or NULL 4851 * @token: the input string associated to that transition 4852 * @data: data passed to the callback function if the transition is activated 4853 * 4854 * If @to is NULL, this creates first a new target state in the automata 4855 * and then adds a transition from the @from state to the target state 4856 * activated by the value of @token 4857 * 4858 * Returns the target state or NULL in case of error 4859 */ 4860xmlAutomataStatePtr 4861xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 4862 xmlAutomataStatePtr to, const xmlChar *token, 4863 void *data) { 4864 xmlRegAtomPtr atom; 4865 4866 if ((am == NULL) || (from == NULL) || (token == NULL)) 4867 return(NULL); 4868 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4869 if (atom == NULL) 4870 return(NULL); 4871 atom->data = data; 4872 if (atom == NULL) 4873 return(NULL); 4874 atom->valuep = xmlStrdup(token); 4875 4876 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 4877 xmlRegFreeAtom(atom); 4878 return(NULL); 4879 } 4880 if (to == NULL) 4881 return(am->state); 4882 return(to); 4883} 4884 4885/** 4886 * xmlAutomataNewTransition2: 4887 * @am: an automata 4888 * @from: the starting point of the transition 4889 * @to: the target point of the transition or NULL 4890 * @token: the first input string associated to that transition 4891 * @token2: the second input string associated to that transition 4892 * @data: data passed to the callback function if the transition is activated 4893 * 4894 * If @to is NULL, this creates first a new target state in the automata 4895 * and then adds a transition from the @from state to the target state 4896 * activated by the value of @token 4897 * 4898 * Returns the target state or NULL in case of error 4899 */ 4900xmlAutomataStatePtr 4901xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 4902 xmlAutomataStatePtr to, const xmlChar *token, 4903 const xmlChar *token2, void *data) { 4904 xmlRegAtomPtr atom; 4905 4906 if ((am == NULL) || (from == NULL) || (token == NULL)) 4907 return(NULL); 4908 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4909 atom->data = data; 4910 if (atom == NULL) 4911 return(NULL); 4912 if ((token2 == NULL) || (*token2 == 0)) { 4913 atom->valuep = xmlStrdup(token); 4914 } else { 4915 int lenn, lenp; 4916 xmlChar *str; 4917 4918 lenn = strlen((char *) token2); 4919 lenp = strlen((char *) token); 4920 4921 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4922 if (str == NULL) { 4923 xmlRegFreeAtom(atom); 4924 return(NULL); 4925 } 4926 memcpy(&str[0], token, lenp); 4927 str[lenp] = '|'; 4928 memcpy(&str[lenp + 1], token2, lenn); 4929 str[lenn + lenp + 1] = 0; 4930 4931 atom->valuep = str; 4932 } 4933 4934 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 4935 xmlRegFreeAtom(atom); 4936 return(NULL); 4937 } 4938 if (to == NULL) 4939 return(am->state); 4940 return(to); 4941} 4942 4943/** 4944 * xmlAutomataNewNegTrans: 4945 * @am: an automata 4946 * @from: the starting point of the transition 4947 * @to: the target point of the transition or NULL 4948 * @token: the first input string associated to that transition 4949 * @token2: the second input string associated to that transition 4950 * @data: data passed to the callback function if the transition is activated 4951 * 4952 * If @to is NULL, this creates first a new target state in the automata 4953 * and then adds a transition from the @from state to the target state 4954 * activated by any value except (@token,@token2) 4955 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 4956 # the semantic of XSD ##other 4957 * 4958 * Returns the target state or NULL in case of error 4959 */ 4960xmlAutomataStatePtr 4961xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4962 xmlAutomataStatePtr to, const xmlChar *token, 4963 const xmlChar *token2, void *data) { 4964 xmlRegAtomPtr atom; 4965 xmlChar err_msg[200]; 4966 4967 if ((am == NULL) || (from == NULL) || (token == NULL)) 4968 return(NULL); 4969 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4970 if (atom == NULL) 4971 return(NULL); 4972 atom->data = data; 4973 atom->neg = 1; 4974 if ((token2 == NULL) || (*token2 == 0)) { 4975 atom->valuep = xmlStrdup(token); 4976 } else { 4977 int lenn, lenp; 4978 xmlChar *str; 4979 4980 lenn = strlen((char *) token2); 4981 lenp = strlen((char *) token); 4982 4983 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4984 if (str == NULL) { 4985 xmlRegFreeAtom(atom); 4986 return(NULL); 4987 } 4988 memcpy(&str[0], token, lenp); 4989 str[lenp] = '|'; 4990 memcpy(&str[lenp + 1], token2, lenn); 4991 str[lenn + lenp + 1] = 0; 4992 4993 atom->valuep = str; 4994 } 4995 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 4996 err_msg[199] = 0; 4997 atom->valuep2 = xmlStrdup(err_msg); 4998 4999 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5000 xmlRegFreeAtom(atom); 5001 return(NULL); 5002 } 5003 am->negs++; 5004 if (to == NULL) 5005 return(am->state); 5006 return(to); 5007} 5008 5009/** 5010 * xmlAutomataNewCountTrans2: 5011 * @am: an automata 5012 * @from: the starting point of the transition 5013 * @to: the target point of the transition or NULL 5014 * @token: the input string associated to that transition 5015 * @token2: the second input string associated to that transition 5016 * @min: the minimum successive occurences of token 5017 * @max: the maximum successive occurences of token 5018 * @data: data associated to the transition 5019 * 5020 * If @to is NULL, this creates first a new target state in the automata 5021 * and then adds a transition from the @from state to the target state 5022 * activated by a succession of input of value @token and @token2 and 5023 * whose number is between @min and @max 5024 * 5025 * Returns the target state or NULL in case of error 5026 */ 5027xmlAutomataStatePtr 5028xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5029 xmlAutomataStatePtr to, const xmlChar *token, 5030 const xmlChar *token2, 5031 int min, int max, void *data) { 5032 xmlRegAtomPtr atom; 5033 int counter; 5034 5035 if ((am == NULL) || (from == NULL) || (token == NULL)) 5036 return(NULL); 5037 if (min < 0) 5038 return(NULL); 5039 if ((max < min) || (max < 1)) 5040 return(NULL); 5041 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5042 if (atom == NULL) 5043 return(NULL); 5044 if ((token2 == NULL) || (*token2 == 0)) { 5045 atom->valuep = xmlStrdup(token); 5046 } else { 5047 int lenn, lenp; 5048 xmlChar *str; 5049 5050 lenn = strlen((char *) token2); 5051 lenp = strlen((char *) token); 5052 5053 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5054 if (str == NULL) { 5055 xmlRegFreeAtom(atom); 5056 return(NULL); 5057 } 5058 memcpy(&str[0], token, lenp); 5059 str[lenp] = '|'; 5060 memcpy(&str[lenp + 1], token2, lenn); 5061 str[lenn + lenp + 1] = 0; 5062 5063 atom->valuep = str; 5064 } 5065 atom->data = data; 5066 if (min == 0) 5067 atom->min = 1; 5068 else 5069 atom->min = min; 5070 atom->max = max; 5071 5072 /* 5073 * associate a counter to the transition. 5074 */ 5075 counter = xmlRegGetCounter(am); 5076 am->counters[counter].min = min; 5077 am->counters[counter].max = max; 5078 5079 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5080 if (to == NULL) { 5081 to = xmlRegNewState(am); 5082 xmlRegStatePush(am, to); 5083 } 5084 xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); 5085 xmlRegAtomPush(am, atom); 5086 am->state = to; 5087 5088 if (to == NULL) 5089 to = am->state; 5090 if (to == NULL) 5091 return(NULL); 5092 if (min == 0) 5093 xmlFAGenerateEpsilonTransition(am, from, to); 5094 return(to); 5095} 5096 5097/** 5098 * xmlAutomataNewCountTrans: 5099 * @am: an automata 5100 * @from: the starting point of the transition 5101 * @to: the target point of the transition or NULL 5102 * @token: the input string associated to that transition 5103 * @min: the minimum successive occurences of token 5104 * @max: the maximum successive occurences of token 5105 * @data: data associated to the transition 5106 * 5107 * If @to is NULL, this creates first a new target state in the automata 5108 * and then adds a transition from the @from state to the target state 5109 * activated by a succession of input of value @token and whose number 5110 * is between @min and @max 5111 * 5112 * Returns the target state or NULL in case of error 5113 */ 5114xmlAutomataStatePtr 5115xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5116 xmlAutomataStatePtr to, const xmlChar *token, 5117 int min, int max, void *data) { 5118 xmlRegAtomPtr atom; 5119 int counter; 5120 5121 if ((am == NULL) || (from == NULL) || (token == NULL)) 5122 return(NULL); 5123 if (min < 0) 5124 return(NULL); 5125 if ((max < min) || (max < 1)) 5126 return(NULL); 5127 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5128 if (atom == NULL) 5129 return(NULL); 5130 atom->valuep = xmlStrdup(token); 5131 atom->data = data; 5132 if (min == 0) 5133 atom->min = 1; 5134 else 5135 atom->min = min; 5136 atom->max = max; 5137 5138 /* 5139 * associate a counter to the transition. 5140 */ 5141 counter = xmlRegGetCounter(am); 5142 am->counters[counter].min = min; 5143 am->counters[counter].max = max; 5144 5145 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5146 if (to == NULL) { 5147 to = xmlRegNewState(am); 5148 xmlRegStatePush(am, to); 5149 } 5150 xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); 5151 xmlRegAtomPush(am, atom); 5152 am->state = to; 5153 5154 if (to == NULL) 5155 to = am->state; 5156 if (to == NULL) 5157 return(NULL); 5158 if (min == 0) 5159 xmlFAGenerateEpsilonTransition(am, from, to); 5160 return(to); 5161} 5162 5163/** 5164 * xmlAutomataNewOnceTrans2: 5165 * @am: an automata 5166 * @from: the starting point of the transition 5167 * @to: the target point of the transition or NULL 5168 * @token: the input string associated to that transition 5169 * @token2: the second input string associated to that transition 5170 * @min: the minimum successive occurences of token 5171 * @max: the maximum successive occurences of token 5172 * @data: data associated to the transition 5173 * 5174 * If @to is NULL, this creates first a new target state in the automata 5175 * and then adds a transition from the @from state to the target state 5176 * activated by a succession of input of value @token and @token2 and whose 5177 * number is between @min and @max, moreover that transition can only be 5178 * crossed once. 5179 * 5180 * Returns the target state or NULL in case of error 5181 */ 5182xmlAutomataStatePtr 5183xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5184 xmlAutomataStatePtr to, const xmlChar *token, 5185 const xmlChar *token2, 5186 int min, int max, void *data) { 5187 xmlRegAtomPtr atom; 5188 int counter; 5189 5190 if ((am == NULL) || (from == NULL) || (token == NULL)) 5191 return(NULL); 5192 if (min < 1) 5193 return(NULL); 5194 if ((max < min) || (max < 1)) 5195 return(NULL); 5196 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5197 if (atom == NULL) 5198 return(NULL); 5199 if ((token2 == NULL) || (*token2 == 0)) { 5200 atom->valuep = xmlStrdup(token); 5201 } else { 5202 int lenn, lenp; 5203 xmlChar *str; 5204 5205 lenn = strlen((char *) token2); 5206 lenp = strlen((char *) token); 5207 5208 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5209 if (str == NULL) { 5210 xmlRegFreeAtom(atom); 5211 return(NULL); 5212 } 5213 memcpy(&str[0], token, lenp); 5214 str[lenp] = '|'; 5215 memcpy(&str[lenp + 1], token2, lenn); 5216 str[lenn + lenp + 1] = 0; 5217 5218 atom->valuep = str; 5219 } 5220 atom->data = data; 5221 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5222 if (min == 0) 5223 atom->min = 1; 5224 else 5225 atom->min = min; 5226 atom->max = max; 5227 /* 5228 * associate a counter to the transition. 5229 */ 5230 counter = xmlRegGetCounter(am); 5231 am->counters[counter].min = 1; 5232 am->counters[counter].max = 1; 5233 5234 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5235 if (to == NULL) { 5236 to = xmlRegNewState(am); 5237 xmlRegStatePush(am, to); 5238 } 5239 xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); 5240 xmlRegAtomPush(am, atom); 5241 am->state = to; 5242 return(to); 5243} 5244 5245 5246 5247/** 5248 * xmlAutomataNewOnceTrans: 5249 * @am: an automata 5250 * @from: the starting point of the transition 5251 * @to: the target point of the transition or NULL 5252 * @token: the input string associated to that transition 5253 * @min: the minimum successive occurences of token 5254 * @max: the maximum successive occurences of token 5255 * @data: data associated to the transition 5256 * 5257 * If @to is NULL, this creates first a new target state in the automata 5258 * and then adds a transition from the @from state to the target state 5259 * activated by a succession of input of value @token and whose number 5260 * is between @min and @max, moreover that transition can only be crossed 5261 * once. 5262 * 5263 * Returns the target state or NULL in case of error 5264 */ 5265xmlAutomataStatePtr 5266xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5267 xmlAutomataStatePtr to, const xmlChar *token, 5268 int min, int max, void *data) { 5269 xmlRegAtomPtr atom; 5270 int counter; 5271 5272 if ((am == NULL) || (from == NULL) || (token == NULL)) 5273 return(NULL); 5274 if (min < 1) 5275 return(NULL); 5276 if ((max < min) || (max < 1)) 5277 return(NULL); 5278 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5279 if (atom == NULL) 5280 return(NULL); 5281 atom->valuep = xmlStrdup(token); 5282 atom->data = data; 5283 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5284 if (min == 0) 5285 atom->min = 1; 5286 else 5287 atom->min = min; 5288 atom->max = max; 5289 /* 5290 * associate a counter to the transition. 5291 */ 5292 counter = xmlRegGetCounter(am); 5293 am->counters[counter].min = 1; 5294 am->counters[counter].max = 1; 5295 5296 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5297 if (to == NULL) { 5298 to = xmlRegNewState(am); 5299 xmlRegStatePush(am, to); 5300 } 5301 xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); 5302 xmlRegAtomPush(am, atom); 5303 am->state = to; 5304 return(to); 5305} 5306 5307/** 5308 * xmlAutomataNewState: 5309 * @am: an automata 5310 * 5311 * Create a new disconnected state in the automata 5312 * 5313 * Returns the new state or NULL in case of error 5314 */ 5315xmlAutomataStatePtr 5316xmlAutomataNewState(xmlAutomataPtr am) { 5317 xmlAutomataStatePtr to; 5318 5319 if (am == NULL) 5320 return(NULL); 5321 to = xmlRegNewState(am); 5322 xmlRegStatePush(am, to); 5323 return(to); 5324} 5325 5326/** 5327 * xmlAutomataNewEpsilon: 5328 * @am: an automata 5329 * @from: the starting point of the transition 5330 * @to: the target point of the transition or NULL 5331 * 5332 * If @to is NULL, this creates first a new target state in the automata 5333 * and then adds an epsilon transition from the @from state to the 5334 * target state 5335 * 5336 * Returns the target state or NULL in case of error 5337 */ 5338xmlAutomataStatePtr 5339xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 5340 xmlAutomataStatePtr to) { 5341 if ((am == NULL) || (from == NULL)) 5342 return(NULL); 5343 xmlFAGenerateEpsilonTransition(am, from, to); 5344 if (to == NULL) 5345 return(am->state); 5346 return(to); 5347} 5348 5349/** 5350 * xmlAutomataNewAllTrans: 5351 * @am: an automata 5352 * @from: the starting point of the transition 5353 * @to: the target point of the transition or NULL 5354 * @lax: allow to transition if not all all transitions have been activated 5355 * 5356 * If @to is NULL, this creates first a new target state in the automata 5357 * and then adds a an ALL transition from the @from state to the 5358 * target state. That transition is an epsilon transition allowed only when 5359 * all transitions from the @from node have been activated. 5360 * 5361 * Returns the target state or NULL in case of error 5362 */ 5363xmlAutomataStatePtr 5364xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5365 xmlAutomataStatePtr to, int lax) { 5366 if ((am == NULL) || (from == NULL)) 5367 return(NULL); 5368 xmlFAGenerateAllTransition(am, from, to, lax); 5369 if (to == NULL) 5370 return(am->state); 5371 return(to); 5372} 5373 5374/** 5375 * xmlAutomataNewCounter: 5376 * @am: an automata 5377 * @min: the minimal value on the counter 5378 * @max: the maximal value on the counter 5379 * 5380 * Create a new counter 5381 * 5382 * Returns the counter number or -1 in case of error 5383 */ 5384int 5385xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 5386 int ret; 5387 5388 if (am == NULL) 5389 return(-1); 5390 5391 ret = xmlRegGetCounter(am); 5392 if (ret < 0) 5393 return(-1); 5394 am->counters[ret].min = min; 5395 am->counters[ret].max = max; 5396 return(ret); 5397} 5398 5399/** 5400 * xmlAutomataNewCountedTrans: 5401 * @am: an automata 5402 * @from: the starting point of the transition 5403 * @to: the target point of the transition or NULL 5404 * @counter: the counter associated to that transition 5405 * 5406 * If @to is NULL, this creates first a new target state in the automata 5407 * and then adds an epsilon transition from the @from state to the target state 5408 * which will increment the counter provided 5409 * 5410 * Returns the target state or NULL in case of error 5411 */ 5412xmlAutomataStatePtr 5413xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5414 xmlAutomataStatePtr to, int counter) { 5415 if ((am == NULL) || (from == NULL) || (counter < 0)) 5416 return(NULL); 5417 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 5418 if (to == NULL) 5419 return(am->state); 5420 return(to); 5421} 5422 5423/** 5424 * xmlAutomataNewCounterTrans: 5425 * @am: an automata 5426 * @from: the starting point of the transition 5427 * @to: the target point of the transition or NULL 5428 * @counter: the counter associated to that transition 5429 * 5430 * If @to is NULL, this creates first a new target state in the automata 5431 * and then adds an epsilon transition from the @from state to the target state 5432 * which will be allowed only if the counter is within the right range. 5433 * 5434 * Returns the target state or NULL in case of error 5435 */ 5436xmlAutomataStatePtr 5437xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5438 xmlAutomataStatePtr to, int counter) { 5439 if ((am == NULL) || (from == NULL) || (counter < 0)) 5440 return(NULL); 5441 xmlFAGenerateCountedTransition(am, from, to, counter); 5442 if (to == NULL) 5443 return(am->state); 5444 return(to); 5445} 5446 5447/** 5448 * xmlAutomataCompile: 5449 * @am: an automata 5450 * 5451 * Compile the automata into a Reg Exp ready for being executed. 5452 * The automata should be free after this point. 5453 * 5454 * Returns the compiled regexp or NULL in case of error 5455 */ 5456xmlRegexpPtr 5457xmlAutomataCompile(xmlAutomataPtr am) { 5458 xmlRegexpPtr ret; 5459 5460 if ((am == NULL) || (am->error != 0)) return(NULL); 5461 xmlFAEliminateEpsilonTransitions(am); 5462 /* xmlFAComputesDeterminism(am); */ 5463 ret = xmlRegEpxFromParse(am); 5464 5465 return(ret); 5466} 5467 5468/** 5469 * xmlAutomataIsDeterminist: 5470 * @am: an automata 5471 * 5472 * Checks if an automata is determinist. 5473 * 5474 * Returns 1 if true, 0 if not, and -1 in case of error 5475 */ 5476int 5477xmlAutomataIsDeterminist(xmlAutomataPtr am) { 5478 int ret; 5479 5480 if (am == NULL) 5481 return(-1); 5482 5483 ret = xmlFAComputesDeterminism(am); 5484 return(ret); 5485} 5486#endif /* LIBXML_AUTOMATA_ENABLED */ 5487#define bottom_xmlregexp 5488#include "elfgcchack.h" 5489#endif /* LIBXML_REGEXP_ENABLED */ 5490