xmlregexp.c revision aa62201290b6f3f3305f0fff504faf89c286a360
1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22/* #define DEBUG_ERR */ 23 24#include <stdio.h> 25#include <string.h> 26#ifdef HAVE_LIMITS_H 27#include <limits.h> 28#endif 29 30#include <libxml/tree.h> 31#include <libxml/parserInternals.h> 32#include <libxml/xmlregexp.h> 33#include <libxml/xmlautomata.h> 34#include <libxml/xmlunicode.h> 35 36#ifndef INT_MAX 37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38#endif 39 40/* #define DEBUG_REGEXP_GRAPH */ 41/* #define DEBUG_REGEXP_EXEC */ 42/* #define DEBUG_PUSH */ 43/* #define DEBUG_COMPACTION */ 44 45#define MAX_PUSH 10000000 46 47#define ERROR(str) \ 48 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 49 xmlRegexpErrCompile(ctxt, str); 50#define NEXT ctxt->cur++ 51#define CUR (*(ctxt->cur)) 52#define NXT(index) (ctxt->cur[index]) 53 54#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 55#define NEXTL(l) ctxt->cur += l; 56#define XML_REG_STRING_SEPARATOR '|' 57 58/** 59 * TODO: 60 * 61 * macro to flag unimplemented blocks 62 */ 63#define TODO \ 64 xmlGenericError(xmlGenericErrorContext, \ 65 "Unimplemented block at %s:%d\n", \ 66 __FILE__, __LINE__); 67 68/************************************************************************ 69 * * 70 * Datatypes and structures * 71 * * 72 ************************************************************************/ 73 74typedef enum { 75 XML_REGEXP_EPSILON = 1, 76 XML_REGEXP_CHARVAL, 77 XML_REGEXP_RANGES, 78 XML_REGEXP_SUBREG, /* used for () sub regexps */ 79 XML_REGEXP_STRING, 80 XML_REGEXP_ANYCHAR, /* . */ 81 XML_REGEXP_ANYSPACE, /* \s */ 82 XML_REGEXP_NOTSPACE, /* \S */ 83 XML_REGEXP_INITNAME, /* \l */ 84 XML_REGEXP_NOTINITNAME, /* \L */ 85 XML_REGEXP_NAMECHAR, /* \c */ 86 XML_REGEXP_NOTNAMECHAR, /* \C */ 87 XML_REGEXP_DECIMAL, /* \d */ 88 XML_REGEXP_NOTDECIMAL, /* \D */ 89 XML_REGEXP_REALCHAR, /* \w */ 90 XML_REGEXP_NOTREALCHAR, /* \W */ 91 XML_REGEXP_LETTER = 100, 92 XML_REGEXP_LETTER_UPPERCASE, 93 XML_REGEXP_LETTER_LOWERCASE, 94 XML_REGEXP_LETTER_TITLECASE, 95 XML_REGEXP_LETTER_MODIFIER, 96 XML_REGEXP_LETTER_OTHERS, 97 XML_REGEXP_MARK, 98 XML_REGEXP_MARK_NONSPACING, 99 XML_REGEXP_MARK_SPACECOMBINING, 100 XML_REGEXP_MARK_ENCLOSING, 101 XML_REGEXP_NUMBER, 102 XML_REGEXP_NUMBER_DECIMAL, 103 XML_REGEXP_NUMBER_LETTER, 104 XML_REGEXP_NUMBER_OTHERS, 105 XML_REGEXP_PUNCT, 106 XML_REGEXP_PUNCT_CONNECTOR, 107 XML_REGEXP_PUNCT_DASH, 108 XML_REGEXP_PUNCT_OPEN, 109 XML_REGEXP_PUNCT_CLOSE, 110 XML_REGEXP_PUNCT_INITQUOTE, 111 XML_REGEXP_PUNCT_FINQUOTE, 112 XML_REGEXP_PUNCT_OTHERS, 113 XML_REGEXP_SEPAR, 114 XML_REGEXP_SEPAR_SPACE, 115 XML_REGEXP_SEPAR_LINE, 116 XML_REGEXP_SEPAR_PARA, 117 XML_REGEXP_SYMBOL, 118 XML_REGEXP_SYMBOL_MATH, 119 XML_REGEXP_SYMBOL_CURRENCY, 120 XML_REGEXP_SYMBOL_MODIFIER, 121 XML_REGEXP_SYMBOL_OTHERS, 122 XML_REGEXP_OTHER, 123 XML_REGEXP_OTHER_CONTROL, 124 XML_REGEXP_OTHER_FORMAT, 125 XML_REGEXP_OTHER_PRIVATE, 126 XML_REGEXP_OTHER_NA, 127 XML_REGEXP_BLOCK_NAME 128} xmlRegAtomType; 129 130typedef enum { 131 XML_REGEXP_QUANT_EPSILON = 1, 132 XML_REGEXP_QUANT_ONCE, 133 XML_REGEXP_QUANT_OPT, 134 XML_REGEXP_QUANT_MULT, 135 XML_REGEXP_QUANT_PLUS, 136 XML_REGEXP_QUANT_ONCEONLY, 137 XML_REGEXP_QUANT_ALL, 138 XML_REGEXP_QUANT_RANGE 139} xmlRegQuantType; 140 141typedef enum { 142 XML_REGEXP_START_STATE = 1, 143 XML_REGEXP_FINAL_STATE, 144 XML_REGEXP_TRANS_STATE, 145 XML_REGEXP_SINK_STATE 146} xmlRegStateType; 147 148typedef enum { 149 XML_REGEXP_MARK_NORMAL = 0, 150 XML_REGEXP_MARK_START, 151 XML_REGEXP_MARK_VISITED 152} xmlRegMarkedType; 153 154typedef struct _xmlRegRange xmlRegRange; 155typedef xmlRegRange *xmlRegRangePtr; 156 157struct _xmlRegRange { 158 int neg; /* 0 normal, 1 not, 2 exclude */ 159 xmlRegAtomType type; 160 int start; 161 int end; 162 xmlChar *blockName; 163}; 164 165typedef struct _xmlRegAtom xmlRegAtom; 166typedef xmlRegAtom *xmlRegAtomPtr; 167 168typedef struct _xmlAutomataState xmlRegState; 169typedef xmlRegState *xmlRegStatePtr; 170 171struct _xmlRegAtom { 172 int no; 173 xmlRegAtomType type; 174 xmlRegQuantType quant; 175 int min; 176 int max; 177 178 void *valuep; 179 void *valuep2; 180 int neg; 181 int codepoint; 182 xmlRegStatePtr start; 183 xmlRegStatePtr stop; 184 int maxRanges; 185 int nbRanges; 186 xmlRegRangePtr *ranges; 187 void *data; 188}; 189 190typedef struct _xmlRegCounter xmlRegCounter; 191typedef xmlRegCounter *xmlRegCounterPtr; 192 193struct _xmlRegCounter { 194 int min; 195 int max; 196}; 197 198typedef struct _xmlRegTrans xmlRegTrans; 199typedef xmlRegTrans *xmlRegTransPtr; 200 201struct _xmlRegTrans { 202 xmlRegAtomPtr atom; 203 int to; 204 int counter; 205 int count; 206 int nd; 207}; 208 209struct _xmlAutomataState { 210 xmlRegStateType type; 211 xmlRegMarkedType mark; 212 xmlRegMarkedType reached; 213 int no; 214 int maxTrans; 215 int nbTrans; 216 xmlRegTrans *trans; 217 /* knowing states ponting to us can speed things up */ 218 int maxTransTo; 219 int nbTransTo; 220 int *transTo; 221}; 222 223typedef struct _xmlAutomata xmlRegParserCtxt; 224typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 225 226struct _xmlAutomata { 227 xmlChar *string; 228 xmlChar *cur; 229 230 int error; 231 int neg; 232 233 xmlRegStatePtr start; 234 xmlRegStatePtr end; 235 xmlRegStatePtr state; 236 237 xmlRegAtomPtr atom; 238 239 int maxAtoms; 240 int nbAtoms; 241 xmlRegAtomPtr *atoms; 242 243 int maxStates; 244 int nbStates; 245 xmlRegStatePtr *states; 246 247 int maxCounters; 248 int nbCounters; 249 xmlRegCounter *counters; 250 251 int determinist; 252 int negs; 253}; 254 255struct _xmlRegexp { 256 xmlChar *string; 257 int nbStates; 258 xmlRegStatePtr *states; 259 int nbAtoms; 260 xmlRegAtomPtr *atoms; 261 int nbCounters; 262 xmlRegCounter *counters; 263 int determinist; 264 /* 265 * That's the compact form for determinists automatas 266 */ 267 int nbstates; 268 int *compact; 269 void **transdata; 270 int nbstrings; 271 xmlChar **stringMap; 272}; 273 274typedef struct _xmlRegExecRollback xmlRegExecRollback; 275typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 276 277struct _xmlRegExecRollback { 278 xmlRegStatePtr state;/* the current state */ 279 int index; /* the index in the input stack */ 280 int nextbranch; /* the next transition to explore in that state */ 281 int *counts; /* save the automata state if it has some */ 282}; 283 284typedef struct _xmlRegInputToken xmlRegInputToken; 285typedef xmlRegInputToken *xmlRegInputTokenPtr; 286 287struct _xmlRegInputToken { 288 xmlChar *value; 289 void *data; 290}; 291 292struct _xmlRegExecCtxt { 293 int status; /* execution status != 0 indicate an error */ 294 int determinist; /* did we find an indeterministic behaviour */ 295 xmlRegexpPtr comp; /* the compiled regexp */ 296 xmlRegExecCallbacks callback; 297 void *data; 298 299 xmlRegStatePtr state;/* the current state */ 300 int transno; /* the current transition on that state */ 301 int transcount; /* the number of chars in char counted transitions */ 302 303 /* 304 * A stack of rollback states 305 */ 306 int maxRollbacks; 307 int nbRollbacks; 308 xmlRegExecRollback *rollbacks; 309 310 /* 311 * The state of the automata if any 312 */ 313 int *counts; 314 315 /* 316 * The input stack 317 */ 318 int inputStackMax; 319 int inputStackNr; 320 int index; 321 int *charStack; 322 const xmlChar *inputString; /* when operating on characters */ 323 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 324 325 /* 326 * error handling 327 */ 328 int errStateNo; /* the error state number */ 329 xmlRegStatePtr errState; /* the error state */ 330 xmlChar *errString; /* the string raising the error */ 331 int *errCounts; /* counters at the error state */ 332 int nbPush; 333}; 334 335#define REGEXP_ALL_COUNTER 0x123456 336#define REGEXP_ALL_LAX_COUNTER 0x123457 337 338static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 339static void xmlRegFreeState(xmlRegStatePtr state); 340static void xmlRegFreeAtom(xmlRegAtomPtr atom); 341static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 342static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 343static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 344 int neg, int start, int end, const xmlChar *blockName); 345 346/************************************************************************ 347 * * 348 * Regexp memory error handler * 349 * * 350 ************************************************************************/ 351/** 352 * xmlRegexpErrMemory: 353 * @extra: extra information 354 * 355 * Handle an out of memory condition 356 */ 357static void 358xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 359{ 360 const char *regexp = NULL; 361 if (ctxt != NULL) { 362 regexp = (const char *) ctxt->string; 363 ctxt->error = XML_ERR_NO_MEMORY; 364 } 365 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 366 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 367 regexp, NULL, 0, 0, 368 "Memory allocation failed : %s\n", extra); 369} 370 371/** 372 * xmlRegexpErrCompile: 373 * @extra: extra information 374 * 375 * Handle a compilation failure 376 */ 377static void 378xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 379{ 380 const char *regexp = NULL; 381 int idx = 0; 382 383 if (ctxt != NULL) { 384 regexp = (const char *) ctxt->string; 385 idx = ctxt->cur - ctxt->string; 386 ctxt->error = XML_REGEXP_COMPILE_ERROR; 387 } 388 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 389 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 390 regexp, NULL, idx, 0, 391 "failed to compile: %s\n", extra); 392} 393 394/************************************************************************ 395 * * 396 * Allocation/Deallocation * 397 * * 398 ************************************************************************/ 399 400static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 401/** 402 * xmlRegEpxFromParse: 403 * @ctxt: the parser context used to build it 404 * 405 * Allocate a new regexp and fill it with the result from the parser 406 * 407 * Returns the new regexp or NULL in case of error 408 */ 409static xmlRegexpPtr 410xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 411 xmlRegexpPtr ret; 412 413 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 414 if (ret == NULL) { 415 xmlRegexpErrMemory(ctxt, "compiling regexp"); 416 return(NULL); 417 } 418 memset(ret, 0, sizeof(xmlRegexp)); 419 ret->string = ctxt->string; 420 ret->nbStates = ctxt->nbStates; 421 ret->states = ctxt->states; 422 ret->nbAtoms = ctxt->nbAtoms; 423 ret->atoms = ctxt->atoms; 424 ret->nbCounters = ctxt->nbCounters; 425 ret->counters = ctxt->counters; 426 ret->determinist = ctxt->determinist; 427 if (ret->determinist == -1) { 428 xmlRegexpIsDeterminist(ret); 429 } 430 431 if ((ret->determinist != 0) && 432 (ret->nbCounters == 0) && 433 (ctxt->negs == 0) && 434 (ret->atoms != NULL) && 435 (ret->atoms[0] != NULL) && 436 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 437 int i, j, nbstates = 0, nbatoms = 0; 438 int *stateRemap; 439 int *stringRemap; 440 int *transitions; 441 void **transdata; 442 xmlChar **stringMap; 443 xmlChar *value; 444 445 /* 446 * Switch to a compact representation 447 * 1/ counting the effective number of states left 448 * 2/ counting the unique number of atoms, and check that 449 * they are all of the string type 450 * 3/ build a table state x atom for the transitions 451 */ 452 453 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 454 if (stateRemap == NULL) { 455 xmlRegexpErrMemory(ctxt, "compiling regexp"); 456 xmlFree(ret); 457 return(NULL); 458 } 459 for (i = 0;i < ret->nbStates;i++) { 460 if (ret->states[i] != NULL) { 461 stateRemap[i] = nbstates; 462 nbstates++; 463 } else { 464 stateRemap[i] = -1; 465 } 466 } 467#ifdef DEBUG_COMPACTION 468 printf("Final: %d states\n", nbstates); 469#endif 470 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 471 if (stringMap == NULL) { 472 xmlRegexpErrMemory(ctxt, "compiling regexp"); 473 xmlFree(stateRemap); 474 xmlFree(ret); 475 return(NULL); 476 } 477 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 478 if (stringRemap == NULL) { 479 xmlRegexpErrMemory(ctxt, "compiling regexp"); 480 xmlFree(stringMap); 481 xmlFree(stateRemap); 482 xmlFree(ret); 483 return(NULL); 484 } 485 for (i = 0;i < ret->nbAtoms;i++) { 486 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 487 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 488 value = ret->atoms[i]->valuep; 489 for (j = 0;j < nbatoms;j++) { 490 if (xmlStrEqual(stringMap[j], value)) { 491 stringRemap[i] = j; 492 break; 493 } 494 } 495 if (j >= nbatoms) { 496 stringRemap[i] = nbatoms; 497 stringMap[nbatoms] = xmlStrdup(value); 498 if (stringMap[nbatoms] == NULL) { 499 for (i = 0;i < nbatoms;i++) 500 xmlFree(stringMap[i]); 501 xmlFree(stringRemap); 502 xmlFree(stringMap); 503 xmlFree(stateRemap); 504 xmlFree(ret); 505 return(NULL); 506 } 507 nbatoms++; 508 } 509 } else { 510 xmlFree(stateRemap); 511 xmlFree(stringRemap); 512 for (i = 0;i < nbatoms;i++) 513 xmlFree(stringMap[i]); 514 xmlFree(stringMap); 515 xmlFree(ret); 516 return(NULL); 517 } 518 } 519#ifdef DEBUG_COMPACTION 520 printf("Final: %d atoms\n", nbatoms); 521#endif 522 transitions = (int *) xmlMalloc((nbstates + 1) * 523 (nbatoms + 1) * sizeof(int)); 524 if (transitions == NULL) { 525 xmlFree(stateRemap); 526 xmlFree(stringRemap); 527 xmlFree(stringMap); 528 xmlFree(ret); 529 return(NULL); 530 } 531 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 532 533 /* 534 * Allocate the transition table. The first entry for each 535 * state corresponds to the state type. 536 */ 537 transdata = NULL; 538 539 for (i = 0;i < ret->nbStates;i++) { 540 int stateno, atomno, targetno, prev; 541 xmlRegStatePtr state; 542 xmlRegTransPtr trans; 543 544 stateno = stateRemap[i]; 545 if (stateno == -1) 546 continue; 547 state = ret->states[i]; 548 549 transitions[stateno * (nbatoms + 1)] = state->type; 550 551 for (j = 0;j < state->nbTrans;j++) { 552 trans = &(state->trans[j]); 553 if ((trans->to == -1) || (trans->atom == NULL)) 554 continue; 555 atomno = stringRemap[trans->atom->no]; 556 if ((trans->atom->data != NULL) && (transdata == NULL)) { 557 transdata = (void **) xmlMalloc(nbstates * nbatoms * 558 sizeof(void *)); 559 if (transdata != NULL) 560 memset(transdata, 0, 561 nbstates * nbatoms * sizeof(void *)); 562 else { 563 xmlRegexpErrMemory(ctxt, "compiling regexp"); 564 break; 565 } 566 } 567 targetno = stateRemap[trans->to]; 568 /* 569 * if the same atom can generate transitions to 2 different 570 * states then it means the automata is not determinist and 571 * the compact form can't be used ! 572 */ 573 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 574 if (prev != 0) { 575 if (prev != targetno + 1) { 576 ret->determinist = 0; 577#ifdef DEBUG_COMPACTION 578 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 579 i, j, trans->atom->no, trans->to, atomno, targetno); 580 printf(" previous to is %d\n", prev); 581#endif 582 if (transdata != NULL) 583 xmlFree(transdata); 584 xmlFree(transitions); 585 xmlFree(stateRemap); 586 xmlFree(stringRemap); 587 for (i = 0;i < nbatoms;i++) 588 xmlFree(stringMap[i]); 589 xmlFree(stringMap); 590 goto not_determ; 591 } 592 } else { 593#if 0 594 printf("State %d trans %d: atom %d to %d : %d to %d\n", 595 i, j, trans->atom->no, trans->to, atomno, targetno); 596#endif 597 transitions[stateno * (nbatoms + 1) + atomno + 1] = 598 targetno + 1; /* to avoid 0 */ 599 if (transdata != NULL) 600 transdata[stateno * nbatoms + atomno] = 601 trans->atom->data; 602 } 603 } 604 } 605 ret->determinist = 1; 606#ifdef DEBUG_COMPACTION 607 /* 608 * Debug 609 */ 610 for (i = 0;i < nbstates;i++) { 611 for (j = 0;j < nbatoms + 1;j++) { 612 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 613 } 614 printf("\n"); 615 } 616 printf("\n"); 617#endif 618 /* 619 * Cleanup of the old data 620 */ 621 if (ret->states != NULL) { 622 for (i = 0;i < ret->nbStates;i++) 623 xmlRegFreeState(ret->states[i]); 624 xmlFree(ret->states); 625 } 626 ret->states = NULL; 627 ret->nbStates = 0; 628 if (ret->atoms != NULL) { 629 for (i = 0;i < ret->nbAtoms;i++) 630 xmlRegFreeAtom(ret->atoms[i]); 631 xmlFree(ret->atoms); 632 } 633 ret->atoms = NULL; 634 ret->nbAtoms = 0; 635 636 ret->compact = transitions; 637 ret->transdata = transdata; 638 ret->stringMap = stringMap; 639 ret->nbstrings = nbatoms; 640 ret->nbstates = nbstates; 641 xmlFree(stateRemap); 642 xmlFree(stringRemap); 643 } 644not_determ: 645 ctxt->string = NULL; 646 ctxt->nbStates = 0; 647 ctxt->states = NULL; 648 ctxt->nbAtoms = 0; 649 ctxt->atoms = NULL; 650 ctxt->nbCounters = 0; 651 ctxt->counters = NULL; 652 return(ret); 653} 654 655/** 656 * xmlRegNewParserCtxt: 657 * @string: the string to parse 658 * 659 * Allocate a new regexp parser context 660 * 661 * Returns the new context or NULL in case of error 662 */ 663static xmlRegParserCtxtPtr 664xmlRegNewParserCtxt(const xmlChar *string) { 665 xmlRegParserCtxtPtr ret; 666 667 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 668 if (ret == NULL) 669 return(NULL); 670 memset(ret, 0, sizeof(xmlRegParserCtxt)); 671 if (string != NULL) 672 ret->string = xmlStrdup(string); 673 ret->cur = ret->string; 674 ret->neg = 0; 675 ret->negs = 0; 676 ret->error = 0; 677 ret->determinist = -1; 678 return(ret); 679} 680 681/** 682 * xmlRegNewRange: 683 * @ctxt: the regexp parser context 684 * @neg: is that negative 685 * @type: the type of range 686 * @start: the start codepoint 687 * @end: the end codepoint 688 * 689 * Allocate a new regexp range 690 * 691 * Returns the new range or NULL in case of error 692 */ 693static xmlRegRangePtr 694xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 695 int neg, xmlRegAtomType type, int start, int end) { 696 xmlRegRangePtr ret; 697 698 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 699 if (ret == NULL) { 700 xmlRegexpErrMemory(ctxt, "allocating range"); 701 return(NULL); 702 } 703 ret->neg = neg; 704 ret->type = type; 705 ret->start = start; 706 ret->end = end; 707 return(ret); 708} 709 710/** 711 * xmlRegFreeRange: 712 * @range: the regexp range 713 * 714 * Free a regexp range 715 */ 716static void 717xmlRegFreeRange(xmlRegRangePtr range) { 718 if (range == NULL) 719 return; 720 721 if (range->blockName != NULL) 722 xmlFree(range->blockName); 723 xmlFree(range); 724} 725 726/** 727 * xmlRegNewAtom: 728 * @ctxt: the regexp parser context 729 * @type: the type of atom 730 * 731 * Allocate a new regexp range 732 * 733 * Returns the new atom or NULL in case of error 734 */ 735static xmlRegAtomPtr 736xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 737 xmlRegAtomPtr ret; 738 739 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 740 if (ret == NULL) { 741 xmlRegexpErrMemory(ctxt, "allocating atom"); 742 return(NULL); 743 } 744 memset(ret, 0, sizeof(xmlRegAtom)); 745 ret->type = type; 746 ret->quant = XML_REGEXP_QUANT_ONCE; 747 ret->min = 0; 748 ret->max = 0; 749 return(ret); 750} 751 752/** 753 * xmlRegFreeAtom: 754 * @atom: the regexp atom 755 * 756 * Free a regexp atom 757 */ 758static void 759xmlRegFreeAtom(xmlRegAtomPtr atom) { 760 int i; 761 762 if (atom == NULL) 763 return; 764 765 for (i = 0;i < atom->nbRanges;i++) 766 xmlRegFreeRange(atom->ranges[i]); 767 if (atom->ranges != NULL) 768 xmlFree(atom->ranges); 769 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 770 xmlFree(atom->valuep); 771 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 772 xmlFree(atom->valuep2); 773 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 774 xmlFree(atom->valuep); 775 xmlFree(atom); 776} 777 778static xmlRegStatePtr 779xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 780 xmlRegStatePtr ret; 781 782 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 783 if (ret == NULL) { 784 xmlRegexpErrMemory(ctxt, "allocating state"); 785 return(NULL); 786 } 787 memset(ret, 0, sizeof(xmlRegState)); 788 ret->type = XML_REGEXP_TRANS_STATE; 789 ret->mark = XML_REGEXP_MARK_NORMAL; 790 return(ret); 791} 792 793/** 794 * xmlRegFreeState: 795 * @state: the regexp state 796 * 797 * Free a regexp state 798 */ 799static void 800xmlRegFreeState(xmlRegStatePtr state) { 801 if (state == NULL) 802 return; 803 804 if (state->trans != NULL) 805 xmlFree(state->trans); 806 if (state->transTo != NULL) 807 xmlFree(state->transTo); 808 xmlFree(state); 809} 810 811/** 812 * xmlRegFreeParserCtxt: 813 * @ctxt: the regexp parser context 814 * 815 * Free a regexp parser context 816 */ 817static void 818xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 819 int i; 820 if (ctxt == NULL) 821 return; 822 823 if (ctxt->string != NULL) 824 xmlFree(ctxt->string); 825 if (ctxt->states != NULL) { 826 for (i = 0;i < ctxt->nbStates;i++) 827 xmlRegFreeState(ctxt->states[i]); 828 xmlFree(ctxt->states); 829 } 830 if (ctxt->atoms != NULL) { 831 for (i = 0;i < ctxt->nbAtoms;i++) 832 xmlRegFreeAtom(ctxt->atoms[i]); 833 xmlFree(ctxt->atoms); 834 } 835 if (ctxt->counters != NULL) 836 xmlFree(ctxt->counters); 837 xmlFree(ctxt); 838} 839 840/************************************************************************ 841 * * 842 * Display of Data structures * 843 * * 844 ************************************************************************/ 845 846static void 847xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 848 switch (type) { 849 case XML_REGEXP_EPSILON: 850 fprintf(output, "epsilon "); break; 851 case XML_REGEXP_CHARVAL: 852 fprintf(output, "charval "); break; 853 case XML_REGEXP_RANGES: 854 fprintf(output, "ranges "); break; 855 case XML_REGEXP_SUBREG: 856 fprintf(output, "subexpr "); break; 857 case XML_REGEXP_STRING: 858 fprintf(output, "string "); break; 859 case XML_REGEXP_ANYCHAR: 860 fprintf(output, "anychar "); break; 861 case XML_REGEXP_ANYSPACE: 862 fprintf(output, "anyspace "); break; 863 case XML_REGEXP_NOTSPACE: 864 fprintf(output, "notspace "); break; 865 case XML_REGEXP_INITNAME: 866 fprintf(output, "initname "); break; 867 case XML_REGEXP_NOTINITNAME: 868 fprintf(output, "notinitname "); break; 869 case XML_REGEXP_NAMECHAR: 870 fprintf(output, "namechar "); break; 871 case XML_REGEXP_NOTNAMECHAR: 872 fprintf(output, "notnamechar "); break; 873 case XML_REGEXP_DECIMAL: 874 fprintf(output, "decimal "); break; 875 case XML_REGEXP_NOTDECIMAL: 876 fprintf(output, "notdecimal "); break; 877 case XML_REGEXP_REALCHAR: 878 fprintf(output, "realchar "); break; 879 case XML_REGEXP_NOTREALCHAR: 880 fprintf(output, "notrealchar "); break; 881 case XML_REGEXP_LETTER: 882 fprintf(output, "LETTER "); break; 883 case XML_REGEXP_LETTER_UPPERCASE: 884 fprintf(output, "LETTER_UPPERCASE "); break; 885 case XML_REGEXP_LETTER_LOWERCASE: 886 fprintf(output, "LETTER_LOWERCASE "); break; 887 case XML_REGEXP_LETTER_TITLECASE: 888 fprintf(output, "LETTER_TITLECASE "); break; 889 case XML_REGEXP_LETTER_MODIFIER: 890 fprintf(output, "LETTER_MODIFIER "); break; 891 case XML_REGEXP_LETTER_OTHERS: 892 fprintf(output, "LETTER_OTHERS "); break; 893 case XML_REGEXP_MARK: 894 fprintf(output, "MARK "); break; 895 case XML_REGEXP_MARK_NONSPACING: 896 fprintf(output, "MARK_NONSPACING "); break; 897 case XML_REGEXP_MARK_SPACECOMBINING: 898 fprintf(output, "MARK_SPACECOMBINING "); break; 899 case XML_REGEXP_MARK_ENCLOSING: 900 fprintf(output, "MARK_ENCLOSING "); break; 901 case XML_REGEXP_NUMBER: 902 fprintf(output, "NUMBER "); break; 903 case XML_REGEXP_NUMBER_DECIMAL: 904 fprintf(output, "NUMBER_DECIMAL "); break; 905 case XML_REGEXP_NUMBER_LETTER: 906 fprintf(output, "NUMBER_LETTER "); break; 907 case XML_REGEXP_NUMBER_OTHERS: 908 fprintf(output, "NUMBER_OTHERS "); break; 909 case XML_REGEXP_PUNCT: 910 fprintf(output, "PUNCT "); break; 911 case XML_REGEXP_PUNCT_CONNECTOR: 912 fprintf(output, "PUNCT_CONNECTOR "); break; 913 case XML_REGEXP_PUNCT_DASH: 914 fprintf(output, "PUNCT_DASH "); break; 915 case XML_REGEXP_PUNCT_OPEN: 916 fprintf(output, "PUNCT_OPEN "); break; 917 case XML_REGEXP_PUNCT_CLOSE: 918 fprintf(output, "PUNCT_CLOSE "); break; 919 case XML_REGEXP_PUNCT_INITQUOTE: 920 fprintf(output, "PUNCT_INITQUOTE "); break; 921 case XML_REGEXP_PUNCT_FINQUOTE: 922 fprintf(output, "PUNCT_FINQUOTE "); break; 923 case XML_REGEXP_PUNCT_OTHERS: 924 fprintf(output, "PUNCT_OTHERS "); break; 925 case XML_REGEXP_SEPAR: 926 fprintf(output, "SEPAR "); break; 927 case XML_REGEXP_SEPAR_SPACE: 928 fprintf(output, "SEPAR_SPACE "); break; 929 case XML_REGEXP_SEPAR_LINE: 930 fprintf(output, "SEPAR_LINE "); break; 931 case XML_REGEXP_SEPAR_PARA: 932 fprintf(output, "SEPAR_PARA "); break; 933 case XML_REGEXP_SYMBOL: 934 fprintf(output, "SYMBOL "); break; 935 case XML_REGEXP_SYMBOL_MATH: 936 fprintf(output, "SYMBOL_MATH "); break; 937 case XML_REGEXP_SYMBOL_CURRENCY: 938 fprintf(output, "SYMBOL_CURRENCY "); break; 939 case XML_REGEXP_SYMBOL_MODIFIER: 940 fprintf(output, "SYMBOL_MODIFIER "); break; 941 case XML_REGEXP_SYMBOL_OTHERS: 942 fprintf(output, "SYMBOL_OTHERS "); break; 943 case XML_REGEXP_OTHER: 944 fprintf(output, "OTHER "); break; 945 case XML_REGEXP_OTHER_CONTROL: 946 fprintf(output, "OTHER_CONTROL "); break; 947 case XML_REGEXP_OTHER_FORMAT: 948 fprintf(output, "OTHER_FORMAT "); break; 949 case XML_REGEXP_OTHER_PRIVATE: 950 fprintf(output, "OTHER_PRIVATE "); break; 951 case XML_REGEXP_OTHER_NA: 952 fprintf(output, "OTHER_NA "); break; 953 case XML_REGEXP_BLOCK_NAME: 954 fprintf(output, "BLOCK "); break; 955 } 956} 957 958static void 959xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 960 switch (type) { 961 case XML_REGEXP_QUANT_EPSILON: 962 fprintf(output, "epsilon "); break; 963 case XML_REGEXP_QUANT_ONCE: 964 fprintf(output, "once "); break; 965 case XML_REGEXP_QUANT_OPT: 966 fprintf(output, "? "); break; 967 case XML_REGEXP_QUANT_MULT: 968 fprintf(output, "* "); break; 969 case XML_REGEXP_QUANT_PLUS: 970 fprintf(output, "+ "); break; 971 case XML_REGEXP_QUANT_RANGE: 972 fprintf(output, "range "); break; 973 case XML_REGEXP_QUANT_ONCEONLY: 974 fprintf(output, "onceonly "); break; 975 case XML_REGEXP_QUANT_ALL: 976 fprintf(output, "all "); break; 977 } 978} 979static void 980xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 981 fprintf(output, " range: "); 982 if (range->neg) 983 fprintf(output, "negative "); 984 xmlRegPrintAtomType(output, range->type); 985 fprintf(output, "%c - %c\n", range->start, range->end); 986} 987 988static void 989xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 990 fprintf(output, " atom: "); 991 if (atom == NULL) { 992 fprintf(output, "NULL\n"); 993 return; 994 } 995 if (atom->neg) 996 fprintf(output, "not "); 997 xmlRegPrintAtomType(output, atom->type); 998 xmlRegPrintQuantType(output, atom->quant); 999 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1000 fprintf(output, "%d-%d ", atom->min, atom->max); 1001 if (atom->type == XML_REGEXP_STRING) 1002 fprintf(output, "'%s' ", (char *) atom->valuep); 1003 if (atom->type == XML_REGEXP_CHARVAL) 1004 fprintf(output, "char %c\n", atom->codepoint); 1005 else if (atom->type == XML_REGEXP_RANGES) { 1006 int i; 1007 fprintf(output, "%d entries\n", atom->nbRanges); 1008 for (i = 0; i < atom->nbRanges;i++) 1009 xmlRegPrintRange(output, atom->ranges[i]); 1010 } else if (atom->type == XML_REGEXP_SUBREG) { 1011 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1012 } else { 1013 fprintf(output, "\n"); 1014 } 1015} 1016 1017static void 1018xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1019 fprintf(output, " trans: "); 1020 if (trans == NULL) { 1021 fprintf(output, "NULL\n"); 1022 return; 1023 } 1024 if (trans->to < 0) { 1025 fprintf(output, "removed\n"); 1026 return; 1027 } 1028 if (trans->nd != 0) { 1029 if (trans->nd == 2) 1030 fprintf(output, "last not determinist, "); 1031 else 1032 fprintf(output, "not determinist, "); 1033 } 1034 if (trans->counter >= 0) { 1035 fprintf(output, "counted %d, ", trans->counter); 1036 } 1037 if (trans->count == REGEXP_ALL_COUNTER) { 1038 fprintf(output, "all transition, "); 1039 } else if (trans->count >= 0) { 1040 fprintf(output, "count based %d, ", trans->count); 1041 } 1042 if (trans->atom == NULL) { 1043 fprintf(output, "epsilon to %d\n", trans->to); 1044 return; 1045 } 1046 if (trans->atom->type == XML_REGEXP_CHARVAL) 1047 fprintf(output, "char %c ", trans->atom->codepoint); 1048 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1049} 1050 1051static void 1052xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1053 int i; 1054 1055 fprintf(output, " state: "); 1056 if (state == NULL) { 1057 fprintf(output, "NULL\n"); 1058 return; 1059 } 1060 if (state->type == XML_REGEXP_START_STATE) 1061 fprintf(output, "START "); 1062 if (state->type == XML_REGEXP_FINAL_STATE) 1063 fprintf(output, "FINAL "); 1064 1065 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1066 for (i = 0;i < state->nbTrans; i++) { 1067 xmlRegPrintTrans(output, &(state->trans[i])); 1068 } 1069} 1070 1071#ifdef DEBUG_REGEXP_GRAPH 1072static void 1073xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1074 int i; 1075 1076 fprintf(output, " ctxt: "); 1077 if (ctxt == NULL) { 1078 fprintf(output, "NULL\n"); 1079 return; 1080 } 1081 fprintf(output, "'%s' ", ctxt->string); 1082 if (ctxt->error) 1083 fprintf(output, "error "); 1084 if (ctxt->neg) 1085 fprintf(output, "neg "); 1086 fprintf(output, "\n"); 1087 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1088 for (i = 0;i < ctxt->nbAtoms; i++) { 1089 fprintf(output, " %02d ", i); 1090 xmlRegPrintAtom(output, ctxt->atoms[i]); 1091 } 1092 if (ctxt->atom != NULL) { 1093 fprintf(output, "current atom:\n"); 1094 xmlRegPrintAtom(output, ctxt->atom); 1095 } 1096 fprintf(output, "%d states:", ctxt->nbStates); 1097 if (ctxt->start != NULL) 1098 fprintf(output, " start: %d", ctxt->start->no); 1099 if (ctxt->end != NULL) 1100 fprintf(output, " end: %d", ctxt->end->no); 1101 fprintf(output, "\n"); 1102 for (i = 0;i < ctxt->nbStates; i++) { 1103 xmlRegPrintState(output, ctxt->states[i]); 1104 } 1105 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1106 for (i = 0;i < ctxt->nbCounters; i++) { 1107 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1108 ctxt->counters[i].max); 1109 } 1110} 1111#endif 1112 1113/************************************************************************ 1114 * * 1115 * Finite Automata structures manipulations * 1116 * * 1117 ************************************************************************/ 1118 1119static void 1120xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1121 int neg, xmlRegAtomType type, int start, int end, 1122 xmlChar *blockName) { 1123 xmlRegRangePtr range; 1124 1125 if (atom == NULL) { 1126 ERROR("add range: atom is NULL"); 1127 return; 1128 } 1129 if (atom->type != XML_REGEXP_RANGES) { 1130 ERROR("add range: atom is not ranges"); 1131 return; 1132 } 1133 if (atom->maxRanges == 0) { 1134 atom->maxRanges = 4; 1135 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1136 sizeof(xmlRegRangePtr)); 1137 if (atom->ranges == NULL) { 1138 xmlRegexpErrMemory(ctxt, "adding ranges"); 1139 atom->maxRanges = 0; 1140 return; 1141 } 1142 } else if (atom->nbRanges >= atom->maxRanges) { 1143 xmlRegRangePtr *tmp; 1144 atom->maxRanges *= 2; 1145 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1146 sizeof(xmlRegRangePtr)); 1147 if (tmp == NULL) { 1148 xmlRegexpErrMemory(ctxt, "adding ranges"); 1149 atom->maxRanges /= 2; 1150 return; 1151 } 1152 atom->ranges = tmp; 1153 } 1154 range = xmlRegNewRange(ctxt, neg, type, start, end); 1155 if (range == NULL) 1156 return; 1157 range->blockName = blockName; 1158 atom->ranges[atom->nbRanges++] = range; 1159 1160} 1161 1162static int 1163xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1164 if (ctxt->maxCounters == 0) { 1165 ctxt->maxCounters = 4; 1166 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1167 sizeof(xmlRegCounter)); 1168 if (ctxt->counters == NULL) { 1169 xmlRegexpErrMemory(ctxt, "allocating counter"); 1170 ctxt->maxCounters = 0; 1171 return(-1); 1172 } 1173 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1174 xmlRegCounter *tmp; 1175 ctxt->maxCounters *= 2; 1176 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1177 sizeof(xmlRegCounter)); 1178 if (tmp == NULL) { 1179 xmlRegexpErrMemory(ctxt, "allocating counter"); 1180 ctxt->maxCounters /= 2; 1181 return(-1); 1182 } 1183 ctxt->counters = tmp; 1184 } 1185 ctxt->counters[ctxt->nbCounters].min = -1; 1186 ctxt->counters[ctxt->nbCounters].max = -1; 1187 return(ctxt->nbCounters++); 1188} 1189 1190static int 1191xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1192 if (atom == NULL) { 1193 ERROR("atom push: atom is NULL"); 1194 return(-1); 1195 } 1196 if (ctxt->maxAtoms == 0) { 1197 ctxt->maxAtoms = 4; 1198 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1199 sizeof(xmlRegAtomPtr)); 1200 if (ctxt->atoms == NULL) { 1201 xmlRegexpErrMemory(ctxt, "pushing atom"); 1202 ctxt->maxAtoms = 0; 1203 return(-1); 1204 } 1205 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1206 xmlRegAtomPtr *tmp; 1207 ctxt->maxAtoms *= 2; 1208 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1209 sizeof(xmlRegAtomPtr)); 1210 if (tmp == NULL) { 1211 xmlRegexpErrMemory(ctxt, "allocating counter"); 1212 ctxt->maxAtoms /= 2; 1213 return(-1); 1214 } 1215 ctxt->atoms = tmp; 1216 } 1217 atom->no = ctxt->nbAtoms; 1218 ctxt->atoms[ctxt->nbAtoms++] = atom; 1219 return(0); 1220} 1221 1222static void 1223xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1224 int from) { 1225 if (target->maxTransTo == 0) { 1226 target->maxTransTo = 8; 1227 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1228 sizeof(int)); 1229 if (target->transTo == NULL) { 1230 xmlRegexpErrMemory(ctxt, "adding transition"); 1231 target->maxTransTo = 0; 1232 return; 1233 } 1234 } else if (target->nbTransTo >= target->maxTransTo) { 1235 int *tmp; 1236 target->maxTransTo *= 2; 1237 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1238 sizeof(int)); 1239 if (tmp == NULL) { 1240 xmlRegexpErrMemory(ctxt, "adding transition"); 1241 target->maxTransTo /= 2; 1242 return; 1243 } 1244 target->transTo = tmp; 1245 } 1246 target->transTo[target->nbTransTo] = from; 1247 target->nbTransTo++; 1248} 1249 1250static void 1251xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1252 xmlRegAtomPtr atom, xmlRegStatePtr target, 1253 int counter, int count) { 1254 1255 int nrtrans; 1256 1257 if (state == NULL) { 1258 ERROR("add state: state is NULL"); 1259 return; 1260 } 1261 if (target == NULL) { 1262 ERROR("add state: target is NULL"); 1263 return; 1264 } 1265 /* 1266 * Other routines follow the philosophy 'When in doubt, add a transition' 1267 * so we check here whether such a transition is already present and, if 1268 * so, silently ignore this request. 1269 */ 1270 1271 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1272 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1273 if ((trans->atom == atom) && 1274 (trans->to == target->no) && 1275 (trans->counter == counter) && 1276 (trans->count == count)) { 1277#ifdef DEBUG_REGEXP_GRAPH 1278 printf("Ignoring duplicate transition from %d to %d\n", 1279 state->no, target->no); 1280#endif 1281 return; 1282 } 1283 } 1284 1285 if (state->maxTrans == 0) { 1286 state->maxTrans = 8; 1287 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1288 sizeof(xmlRegTrans)); 1289 if (state->trans == NULL) { 1290 xmlRegexpErrMemory(ctxt, "adding transition"); 1291 state->maxTrans = 0; 1292 return; 1293 } 1294 } else if (state->nbTrans >= state->maxTrans) { 1295 xmlRegTrans *tmp; 1296 state->maxTrans *= 2; 1297 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1298 sizeof(xmlRegTrans)); 1299 if (tmp == NULL) { 1300 xmlRegexpErrMemory(ctxt, "adding transition"); 1301 state->maxTrans /= 2; 1302 return; 1303 } 1304 state->trans = tmp; 1305 } 1306#ifdef DEBUG_REGEXP_GRAPH 1307 printf("Add trans from %d to %d ", state->no, target->no); 1308 if (count == REGEXP_ALL_COUNTER) 1309 printf("all transition\n"); 1310 else if (count >= 0) 1311 printf("count based %d\n", count); 1312 else if (counter >= 0) 1313 printf("counted %d\n", counter); 1314 else if (atom == NULL) 1315 printf("epsilon transition\n"); 1316 else if (atom != NULL) 1317 xmlRegPrintAtom(stdout, atom); 1318#endif 1319 1320 state->trans[state->nbTrans].atom = atom; 1321 state->trans[state->nbTrans].to = target->no; 1322 state->trans[state->nbTrans].counter = counter; 1323 state->trans[state->nbTrans].count = count; 1324 state->trans[state->nbTrans].nd = 0; 1325 state->nbTrans++; 1326 xmlRegStateAddTransTo(ctxt, target, state->no); 1327} 1328 1329static int 1330xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1331 if (state == NULL) return(-1); 1332 if (ctxt->maxStates == 0) { 1333 ctxt->maxStates = 4; 1334 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1335 sizeof(xmlRegStatePtr)); 1336 if (ctxt->states == NULL) { 1337 xmlRegexpErrMemory(ctxt, "adding state"); 1338 ctxt->maxStates = 0; 1339 return(-1); 1340 } 1341 } else if (ctxt->nbStates >= ctxt->maxStates) { 1342 xmlRegStatePtr *tmp; 1343 ctxt->maxStates *= 2; 1344 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1345 sizeof(xmlRegStatePtr)); 1346 if (tmp == NULL) { 1347 xmlRegexpErrMemory(ctxt, "adding state"); 1348 ctxt->maxStates /= 2; 1349 return(-1); 1350 } 1351 ctxt->states = tmp; 1352 } 1353 state->no = ctxt->nbStates; 1354 ctxt->states[ctxt->nbStates++] = state; 1355 return(0); 1356} 1357 1358/** 1359 * xmlFAGenerateAllTransition: 1360 * @ctxt: a regexp parser context 1361 * @from: the from state 1362 * @to: the target state or NULL for building a new one 1363 * @lax: 1364 * 1365 */ 1366static void 1367xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1368 xmlRegStatePtr from, xmlRegStatePtr to, 1369 int lax) { 1370 if (to == NULL) { 1371 to = xmlRegNewState(ctxt); 1372 xmlRegStatePush(ctxt, to); 1373 ctxt->state = to; 1374 } 1375 if (lax) 1376 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1377 else 1378 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1379} 1380 1381/** 1382 * xmlFAGenerateEpsilonTransition: 1383 * @ctxt: a regexp parser context 1384 * @from: the from state 1385 * @to: the target state or NULL for building a new one 1386 * 1387 */ 1388static void 1389xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1390 xmlRegStatePtr from, xmlRegStatePtr to) { 1391 if (to == NULL) { 1392 to = xmlRegNewState(ctxt); 1393 xmlRegStatePush(ctxt, to); 1394 ctxt->state = to; 1395 } 1396 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1397} 1398 1399/** 1400 * xmlFAGenerateCountedEpsilonTransition: 1401 * @ctxt: a regexp parser context 1402 * @from: the from state 1403 * @to: the target state or NULL for building a new one 1404 * counter: the counter for that transition 1405 * 1406 */ 1407static void 1408xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1409 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1410 if (to == NULL) { 1411 to = xmlRegNewState(ctxt); 1412 xmlRegStatePush(ctxt, to); 1413 ctxt->state = to; 1414 } 1415 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1416} 1417 1418/** 1419 * xmlFAGenerateCountedTransition: 1420 * @ctxt: a regexp parser context 1421 * @from: the from state 1422 * @to: the target state or NULL for building a new one 1423 * counter: the counter for that transition 1424 * 1425 */ 1426static void 1427xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1428 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1429 if (to == NULL) { 1430 to = xmlRegNewState(ctxt); 1431 xmlRegStatePush(ctxt, to); 1432 ctxt->state = to; 1433 } 1434 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1435} 1436 1437/** 1438 * xmlFAGenerateTransitions: 1439 * @ctxt: a regexp parser context 1440 * @from: the from state 1441 * @to: the target state or NULL for building a new one 1442 * @atom: the atom generating the transition 1443 * 1444 * Returns 0 if success and -1 in case of error. 1445 */ 1446static int 1447xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1448 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1449 if (atom == NULL) { 1450 ERROR("genrate transition: atom == NULL"); 1451 return(-1); 1452 } 1453 if (atom->type == XML_REGEXP_SUBREG) { 1454 /* 1455 * this is a subexpression handling one should not need to 1456 * create a new node except for XML_REGEXP_QUANT_RANGE. 1457 */ 1458 if (xmlRegAtomPush(ctxt, atom) < 0) { 1459 return(-1); 1460 } 1461 if ((to != NULL) && (atom->stop != to) && 1462 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1463 /* 1464 * Generate an epsilon transition to link to the target 1465 */ 1466 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1467#ifdef DV 1468 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1469 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1470 to = xmlRegNewState(ctxt); 1471 xmlRegStatePush(ctxt, to); 1472 ctxt->state = to; 1473 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1474#endif 1475 } 1476 switch (atom->quant) { 1477 case XML_REGEXP_QUANT_OPT: 1478 atom->quant = XML_REGEXP_QUANT_ONCE; 1479 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1480 break; 1481 case XML_REGEXP_QUANT_MULT: 1482 atom->quant = XML_REGEXP_QUANT_ONCE; 1483 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1484 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1485 break; 1486 case XML_REGEXP_QUANT_PLUS: 1487 atom->quant = XML_REGEXP_QUANT_ONCE; 1488 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1489 break; 1490 case XML_REGEXP_QUANT_RANGE: { 1491 int counter; 1492 xmlRegStatePtr newstate; 1493 1494 /* 1495 * This one is nasty: 1496 * 1/ if range has minOccurs == 0, create a new state 1497 * and create epsilon transitions from atom->start 1498 * to atom->stop, as well as atom->start to the new 1499 * state 1500 * 2/ register a new counter 1501 * 3/ register an epsilon transition associated to 1502 * this counter going from atom->stop to atom->start 1503 * 4/ create a new state 1504 * 5/ generate a counted transition from atom->stop to 1505 * that state 1506 */ 1507 if (atom->min == 0) { 1508 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1509 atom->stop); 1510 newstate = xmlRegNewState(ctxt); 1511 xmlRegStatePush(ctxt, newstate); 1512 ctxt->state = newstate; 1513 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1514 newstate); 1515 } 1516 counter = xmlRegGetCounter(ctxt); 1517 ctxt->counters[counter].min = atom->min - 1; 1518 ctxt->counters[counter].max = atom->max - 1; 1519 atom->min = 0; 1520 atom->max = 0; 1521 atom->quant = XML_REGEXP_QUANT_ONCE; 1522 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1523 atom->start, counter); 1524 if (to != NULL) { 1525 newstate = to; 1526 } else { 1527 newstate = xmlRegNewState(ctxt); 1528 xmlRegStatePush(ctxt, newstate); 1529 ctxt->state = newstate; 1530 } 1531 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1532 newstate, counter); 1533 } 1534 default: 1535 break; 1536 } 1537 return(0); 1538 } 1539 if ((atom->min == 0) && (atom->max == 0) && 1540 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1541 /* 1542 * we can discard the atom and generate an epsilon transition instead 1543 */ 1544 if (to == NULL) { 1545 to = xmlRegNewState(ctxt); 1546 if (to != NULL) 1547 xmlRegStatePush(ctxt, to); 1548 else { 1549 return(-1); 1550 } 1551 } 1552 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1553 ctxt->state = to; 1554 xmlRegFreeAtom(atom); 1555 return(0); 1556 } 1557 if (to == NULL) { 1558 to = xmlRegNewState(ctxt); 1559 if (to != NULL) 1560 xmlRegStatePush(ctxt, to); 1561 else { 1562 return(-1); 1563 } 1564 } 1565 if (xmlRegAtomPush(ctxt, atom) < 0) { 1566 return(-1); 1567 } 1568 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1569 ctxt->state = to; 1570 switch (atom->quant) { 1571 case XML_REGEXP_QUANT_OPT: 1572 atom->quant = XML_REGEXP_QUANT_ONCE; 1573 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1574 break; 1575 case XML_REGEXP_QUANT_MULT: 1576 atom->quant = XML_REGEXP_QUANT_ONCE; 1577 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1578 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1579 break; 1580 case XML_REGEXP_QUANT_PLUS: 1581 atom->quant = XML_REGEXP_QUANT_ONCE; 1582 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1583 break; 1584 default: 1585 break; 1586 } 1587 return(0); 1588} 1589 1590/** 1591 * xmlFAReduceEpsilonTransitions: 1592 * @ctxt: a regexp parser context 1593 * @fromnr: the from state 1594 * @tonr: the to state 1595 * @counter: should that transition be associated to a counted 1596 * 1597 */ 1598static void 1599xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1600 int tonr, int counter) { 1601 int transnr; 1602 xmlRegStatePtr from; 1603 xmlRegStatePtr to; 1604 1605#ifdef DEBUG_REGEXP_GRAPH 1606 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1607#endif 1608 from = ctxt->states[fromnr]; 1609 if (from == NULL) 1610 return; 1611 to = ctxt->states[tonr]; 1612 if (to == NULL) 1613 return; 1614 if ((to->mark == XML_REGEXP_MARK_START) || 1615 (to->mark == XML_REGEXP_MARK_VISITED)) 1616 return; 1617 1618 to->mark = XML_REGEXP_MARK_VISITED; 1619 if (to->type == XML_REGEXP_FINAL_STATE) { 1620#ifdef DEBUG_REGEXP_GRAPH 1621 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1622#endif 1623 from->type = XML_REGEXP_FINAL_STATE; 1624 } 1625 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1626 if (to->trans[transnr].to < 0) 1627 continue; 1628 if (to->trans[transnr].atom == NULL) { 1629 /* 1630 * Don't remove counted transitions 1631 * Don't loop either 1632 */ 1633 if (to->trans[transnr].to != fromnr) { 1634 if (to->trans[transnr].count >= 0) { 1635 int newto = to->trans[transnr].to; 1636 1637 xmlRegStateAddTrans(ctxt, from, NULL, 1638 ctxt->states[newto], 1639 -1, to->trans[transnr].count); 1640 } else { 1641#ifdef DEBUG_REGEXP_GRAPH 1642 printf("Found epsilon trans %d from %d to %d\n", 1643 transnr, tonr, to->trans[transnr].to); 1644#endif 1645 if (to->trans[transnr].counter >= 0) { 1646 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1647 to->trans[transnr].to, 1648 to->trans[transnr].counter); 1649 } else { 1650 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1651 to->trans[transnr].to, 1652 counter); 1653 } 1654 } 1655 } 1656 } else { 1657 int newto = to->trans[transnr].to; 1658 1659 if (to->trans[transnr].counter >= 0) { 1660 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1661 ctxt->states[newto], 1662 to->trans[transnr].counter, -1); 1663 } else { 1664 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1665 ctxt->states[newto], counter, -1); 1666 } 1667 } 1668 } 1669 to->mark = XML_REGEXP_MARK_NORMAL; 1670} 1671 1672/** 1673 * xmlFAEliminateSimpleEpsilonTransitions: 1674 * @ctxt: a regexp parser context 1675 * 1676 * Eliminating general epsilon transitions can get costly in the general 1677 * algorithm due to the large amount of generated new transitions and 1678 * associated comparisons. However for simple epsilon transition used just 1679 * to separate building blocks when generating the automata this can be 1680 * reduced to state elimination: 1681 * - if there exists an epsilon from X to Y 1682 * - if there is no other transition from X 1683 * then X and Y are semantically equivalent and X can be eliminated 1684 * If X is the start state then make Y the start state, else replace the 1685 * target of all transitions to X by transitions to Y. 1686 */ 1687static void 1688xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1689 int statenr, i, j, newto; 1690 xmlRegStatePtr state, tmp; 1691 1692 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1693 state = ctxt->states[statenr]; 1694 if (state == NULL) 1695 continue; 1696 if (state->nbTrans != 1) 1697 continue; 1698 /* is the only transition out a basic transition */ 1699 if ((state->trans[0].atom == NULL) && 1700 (state->trans[0].to >= 0) && 1701 (state->trans[0].to != statenr) && 1702 (state->trans[0].counter < 0) && 1703 (state->trans[0].count < 0)) { 1704 newto = state->trans[0].to; 1705 1706 if (state->type == XML_REGEXP_START_STATE) { 1707#ifdef DEBUG_REGEXP_GRAPH 1708 printf("Found simple epsilon trans from start %d to %d\n", 1709 statenr, newto); 1710#endif 1711 } else { 1712#ifdef DEBUG_REGEXP_GRAPH 1713 printf("Found simple epsilon trans from %d to %d\n", 1714 statenr, newto); 1715#endif 1716 for (i = 0;i < state->nbTransTo;i++) { 1717 tmp = ctxt->states[state->transTo[i]]; 1718 for (j = 0;j < tmp->nbTrans;j++) { 1719 if (tmp->trans[j].to == statenr) { 1720 tmp->trans[j].to = newto; 1721#ifdef DEBUG_REGEXP_GRAPH 1722 printf("Changed transition %d on %d to go to %d\n", 1723 j, tmp->no, newto); 1724#endif 1725 xmlRegStateAddTransTo(ctxt, ctxt->states[newto], 1726 tmp->no); 1727 } 1728 } 1729 } 1730#if 0 1731 for (i = 0;i < ctxt->nbStates;i++) { 1732 tmp = ctxt->states[i]; 1733 for (j = 0;j < tmp->nbTrans;j++) { 1734 if (tmp->trans[j].to == statenr) { 1735 tmp->trans[j].to = newto; 1736#ifdef DEBUG_REGEXP_GRAPH 1737 printf("Changed transition %d on %d to go to %d\n", 1738 j, tmp->no, newto); 1739#endif 1740 } 1741 } 1742 } 1743#endif 1744 if (state->type == XML_REGEXP_FINAL_STATE) 1745 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1746 /* eliminate the transition completely */ 1747 state->nbTrans = 0; 1748 1749 1750 } 1751 1752 } 1753 } 1754} 1755/** 1756 * xmlFAEliminateEpsilonTransitions: 1757 * @ctxt: a regexp parser context 1758 * 1759 */ 1760static void 1761xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1762 int statenr, transnr; 1763 xmlRegStatePtr state; 1764 int has_epsilon; 1765 1766 if (ctxt->states == NULL) return; 1767 1768 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1769 1770 has_epsilon = 0; 1771 1772 /* 1773 * build the completed transitions bypassing the epsilons 1774 * Use a marking algorithm to avoid loops 1775 * mark sink states too. 1776 */ 1777 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1778 state = ctxt->states[statenr]; 1779 if (state == NULL) 1780 continue; 1781 if ((state->nbTrans == 0) && 1782 (state->type != XML_REGEXP_FINAL_STATE)) { 1783 state->type = XML_REGEXP_SINK_STATE; 1784 } 1785 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1786 if ((state->trans[transnr].atom == NULL) && 1787 (state->trans[transnr].to >= 0)) { 1788 if (state->trans[transnr].to == statenr) { 1789 state->trans[transnr].to = -1; 1790#ifdef DEBUG_REGEXP_GRAPH 1791 printf("Removed loopback epsilon trans %d on %d\n", 1792 transnr, statenr); 1793#endif 1794 } else if (state->trans[transnr].count < 0) { 1795 int newto = state->trans[transnr].to; 1796 1797#ifdef DEBUG_REGEXP_GRAPH 1798 printf("Found epsilon trans %d from %d to %d\n", 1799 transnr, statenr, newto); 1800#endif 1801 state->mark = XML_REGEXP_MARK_START; 1802 has_epsilon = 1; 1803 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1804 newto, state->trans[transnr].counter); 1805 state->mark = XML_REGEXP_MARK_NORMAL; 1806#ifdef DEBUG_REGEXP_GRAPH 1807 } else { 1808 printf("Found counted transition %d on %d\n", 1809 transnr, statenr); 1810#endif 1811 } 1812 } 1813 } 1814 } 1815 /* 1816 * Eliminate the epsilon transitions 1817 */ 1818 if (has_epsilon) { 1819 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1820 state = ctxt->states[statenr]; 1821 if (state == NULL) 1822 continue; 1823 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1824 xmlRegTransPtr trans = &(state->trans[transnr]); 1825 if ((trans->atom == NULL) && 1826 (trans->count < 0) && 1827 (trans->to >= 0)) { 1828 trans->to = -1; 1829 } 1830 } 1831 } 1832 } 1833 1834 /* 1835 * Use this pass to detect unreachable states too 1836 */ 1837 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1838 state = ctxt->states[statenr]; 1839 if (state != NULL) 1840 state->reached = XML_REGEXP_MARK_NORMAL; 1841 } 1842 state = ctxt->states[0]; 1843 if (state != NULL) 1844 state->reached = XML_REGEXP_MARK_START; 1845 while (state != NULL) { 1846 xmlRegStatePtr target = NULL; 1847 state->reached = XML_REGEXP_MARK_VISITED; 1848 /* 1849 * Mark all states reachable from the current reachable state 1850 */ 1851 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1852 if ((state->trans[transnr].to >= 0) && 1853 ((state->trans[transnr].atom != NULL) || 1854 (state->trans[transnr].count >= 0))) { 1855 int newto = state->trans[transnr].to; 1856 1857 if (ctxt->states[newto] == NULL) 1858 continue; 1859 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 1860 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 1861 target = ctxt->states[newto]; 1862 } 1863 } 1864 } 1865 1866 /* 1867 * find the next accessible state not explored 1868 */ 1869 if (target == NULL) { 1870 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1871 state = ctxt->states[statenr]; 1872 if ((state != NULL) && (state->reached == 1873 XML_REGEXP_MARK_START)) { 1874 target = state; 1875 break; 1876 } 1877 } 1878 } 1879 state = target; 1880 } 1881 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1882 state = ctxt->states[statenr]; 1883 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 1884#ifdef DEBUG_REGEXP_GRAPH 1885 printf("Removed unreachable state %d\n", statenr); 1886#endif 1887 xmlRegFreeState(state); 1888 ctxt->states[statenr] = NULL; 1889 } 1890 } 1891 1892} 1893 1894static int 1895xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 1896 int ret = 0; 1897 1898 if ((range1->type == XML_REGEXP_RANGES) || 1899 (range2->type == XML_REGEXP_RANGES) || 1900 (range2->type == XML_REGEXP_SUBREG) || 1901 (range1->type == XML_REGEXP_SUBREG) || 1902 (range1->type == XML_REGEXP_STRING) || 1903 (range2->type == XML_REGEXP_STRING)) 1904 return(-1); 1905 1906 /* put them in order */ 1907 if (range1->type > range2->type) { 1908 xmlRegRangePtr tmp; 1909 1910 tmp = range1; 1911 range1 = range2; 1912 range2 = tmp; 1913 } 1914 if ((range1->type == XML_REGEXP_ANYCHAR) || 1915 (range2->type == XML_REGEXP_ANYCHAR)) { 1916 ret = 1; 1917 } else if ((range1->type == XML_REGEXP_EPSILON) || 1918 (range2->type == XML_REGEXP_EPSILON)) { 1919 return(0); 1920 } else if (range1->type == range2->type) { 1921 if ((range1->type != XML_REGEXP_CHARVAL) || 1922 (range1->end < range2->start) || 1923 (range2->end < range1->start)) 1924 ret = 1; 1925 else 1926 ret = 0; 1927 } else if (range1->type == XML_REGEXP_CHARVAL) { 1928 int codepoint; 1929 int neg = 0; 1930 1931 /* 1932 * just check all codepoints in the range for acceptance, 1933 * this is usually way cheaper since done only once at 1934 * compilation than testing over and over at runtime or 1935 * pushing too many states when evaluating. 1936 */ 1937 if (((range1->neg == 0) && (range2->neg != 0)) || 1938 ((range1->neg != 0) && (range2->neg == 0))) 1939 neg = 1; 1940 1941 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 1942 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 1943 0, range2->start, range2->end, 1944 range2->blockName); 1945 if (ret < 0) 1946 return(-1); 1947 if (((neg == 1) && (ret == 0)) || 1948 ((neg == 0) && (ret == 1))) 1949 return(1); 1950 } 1951 return(0); 1952 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 1953 (range2->type == XML_REGEXP_BLOCK_NAME)) { 1954 if (range1->type == range2->type) { 1955 ret = xmlStrEqual(range1->blockName, range2->blockName); 1956 } else { 1957 /* 1958 * comparing a block range with anything else is way 1959 * too costly, and maintining the table is like too much 1960 * memory too, so let's force the automata to save state 1961 * here. 1962 */ 1963 return(1); 1964 } 1965 } else if ((range1->type < XML_REGEXP_LETTER) || 1966 (range2->type < XML_REGEXP_LETTER)) { 1967 if ((range1->type == XML_REGEXP_ANYSPACE) && 1968 (range2->type == XML_REGEXP_NOTSPACE)) 1969 ret = 0; 1970 else if ((range1->type == XML_REGEXP_INITNAME) && 1971 (range2->type == XML_REGEXP_NOTINITNAME)) 1972 ret = 0; 1973 else if ((range1->type == XML_REGEXP_NAMECHAR) && 1974 (range2->type == XML_REGEXP_NOTNAMECHAR)) 1975 ret = 0; 1976 else if ((range1->type == XML_REGEXP_DECIMAL) && 1977 (range2->type == XML_REGEXP_NOTDECIMAL)) 1978 ret = 0; 1979 else if ((range1->type == XML_REGEXP_REALCHAR) && 1980 (range2->type == XML_REGEXP_NOTREALCHAR)) 1981 ret = 0; 1982 else { 1983 /* same thing to limit complexity */ 1984 return(1); 1985 } 1986 } else { 1987 ret = 0; 1988 /* range1->type < range2->type here */ 1989 switch (range1->type) { 1990 case XML_REGEXP_LETTER: 1991 /* all disjoint except in the subgroups */ 1992 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 1993 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 1994 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 1995 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 1996 (range2->type == XML_REGEXP_LETTER_OTHERS)) 1997 ret = 1; 1998 break; 1999 case XML_REGEXP_MARK: 2000 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2001 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2002 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2003 ret = 1; 2004 break; 2005 case XML_REGEXP_NUMBER: 2006 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2007 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2008 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2009 ret = 1; 2010 break; 2011 case XML_REGEXP_PUNCT: 2012 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2013 (range2->type == XML_REGEXP_PUNCT_DASH) || 2014 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2015 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2016 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2017 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2018 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2019 ret = 1; 2020 break; 2021 case XML_REGEXP_SEPAR: 2022 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2023 (range2->type == XML_REGEXP_SEPAR_LINE) || 2024 (range2->type == XML_REGEXP_SEPAR_PARA)) 2025 ret = 1; 2026 break; 2027 case XML_REGEXP_SYMBOL: 2028 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2029 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2030 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2031 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2032 ret = 1; 2033 break; 2034 case XML_REGEXP_OTHER: 2035 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2036 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2037 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2038 ret = 1; 2039 break; 2040 default: 2041 if ((range2->type >= XML_REGEXP_LETTER) && 2042 (range2->type < XML_REGEXP_BLOCK_NAME)) 2043 ret = 0; 2044 else { 2045 /* safety net ! */ 2046 return(1); 2047 } 2048 } 2049 } 2050 if (((range1->neg == 0) && (range2->neg != 0)) || 2051 ((range1->neg != 0) && (range2->neg == 0))) 2052 ret = !ret; 2053 return(1); 2054} 2055 2056/** 2057 * xmlFACompareAtoms: 2058 * @atom1: an atom 2059 * @atom2: an atom 2060 * 2061 * Compares two atoms to check whether they intersect in some ways, 2062 * this is used by xmlFAComputesDeterminism only 2063 * 2064 * Returns 1 if yes and 0 otherwise 2065 */ 2066static int 2067xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2068 int ret; 2069 2070 if (atom1 == atom2) 2071 return(1); 2072 if ((atom1 == NULL) || (atom2 == NULL)) 2073 return(0); 2074 2075 if ((atom1->type == XML_REGEXP_RANGES) && 2076 (atom2->type == XML_REGEXP_CHARVAL)) { 2077 } else if ((atom1->type == XML_REGEXP_CHARVAL) && 2078 (atom2->type == XML_REGEXP_RANGES)) { 2079 xmlRegAtomPtr tmp; 2080 tmp = atom1; 2081 atom1 = atom2; 2082 atom2 = tmp; 2083 } else if (atom1->type != atom2->type) { 2084 return(0); 2085 } 2086 switch (atom1->type) { 2087 case XML_REGEXP_STRING: 2088 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2089 (xmlChar *)atom2->valuep); 2090 break; 2091 case XML_REGEXP_EPSILON: 2092 goto not_determinist; 2093 case XML_REGEXP_CHARVAL: 2094 ret = (atom1->codepoint == atom2->codepoint); 2095 break; 2096 case XML_REGEXP_RANGES: 2097 if (atom2->type == XML_REGEXP_CHARVAL) { 2098 ret = xmlRegCheckCharacter(atom1, atom2->codepoint); 2099 if (ret < 0) 2100 return(-1); 2101 break; 2102 } else { 2103 int i, j, res; 2104 xmlRegRangePtr r1, r2; 2105 2106 /* 2107 * need to check that none of the ranges eventually matches 2108 */ 2109 for (i = 0;i < atom1->nbRanges;i++) { 2110 for (j = 0;j < atom2->nbRanges;j++) { 2111 r1 = atom1->ranges[i]; 2112 r2 = atom2->ranges[j]; 2113 res = xmlFACompareRanges(r1, r2); 2114 if (res == 1) { 2115 ret = 1; 2116 goto done; 2117 } 2118 } 2119 } 2120 ret = 0; 2121 } 2122 break; 2123 default: 2124 goto not_determinist; 2125 } 2126done: 2127 if (atom1->neg != atom2->neg) { 2128 ret = !ret; 2129 } 2130 if (ret == 0) 2131 return(0); 2132not_determinist: 2133 return(1); 2134} 2135 2136/** 2137 * xmlFARecurseDeterminism: 2138 * @ctxt: a regexp parser context 2139 * 2140 * Check whether the associated regexp is determinist, 2141 * should be called after xmlFAEliminateEpsilonTransitions() 2142 * 2143 */ 2144static int 2145xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2146 int to, xmlRegAtomPtr atom) { 2147 int ret = 1; 2148 int res; 2149 int transnr, nbTrans; 2150 xmlRegTransPtr t1; 2151 2152 if (state == NULL) 2153 return(ret); 2154 /* 2155 * don't recurse on transitions potentially added in the course of 2156 * the elimination. 2157 */ 2158 nbTrans = state->nbTrans; 2159 for (transnr = 0;transnr < nbTrans;transnr++) { 2160 t1 = &(state->trans[transnr]); 2161 /* 2162 * check transitions conflicting with the one looked at 2163 */ 2164 if (t1->atom == NULL) { 2165 if (t1->to == -1) 2166 continue; 2167 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2168 to, atom); 2169 if (res == 0) { 2170 ret = 0; 2171 /* t1->nd = 1; */ 2172 } 2173 continue; 2174 } 2175 if (t1->to != to) 2176 continue; 2177 if (xmlFACompareAtoms(t1->atom, atom)) { 2178 ret = 0; 2179 /* mark the transition as non-deterministic */ 2180 t1->nd = 1; 2181 } 2182 } 2183 return(ret); 2184} 2185 2186/** 2187 * xmlFAComputesDeterminism: 2188 * @ctxt: a regexp parser context 2189 * 2190 * Check whether the associated regexp is determinist, 2191 * should be called after xmlFAEliminateEpsilonTransitions() 2192 * 2193 */ 2194static int 2195xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2196 int statenr, transnr; 2197 xmlRegStatePtr state; 2198 xmlRegTransPtr t1, t2, last; 2199 int i; 2200 int ret = 1; 2201 2202#ifdef DEBUG_REGEXP_GRAPH 2203 printf("xmlFAComputesDeterminism\n"); 2204 xmlRegPrintCtxt(stdout, ctxt); 2205#endif 2206 if (ctxt->determinist != -1) 2207 return(ctxt->determinist); 2208 2209 /* 2210 * First cleanup the automata removing cancelled transitions 2211 */ 2212 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2213 state = ctxt->states[statenr]; 2214 if (state == NULL) 2215 continue; 2216 if (state->nbTrans < 2) 2217 continue; 2218 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2219 t1 = &(state->trans[transnr]); 2220 /* 2221 * Determinism checks in case of counted or all transitions 2222 * will have to be handled separately 2223 */ 2224 if (t1->atom == NULL) { 2225 /* t1->nd = 1; */ 2226 continue; 2227 } 2228 if (t1->to == -1) /* eliminated */ 2229 continue; 2230 for (i = 0;i < transnr;i++) { 2231 t2 = &(state->trans[i]); 2232 if (t2->to == -1) /* eliminated */ 2233 continue; 2234 if (t2->atom != NULL) { 2235 if (t1->to == t2->to) { 2236 if (xmlFACompareAtoms(t1->atom, t2->atom)) 2237 t2->to = -1; /* eliminated */ 2238 } 2239 } 2240 } 2241 } 2242 } 2243 2244 /* 2245 * Check for all states that there aren't 2 transitions 2246 * with the same atom and a different target. 2247 */ 2248 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2249 state = ctxt->states[statenr]; 2250 if (state == NULL) 2251 continue; 2252 if (state->nbTrans < 2) 2253 continue; 2254 last = NULL; 2255 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2256 t1 = &(state->trans[transnr]); 2257 /* 2258 * Determinism checks in case of counted or all transitions 2259 * will have to be handled separately 2260 */ 2261 if (t1->atom == NULL) { 2262 continue; 2263 } 2264 if (t1->to == -1) /* eliminated */ 2265 continue; 2266 for (i = 0;i < transnr;i++) { 2267 t2 = &(state->trans[i]); 2268 if (t2->to == -1) /* eliminated */ 2269 continue; 2270 if (t2->atom != NULL) { 2271 /* not determinist ! */ 2272 if (xmlFACompareAtoms(t1->atom, t2->atom)) { 2273 ret = 0; 2274 /* mark the transitions as non-deterministic ones */ 2275 t1->nd = 1; 2276 t2->nd = 1; 2277 last = t1; 2278 } 2279 } else if (t1->to != -1) { 2280 /* 2281 * do the closure in case of remaining specific 2282 * epsilon transitions like choices or all 2283 */ 2284 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2285 t2->to, t2->atom); 2286 /* don't shortcut the computation so all non deterministic 2287 transition get marked down 2288 if (ret == 0) 2289 return(0); 2290 */ 2291 if (ret == 0) { 2292 t1->nd = 1; 2293 /* t2->nd = 1; */ 2294 last = t1; 2295 } 2296 } 2297 } 2298 /* don't shortcut the computation so all non deterministic 2299 transition get marked down 2300 if (ret == 0) 2301 break; */ 2302 } 2303 2304 /* 2305 * mark specifically the last non-deterministic transition 2306 * from a state since there is no need to set-up rollback 2307 * from it 2308 */ 2309 if (last != NULL) { 2310 last->nd = 2; 2311 } 2312 2313 /* don't shortcut the computation so all non deterministic 2314 transition get marked down 2315 if (ret == 0) 2316 break; */ 2317 } 2318 2319 ctxt->determinist = ret; 2320 return(ret); 2321} 2322 2323/************************************************************************ 2324 * * 2325 * Routines to check input against transition atoms * 2326 * * 2327 ************************************************************************/ 2328 2329static int 2330xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2331 int start, int end, const xmlChar *blockName) { 2332 int ret = 0; 2333 2334 switch (type) { 2335 case XML_REGEXP_STRING: 2336 case XML_REGEXP_SUBREG: 2337 case XML_REGEXP_RANGES: 2338 case XML_REGEXP_EPSILON: 2339 return(-1); 2340 case XML_REGEXP_ANYCHAR: 2341 ret = ((codepoint != '\n') && (codepoint != '\r')); 2342 break; 2343 case XML_REGEXP_CHARVAL: 2344 ret = ((codepoint >= start) && (codepoint <= end)); 2345 break; 2346 case XML_REGEXP_NOTSPACE: 2347 neg = !neg; 2348 case XML_REGEXP_ANYSPACE: 2349 ret = ((codepoint == '\n') || (codepoint == '\r') || 2350 (codepoint == '\t') || (codepoint == ' ')); 2351 break; 2352 case XML_REGEXP_NOTINITNAME: 2353 neg = !neg; 2354 case XML_REGEXP_INITNAME: 2355 ret = (IS_LETTER(codepoint) || 2356 (codepoint == '_') || (codepoint == ':')); 2357 break; 2358 case XML_REGEXP_NOTNAMECHAR: 2359 neg = !neg; 2360 case XML_REGEXP_NAMECHAR: 2361 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2362 (codepoint == '.') || (codepoint == '-') || 2363 (codepoint == '_') || (codepoint == ':') || 2364 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2365 break; 2366 case XML_REGEXP_NOTDECIMAL: 2367 neg = !neg; 2368 case XML_REGEXP_DECIMAL: 2369 ret = xmlUCSIsCatNd(codepoint); 2370 break; 2371 case XML_REGEXP_REALCHAR: 2372 neg = !neg; 2373 case XML_REGEXP_NOTREALCHAR: 2374 ret = xmlUCSIsCatP(codepoint); 2375 if (ret == 0) 2376 ret = xmlUCSIsCatZ(codepoint); 2377 if (ret == 0) 2378 ret = xmlUCSIsCatC(codepoint); 2379 break; 2380 case XML_REGEXP_LETTER: 2381 ret = xmlUCSIsCatL(codepoint); 2382 break; 2383 case XML_REGEXP_LETTER_UPPERCASE: 2384 ret = xmlUCSIsCatLu(codepoint); 2385 break; 2386 case XML_REGEXP_LETTER_LOWERCASE: 2387 ret = xmlUCSIsCatLl(codepoint); 2388 break; 2389 case XML_REGEXP_LETTER_TITLECASE: 2390 ret = xmlUCSIsCatLt(codepoint); 2391 break; 2392 case XML_REGEXP_LETTER_MODIFIER: 2393 ret = xmlUCSIsCatLm(codepoint); 2394 break; 2395 case XML_REGEXP_LETTER_OTHERS: 2396 ret = xmlUCSIsCatLo(codepoint); 2397 break; 2398 case XML_REGEXP_MARK: 2399 ret = xmlUCSIsCatM(codepoint); 2400 break; 2401 case XML_REGEXP_MARK_NONSPACING: 2402 ret = xmlUCSIsCatMn(codepoint); 2403 break; 2404 case XML_REGEXP_MARK_SPACECOMBINING: 2405 ret = xmlUCSIsCatMc(codepoint); 2406 break; 2407 case XML_REGEXP_MARK_ENCLOSING: 2408 ret = xmlUCSIsCatMe(codepoint); 2409 break; 2410 case XML_REGEXP_NUMBER: 2411 ret = xmlUCSIsCatN(codepoint); 2412 break; 2413 case XML_REGEXP_NUMBER_DECIMAL: 2414 ret = xmlUCSIsCatNd(codepoint); 2415 break; 2416 case XML_REGEXP_NUMBER_LETTER: 2417 ret = xmlUCSIsCatNl(codepoint); 2418 break; 2419 case XML_REGEXP_NUMBER_OTHERS: 2420 ret = xmlUCSIsCatNo(codepoint); 2421 break; 2422 case XML_REGEXP_PUNCT: 2423 ret = xmlUCSIsCatP(codepoint); 2424 break; 2425 case XML_REGEXP_PUNCT_CONNECTOR: 2426 ret = xmlUCSIsCatPc(codepoint); 2427 break; 2428 case XML_REGEXP_PUNCT_DASH: 2429 ret = xmlUCSIsCatPd(codepoint); 2430 break; 2431 case XML_REGEXP_PUNCT_OPEN: 2432 ret = xmlUCSIsCatPs(codepoint); 2433 break; 2434 case XML_REGEXP_PUNCT_CLOSE: 2435 ret = xmlUCSIsCatPe(codepoint); 2436 break; 2437 case XML_REGEXP_PUNCT_INITQUOTE: 2438 ret = xmlUCSIsCatPi(codepoint); 2439 break; 2440 case XML_REGEXP_PUNCT_FINQUOTE: 2441 ret = xmlUCSIsCatPf(codepoint); 2442 break; 2443 case XML_REGEXP_PUNCT_OTHERS: 2444 ret = xmlUCSIsCatPo(codepoint); 2445 break; 2446 case XML_REGEXP_SEPAR: 2447 ret = xmlUCSIsCatZ(codepoint); 2448 break; 2449 case XML_REGEXP_SEPAR_SPACE: 2450 ret = xmlUCSIsCatZs(codepoint); 2451 break; 2452 case XML_REGEXP_SEPAR_LINE: 2453 ret = xmlUCSIsCatZl(codepoint); 2454 break; 2455 case XML_REGEXP_SEPAR_PARA: 2456 ret = xmlUCSIsCatZp(codepoint); 2457 break; 2458 case XML_REGEXP_SYMBOL: 2459 ret = xmlUCSIsCatS(codepoint); 2460 break; 2461 case XML_REGEXP_SYMBOL_MATH: 2462 ret = xmlUCSIsCatSm(codepoint); 2463 break; 2464 case XML_REGEXP_SYMBOL_CURRENCY: 2465 ret = xmlUCSIsCatSc(codepoint); 2466 break; 2467 case XML_REGEXP_SYMBOL_MODIFIER: 2468 ret = xmlUCSIsCatSk(codepoint); 2469 break; 2470 case XML_REGEXP_SYMBOL_OTHERS: 2471 ret = xmlUCSIsCatSo(codepoint); 2472 break; 2473 case XML_REGEXP_OTHER: 2474 ret = xmlUCSIsCatC(codepoint); 2475 break; 2476 case XML_REGEXP_OTHER_CONTROL: 2477 ret = xmlUCSIsCatCc(codepoint); 2478 break; 2479 case XML_REGEXP_OTHER_FORMAT: 2480 ret = xmlUCSIsCatCf(codepoint); 2481 break; 2482 case XML_REGEXP_OTHER_PRIVATE: 2483 ret = xmlUCSIsCatCo(codepoint); 2484 break; 2485 case XML_REGEXP_OTHER_NA: 2486 /* ret = xmlUCSIsCatCn(codepoint); */ 2487 /* Seems it doesn't exist anymore in recent Unicode releases */ 2488 ret = 0; 2489 break; 2490 case XML_REGEXP_BLOCK_NAME: 2491 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2492 break; 2493 } 2494 if (neg) 2495 return(!ret); 2496 return(ret); 2497} 2498 2499static int 2500xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2501 int i, ret = 0; 2502 xmlRegRangePtr range; 2503 2504 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2505 return(-1); 2506 2507 switch (atom->type) { 2508 case XML_REGEXP_SUBREG: 2509 case XML_REGEXP_EPSILON: 2510 return(-1); 2511 case XML_REGEXP_CHARVAL: 2512 return(codepoint == atom->codepoint); 2513 case XML_REGEXP_RANGES: { 2514 int accept = 0; 2515 2516 for (i = 0;i < atom->nbRanges;i++) { 2517 range = atom->ranges[i]; 2518 if (range->neg == 2) { 2519 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2520 0, range->start, range->end, 2521 range->blockName); 2522 if (ret != 0) 2523 return(0); /* excluded char */ 2524 } else if (range->neg) { 2525 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2526 0, range->start, range->end, 2527 range->blockName); 2528 if (ret == 0) 2529 accept = 1; 2530 else 2531 return(0); 2532 } else { 2533 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2534 0, range->start, range->end, 2535 range->blockName); 2536 if (ret != 0) 2537 accept = 1; /* might still be excluded */ 2538 } 2539 } 2540 return(accept); 2541 } 2542 case XML_REGEXP_STRING: 2543 printf("TODO: XML_REGEXP_STRING\n"); 2544 return(-1); 2545 case XML_REGEXP_ANYCHAR: 2546 case XML_REGEXP_ANYSPACE: 2547 case XML_REGEXP_NOTSPACE: 2548 case XML_REGEXP_INITNAME: 2549 case XML_REGEXP_NOTINITNAME: 2550 case XML_REGEXP_NAMECHAR: 2551 case XML_REGEXP_NOTNAMECHAR: 2552 case XML_REGEXP_DECIMAL: 2553 case XML_REGEXP_NOTDECIMAL: 2554 case XML_REGEXP_REALCHAR: 2555 case XML_REGEXP_NOTREALCHAR: 2556 case XML_REGEXP_LETTER: 2557 case XML_REGEXP_LETTER_UPPERCASE: 2558 case XML_REGEXP_LETTER_LOWERCASE: 2559 case XML_REGEXP_LETTER_TITLECASE: 2560 case XML_REGEXP_LETTER_MODIFIER: 2561 case XML_REGEXP_LETTER_OTHERS: 2562 case XML_REGEXP_MARK: 2563 case XML_REGEXP_MARK_NONSPACING: 2564 case XML_REGEXP_MARK_SPACECOMBINING: 2565 case XML_REGEXP_MARK_ENCLOSING: 2566 case XML_REGEXP_NUMBER: 2567 case XML_REGEXP_NUMBER_DECIMAL: 2568 case XML_REGEXP_NUMBER_LETTER: 2569 case XML_REGEXP_NUMBER_OTHERS: 2570 case XML_REGEXP_PUNCT: 2571 case XML_REGEXP_PUNCT_CONNECTOR: 2572 case XML_REGEXP_PUNCT_DASH: 2573 case XML_REGEXP_PUNCT_OPEN: 2574 case XML_REGEXP_PUNCT_CLOSE: 2575 case XML_REGEXP_PUNCT_INITQUOTE: 2576 case XML_REGEXP_PUNCT_FINQUOTE: 2577 case XML_REGEXP_PUNCT_OTHERS: 2578 case XML_REGEXP_SEPAR: 2579 case XML_REGEXP_SEPAR_SPACE: 2580 case XML_REGEXP_SEPAR_LINE: 2581 case XML_REGEXP_SEPAR_PARA: 2582 case XML_REGEXP_SYMBOL: 2583 case XML_REGEXP_SYMBOL_MATH: 2584 case XML_REGEXP_SYMBOL_CURRENCY: 2585 case XML_REGEXP_SYMBOL_MODIFIER: 2586 case XML_REGEXP_SYMBOL_OTHERS: 2587 case XML_REGEXP_OTHER: 2588 case XML_REGEXP_OTHER_CONTROL: 2589 case XML_REGEXP_OTHER_FORMAT: 2590 case XML_REGEXP_OTHER_PRIVATE: 2591 case XML_REGEXP_OTHER_NA: 2592 case XML_REGEXP_BLOCK_NAME: 2593 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 2594 (const xmlChar *)atom->valuep); 2595 if (atom->neg) 2596 ret = !ret; 2597 break; 2598 } 2599 return(ret); 2600} 2601 2602/************************************************************************ 2603 * * 2604 * Saving and restoring state of an execution context * 2605 * * 2606 ************************************************************************/ 2607 2608#ifdef DEBUG_REGEXP_EXEC 2609static void 2610xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 2611 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 2612 if (exec->inputStack != NULL) { 2613 int i; 2614 printf(": "); 2615 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 2616 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 2617 } else { 2618 printf(": %s", &(exec->inputString[exec->index])); 2619 } 2620 printf("\n"); 2621} 2622#endif 2623 2624static void 2625xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 2626#ifdef DEBUG_REGEXP_EXEC 2627 printf("saving "); 2628 exec->transno++; 2629 xmlFARegDebugExec(exec); 2630 exec->transno--; 2631#endif 2632#ifdef MAX_PUSH 2633 if (exec->nbPush > MAX_PUSH) { 2634 return; 2635 } 2636 exec->nbPush++; 2637#endif 2638 2639 if (exec->maxRollbacks == 0) { 2640 exec->maxRollbacks = 4; 2641 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 2642 sizeof(xmlRegExecRollback)); 2643 if (exec->rollbacks == NULL) { 2644 xmlRegexpErrMemory(NULL, "saving regexp"); 2645 exec->maxRollbacks = 0; 2646 return; 2647 } 2648 memset(exec->rollbacks, 0, 2649 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2650 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 2651 xmlRegExecRollback *tmp; 2652 int len = exec->maxRollbacks; 2653 2654 exec->maxRollbacks *= 2; 2655 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 2656 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2657 if (tmp == NULL) { 2658 xmlRegexpErrMemory(NULL, "saving regexp"); 2659 exec->maxRollbacks /= 2; 2660 return; 2661 } 2662 exec->rollbacks = tmp; 2663 tmp = &exec->rollbacks[len]; 2664 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2665 } 2666 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2667 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2668 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2669 if (exec->comp->nbCounters > 0) { 2670 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2671 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2672 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2673 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2674 xmlRegexpErrMemory(NULL, "saving regexp"); 2675 exec->status = -5; 2676 return; 2677 } 2678 } 2679 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2680 exec->comp->nbCounters * sizeof(int)); 2681 } 2682 exec->nbRollbacks++; 2683} 2684 2685static void 2686xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2687 if (exec->nbRollbacks <= 0) { 2688 exec->status = -1; 2689#ifdef DEBUG_REGEXP_EXEC 2690 printf("rollback failed on empty stack\n"); 2691#endif 2692 return; 2693 } 2694 exec->nbRollbacks--; 2695 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2696 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2697 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2698 if (exec->comp->nbCounters > 0) { 2699 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2700 fprintf(stderr, "exec save: allocation failed"); 2701 exec->status = -6; 2702 return; 2703 } 2704 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2705 exec->comp->nbCounters * sizeof(int)); 2706 } 2707 2708#ifdef DEBUG_REGEXP_EXEC 2709 printf("restored "); 2710 xmlFARegDebugExec(exec); 2711#endif 2712} 2713 2714/************************************************************************ 2715 * * 2716 * Verifier, running an input against a compiled regexp * 2717 * * 2718 ************************************************************************/ 2719 2720static int 2721xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2722 xmlRegExecCtxt execval; 2723 xmlRegExecCtxtPtr exec = &execval; 2724 int ret, codepoint = 0, len, deter; 2725 2726 exec->inputString = content; 2727 exec->index = 0; 2728 exec->nbPush = 0; 2729 exec->determinist = 1; 2730 exec->maxRollbacks = 0; 2731 exec->nbRollbacks = 0; 2732 exec->rollbacks = NULL; 2733 exec->status = 0; 2734 exec->comp = comp; 2735 exec->state = comp->states[0]; 2736 exec->transno = 0; 2737 exec->transcount = 0; 2738 exec->inputStack = NULL; 2739 exec->inputStackMax = 0; 2740 if (comp->nbCounters > 0) { 2741 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2742 if (exec->counts == NULL) { 2743 xmlRegexpErrMemory(NULL, "running regexp"); 2744 return(-1); 2745 } 2746 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2747 } else 2748 exec->counts = NULL; 2749 while ((exec->status == 0) && 2750 ((exec->inputString[exec->index] != 0) || 2751 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2752 xmlRegTransPtr trans; 2753 xmlRegAtomPtr atom; 2754 2755 /* 2756 * If end of input on non-terminal state, rollback, however we may 2757 * still have epsilon like transition for counted transitions 2758 * on counters, in that case don't break too early. Additionally, 2759 * if we are working on a range like "AB{0,2}", where B is not present, 2760 * we don't want to break. 2761 */ 2762 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 2763 /* 2764 * if there is a transition, we must check if 2765 * atom allows minOccurs of 0 2766 */ 2767 if (exec->transno < exec->state->nbTrans) { 2768 trans = &exec->state->trans[exec->transno]; 2769 if (trans->to >=0) { 2770 atom = trans->atom; 2771 if (!((atom->min == 0) && (atom->max > 0))) 2772 goto rollback; 2773 } 2774 } else 2775 goto rollback; 2776 } 2777 2778 exec->transcount = 0; 2779 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2780 trans = &exec->state->trans[exec->transno]; 2781 if (trans->to < 0) 2782 continue; 2783 atom = trans->atom; 2784 ret = 0; 2785 deter = 1; 2786 if (trans->count >= 0) { 2787 int count; 2788 xmlRegCounterPtr counter; 2789 2790 /* 2791 * A counted transition. 2792 */ 2793 2794 count = exec->counts[trans->count]; 2795 counter = &exec->comp->counters[trans->count]; 2796#ifdef DEBUG_REGEXP_EXEC 2797 printf("testing count %d: val %d, min %d, max %d\n", 2798 trans->count, count, counter->min, counter->max); 2799#endif 2800 ret = ((count >= counter->min) && (count <= counter->max)); 2801 if ((ret) && (counter->min != counter->max)) 2802 deter = 0; 2803 } else if (atom == NULL) { 2804 fprintf(stderr, "epsilon transition left at runtime\n"); 2805 exec->status = -2; 2806 break; 2807 } else if (exec->inputString[exec->index] != 0) { 2808 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2809 ret = xmlRegCheckCharacter(atom, codepoint); 2810 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 2811 xmlRegStatePtr to = comp->states[trans->to]; 2812 2813 /* 2814 * this is a multiple input sequence 2815 */ 2816 if (exec->state->nbTrans > exec->transno + 1) { 2817 xmlFARegExecSave(exec); 2818 } 2819 exec->transcount = 1; 2820 do { 2821 /* 2822 * Try to progress as much as possible on the input 2823 */ 2824 if (exec->transcount == atom->max) { 2825 break; 2826 } 2827 exec->index += len; 2828 /* 2829 * End of input: stop here 2830 */ 2831 if (exec->inputString[exec->index] == 0) { 2832 exec->index -= len; 2833 break; 2834 } 2835 if (exec->transcount >= atom->min) { 2836 int transno = exec->transno; 2837 xmlRegStatePtr state = exec->state; 2838 2839 /* 2840 * The transition is acceptable save it 2841 */ 2842 exec->transno = -1; /* trick */ 2843 exec->state = to; 2844 xmlFARegExecSave(exec); 2845 exec->transno = transno; 2846 exec->state = state; 2847 } 2848 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2849 len); 2850 ret = xmlRegCheckCharacter(atom, codepoint); 2851 exec->transcount++; 2852 } while (ret == 1); 2853 if (exec->transcount < atom->min) 2854 ret = 0; 2855 2856 /* 2857 * If the last check failed but one transition was found 2858 * possible, rollback 2859 */ 2860 if (ret < 0) 2861 ret = 0; 2862 if (ret == 0) { 2863 goto rollback; 2864 } 2865 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 2866 /* 2867 * we don't match on the codepoint, but minOccurs of 0 2868 * says that's ok. Setting len to 0 inhibits stepping 2869 * over the codepoint. 2870 */ 2871 exec->transcount = 1; 2872 len = 0; 2873 ret = 1; 2874 } 2875 } else if ((atom->min == 0) && (atom->max > 0)) { 2876 /* another spot to match when minOccurs is 0 */ 2877 exec->transcount = 1; 2878 len = 0; 2879 ret = 1; 2880 } 2881 if (ret == 1) { 2882 if ((trans->nd == 1) || 2883 ((trans->count >= 0) && (deter == 0) && 2884 (exec->state->nbTrans > exec->transno + 1))) { 2885#ifdef DEBUG_REGEXP_EXEC 2886 if (trans->nd == 1) 2887 printf("Saving on nd transition atom %d for %c at %d\n", 2888 trans->atom->no, codepoint, exec->index); 2889 else 2890 printf("Saving on counted transition count %d for %c at %d\n", 2891 trans->count, codepoint, exec->index); 2892#endif 2893 xmlFARegExecSave(exec); 2894 } 2895 if (trans->counter >= 0) { 2896#ifdef DEBUG_REGEXP_EXEC 2897 printf("Increasing count %d\n", trans->counter); 2898#endif 2899 exec->counts[trans->counter]++; 2900 } 2901 if ((trans->count >= 0) && 2902 (trans->count < REGEXP_ALL_COUNTER)) { 2903#ifdef DEBUG_REGEXP_EXEC 2904 printf("resetting count %d on transition\n", 2905 trans->count); 2906#endif 2907 exec->counts[trans->count] = 0; 2908 } 2909#ifdef DEBUG_REGEXP_EXEC 2910 printf("entering state %d\n", trans->to); 2911#endif 2912 exec->state = comp->states[trans->to]; 2913 exec->transno = 0; 2914 if (trans->atom != NULL) { 2915 exec->index += len; 2916 } 2917 goto progress; 2918 } else if (ret < 0) { 2919 exec->status = -4; 2920 break; 2921 } 2922 } 2923 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2924rollback: 2925 /* 2926 * Failed to find a way out 2927 */ 2928 exec->determinist = 0; 2929#ifdef DEBUG_REGEXP_EXEC 2930 printf("rollback from state %d on %d:%c\n", exec->state->no, 2931 codepoint,codepoint); 2932#endif 2933 xmlFARegExecRollBack(exec); 2934 } 2935progress: 2936 continue; 2937 } 2938 if (exec->rollbacks != NULL) { 2939 if (exec->counts != NULL) { 2940 int i; 2941 2942 for (i = 0;i < exec->maxRollbacks;i++) 2943 if (exec->rollbacks[i].counts != NULL) 2944 xmlFree(exec->rollbacks[i].counts); 2945 } 2946 xmlFree(exec->rollbacks); 2947 } 2948 if (exec->counts != NULL) 2949 xmlFree(exec->counts); 2950 if (exec->status == 0) 2951 return(1); 2952 if (exec->status == -1) { 2953 if (exec->nbPush > MAX_PUSH) 2954 return(-1); 2955 return(0); 2956 } 2957 return(exec->status); 2958} 2959 2960/************************************************************************ 2961 * * 2962 * Progressive interface to the verifier one atom at a time * 2963 * * 2964 ************************************************************************/ 2965#ifdef DEBUG_ERR 2966static void testerr(xmlRegExecCtxtPtr exec); 2967#endif 2968 2969/** 2970 * xmlRegNewExecCtxt: 2971 * @comp: a precompiled regular expression 2972 * @callback: a callback function used for handling progresses in the 2973 * automata matching phase 2974 * @data: the context data associated to the callback in this context 2975 * 2976 * Build a context used for progressive evaluation of a regexp. 2977 * 2978 * Returns the new context 2979 */ 2980xmlRegExecCtxtPtr 2981xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 2982 xmlRegExecCtxtPtr exec; 2983 2984 if (comp == NULL) 2985 return(NULL); 2986 if ((comp->compact == NULL) && (comp->states == NULL)) 2987 return(NULL); 2988 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 2989 if (exec == NULL) { 2990 xmlRegexpErrMemory(NULL, "creating execution context"); 2991 return(NULL); 2992 } 2993 memset(exec, 0, sizeof(xmlRegExecCtxt)); 2994 exec->inputString = NULL; 2995 exec->index = 0; 2996 exec->determinist = 1; 2997 exec->maxRollbacks = 0; 2998 exec->nbRollbacks = 0; 2999 exec->rollbacks = NULL; 3000 exec->status = 0; 3001 exec->comp = comp; 3002 if (comp->compact == NULL) 3003 exec->state = comp->states[0]; 3004 exec->transno = 0; 3005 exec->transcount = 0; 3006 exec->callback = callback; 3007 exec->data = data; 3008 if (comp->nbCounters > 0) { 3009 /* 3010 * For error handling, exec->counts is allocated twice the size 3011 * the second half is used to store the data in case of rollback 3012 */ 3013 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3014 * 2); 3015 if (exec->counts == NULL) { 3016 xmlRegexpErrMemory(NULL, "creating execution context"); 3017 xmlFree(exec); 3018 return(NULL); 3019 } 3020 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3021 exec->errCounts = &exec->counts[comp->nbCounters]; 3022 } else { 3023 exec->counts = NULL; 3024 exec->errCounts = NULL; 3025 } 3026 exec->inputStackMax = 0; 3027 exec->inputStackNr = 0; 3028 exec->inputStack = NULL; 3029 exec->errStateNo = -1; 3030 exec->errString = NULL; 3031 exec->nbPush = 0; 3032 return(exec); 3033} 3034 3035/** 3036 * xmlRegFreeExecCtxt: 3037 * @exec: a regular expression evaulation context 3038 * 3039 * Free the structures associated to a regular expression evaulation context. 3040 */ 3041void 3042xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3043 if (exec == NULL) 3044 return; 3045 3046 if (exec->rollbacks != NULL) { 3047 if (exec->counts != NULL) { 3048 int i; 3049 3050 for (i = 0;i < exec->maxRollbacks;i++) 3051 if (exec->rollbacks[i].counts != NULL) 3052 xmlFree(exec->rollbacks[i].counts); 3053 } 3054 xmlFree(exec->rollbacks); 3055 } 3056 if (exec->counts != NULL) 3057 xmlFree(exec->counts); 3058 if (exec->inputStack != NULL) { 3059 int i; 3060 3061 for (i = 0;i < exec->inputStackNr;i++) { 3062 if (exec->inputStack[i].value != NULL) 3063 xmlFree(exec->inputStack[i].value); 3064 } 3065 xmlFree(exec->inputStack); 3066 } 3067 if (exec->errString != NULL) 3068 xmlFree(exec->errString); 3069 xmlFree(exec); 3070} 3071 3072static void 3073xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3074 void *data) { 3075#ifdef DEBUG_PUSH 3076 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3077#endif 3078 if (exec->inputStackMax == 0) { 3079 exec->inputStackMax = 4; 3080 exec->inputStack = (xmlRegInputTokenPtr) 3081 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3082 if (exec->inputStack == NULL) { 3083 xmlRegexpErrMemory(NULL, "pushing input string"); 3084 exec->inputStackMax = 0; 3085 return; 3086 } 3087 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3088 xmlRegInputTokenPtr tmp; 3089 3090 exec->inputStackMax *= 2; 3091 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3092 exec->inputStackMax * sizeof(xmlRegInputToken)); 3093 if (tmp == NULL) { 3094 xmlRegexpErrMemory(NULL, "pushing input string"); 3095 exec->inputStackMax /= 2; 3096 return; 3097 } 3098 exec->inputStack = tmp; 3099 } 3100 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3101 exec->inputStack[exec->inputStackNr].data = data; 3102 exec->inputStackNr++; 3103 exec->inputStack[exec->inputStackNr].value = NULL; 3104 exec->inputStack[exec->inputStackNr].data = NULL; 3105} 3106 3107/** 3108 * xmlRegStrEqualWildcard: 3109 * @expStr: the string to be evaluated 3110 * @valStr: the validation string 3111 * 3112 * Checks if both strings are equal or have the same content. "*" 3113 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3114 * substrings in both @expStr and @valStr. 3115 * 3116 * Returns 1 if the comparison is satisfied and the number of substrings 3117 * is equal, 0 otherwise. 3118 */ 3119 3120static int 3121xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3122 if (expStr == valStr) return(1); 3123 if (expStr == NULL) return(0); 3124 if (valStr == NULL) return(0); 3125 do { 3126 /* 3127 * Eval if we have a wildcard for the current item. 3128 */ 3129 if (*expStr != *valStr) { 3130 /* if one of them starts with a wildcard make valStr be it */ 3131 if (*valStr == '*') { 3132 const xmlChar *tmp; 3133 3134 tmp = valStr; 3135 valStr = expStr; 3136 expStr = tmp; 3137 } 3138 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3139 do { 3140 if (*valStr == XML_REG_STRING_SEPARATOR) 3141 break; 3142 valStr++; 3143 } while (*valStr != 0); 3144 continue; 3145 } else 3146 return(0); 3147 } 3148 expStr++; 3149 valStr++; 3150 } while (*valStr != 0); 3151 if (*expStr != 0) 3152 return (0); 3153 else 3154 return (1); 3155} 3156 3157/** 3158 * xmlRegCompactPushString: 3159 * @exec: a regexp execution context 3160 * @comp: the precompiled exec with a compact table 3161 * @value: a string token input 3162 * @data: data associated to the token to reuse in callbacks 3163 * 3164 * Push one input token in the execution context 3165 * 3166 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3167 * a negative value in case of error. 3168 */ 3169static int 3170xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3171 xmlRegexpPtr comp, 3172 const xmlChar *value, 3173 void *data) { 3174 int state = exec->index; 3175 int i, target; 3176 3177 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3178 return(-1); 3179 3180 if (value == NULL) { 3181 /* 3182 * are we at a final state ? 3183 */ 3184 if (comp->compact[state * (comp->nbstrings + 1)] == 3185 XML_REGEXP_FINAL_STATE) 3186 return(1); 3187 return(0); 3188 } 3189 3190#ifdef DEBUG_PUSH 3191 printf("value pushed: %s\n", value); 3192#endif 3193 3194 /* 3195 * Examine all outside transitions from current state 3196 */ 3197 for (i = 0;i < comp->nbstrings;i++) { 3198 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3199 if ((target > 0) && (target <= comp->nbstates)) { 3200 target--; /* to avoid 0 */ 3201 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3202 exec->index = target; 3203 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3204 exec->callback(exec->data, value, 3205 comp->transdata[state * comp->nbstrings + i], data); 3206 } 3207#ifdef DEBUG_PUSH 3208 printf("entering state %d\n", target); 3209#endif 3210 if (comp->compact[target * (comp->nbstrings + 1)] == 3211 XML_REGEXP_SINK_STATE) 3212 goto error; 3213 3214 if (comp->compact[target * (comp->nbstrings + 1)] == 3215 XML_REGEXP_FINAL_STATE) 3216 return(1); 3217 return(0); 3218 } 3219 } 3220 } 3221 /* 3222 * Failed to find an exit transition out from current state for the 3223 * current token 3224 */ 3225#ifdef DEBUG_PUSH 3226 printf("failed to find a transition for %s on state %d\n", value, state); 3227#endif 3228error: 3229 if (exec->errString != NULL) 3230 xmlFree(exec->errString); 3231 exec->errString = xmlStrdup(value); 3232 exec->errStateNo = state; 3233 exec->status = -1; 3234#ifdef DEBUG_ERR 3235 testerr(exec); 3236#endif 3237 return(-1); 3238} 3239 3240/** 3241 * xmlRegExecPushStringInternal: 3242 * @exec: a regexp execution context or NULL to indicate the end 3243 * @value: a string token input 3244 * @data: data associated to the token to reuse in callbacks 3245 * @compound: value was assembled from 2 strings 3246 * 3247 * Push one input token in the execution context 3248 * 3249 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3250 * a negative value in case of error. 3251 */ 3252static int 3253xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3254 void *data, int compound) { 3255 xmlRegTransPtr trans; 3256 xmlRegAtomPtr atom; 3257 int ret; 3258 int final = 0; 3259 int progress = 1; 3260 3261 if (exec == NULL) 3262 return(-1); 3263 if (exec->comp == NULL) 3264 return(-1); 3265 if (exec->status != 0) 3266 return(exec->status); 3267 3268 if (exec->comp->compact != NULL) 3269 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3270 3271 if (value == NULL) { 3272 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3273 return(1); 3274 final = 1; 3275 } 3276 3277#ifdef DEBUG_PUSH 3278 printf("value pushed: %s\n", value); 3279#endif 3280 /* 3281 * If we have an active rollback stack push the new value there 3282 * and get back to where we were left 3283 */ 3284 if ((value != NULL) && (exec->inputStackNr > 0)) { 3285 xmlFARegExecSaveInputString(exec, value, data); 3286 value = exec->inputStack[exec->index].value; 3287 data = exec->inputStack[exec->index].data; 3288#ifdef DEBUG_PUSH 3289 printf("value loaded: %s\n", value); 3290#endif 3291 } 3292 3293 while ((exec->status == 0) && 3294 ((value != NULL) || 3295 ((final == 1) && 3296 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3297 3298 /* 3299 * End of input on non-terminal state, rollback, however we may 3300 * still have epsilon like transition for counted transitions 3301 * on counters, in that case don't break too early. 3302 */ 3303 if ((value == NULL) && (exec->counts == NULL)) 3304 goto rollback; 3305 3306 exec->transcount = 0; 3307 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3308 trans = &exec->state->trans[exec->transno]; 3309 if (trans->to < 0) 3310 continue; 3311 atom = trans->atom; 3312 ret = 0; 3313 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3314 int i; 3315 int count; 3316 xmlRegTransPtr t; 3317 xmlRegCounterPtr counter; 3318 3319 ret = 0; 3320 3321#ifdef DEBUG_PUSH 3322 printf("testing all lax %d\n", trans->count); 3323#endif 3324 /* 3325 * Check all counted transitions from the current state 3326 */ 3327 if ((value == NULL) && (final)) { 3328 ret = 1; 3329 } else if (value != NULL) { 3330 for (i = 0;i < exec->state->nbTrans;i++) { 3331 t = &exec->state->trans[i]; 3332 if ((t->counter < 0) || (t == trans)) 3333 continue; 3334 counter = &exec->comp->counters[t->counter]; 3335 count = exec->counts[t->counter]; 3336 if ((count < counter->max) && 3337 (t->atom != NULL) && 3338 (xmlStrEqual(value, t->atom->valuep))) { 3339 ret = 0; 3340 break; 3341 } 3342 if ((count >= counter->min) && 3343 (count < counter->max) && 3344 (xmlStrEqual(value, t->atom->valuep))) { 3345 ret = 1; 3346 break; 3347 } 3348 } 3349 } 3350 } else if (trans->count == REGEXP_ALL_COUNTER) { 3351 int i; 3352 int count; 3353 xmlRegTransPtr t; 3354 xmlRegCounterPtr counter; 3355 3356 ret = 1; 3357 3358#ifdef DEBUG_PUSH 3359 printf("testing all %d\n", trans->count); 3360#endif 3361 /* 3362 * Check all counted transitions from the current state 3363 */ 3364 for (i = 0;i < exec->state->nbTrans;i++) { 3365 t = &exec->state->trans[i]; 3366 if ((t->counter < 0) || (t == trans)) 3367 continue; 3368 counter = &exec->comp->counters[t->counter]; 3369 count = exec->counts[t->counter]; 3370 if ((count < counter->min) || (count > counter->max)) { 3371 ret = 0; 3372 break; 3373 } 3374 } 3375 } else if (trans->count >= 0) { 3376 int count; 3377 xmlRegCounterPtr counter; 3378 3379 /* 3380 * A counted transition. 3381 */ 3382 3383 count = exec->counts[trans->count]; 3384 counter = &exec->comp->counters[trans->count]; 3385#ifdef DEBUG_PUSH 3386 printf("testing count %d: val %d, min %d, max %d\n", 3387 trans->count, count, counter->min, counter->max); 3388#endif 3389 ret = ((count >= counter->min) && (count <= counter->max)); 3390 } else if (atom == NULL) { 3391 fprintf(stderr, "epsilon transition left at runtime\n"); 3392 exec->status = -2; 3393 break; 3394 } else if (value != NULL) { 3395 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3396 if (atom->neg) { 3397 ret = !ret; 3398 if (!compound) 3399 ret = 0; 3400 } 3401 if ((ret == 1) && (trans->counter >= 0)) { 3402 xmlRegCounterPtr counter; 3403 int count; 3404 3405 count = exec->counts[trans->counter]; 3406 counter = &exec->comp->counters[trans->counter]; 3407 if (count >= counter->max) 3408 ret = 0; 3409 } 3410 3411 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3412 xmlRegStatePtr to = exec->comp->states[trans->to]; 3413 3414 /* 3415 * this is a multiple input sequence 3416 */ 3417 if (exec->state->nbTrans > exec->transno + 1) { 3418 if (exec->inputStackNr <= 0) { 3419 xmlFARegExecSaveInputString(exec, value, data); 3420 } 3421 xmlFARegExecSave(exec); 3422 } 3423 exec->transcount = 1; 3424 do { 3425 /* 3426 * Try to progress as much as possible on the input 3427 */ 3428 if (exec->transcount == atom->max) { 3429 break; 3430 } 3431 exec->index++; 3432 value = exec->inputStack[exec->index].value; 3433 data = exec->inputStack[exec->index].data; 3434#ifdef DEBUG_PUSH 3435 printf("value loaded: %s\n", value); 3436#endif 3437 3438 /* 3439 * End of input: stop here 3440 */ 3441 if (value == NULL) { 3442 exec->index --; 3443 break; 3444 } 3445 if (exec->transcount >= atom->min) { 3446 int transno = exec->transno; 3447 xmlRegStatePtr state = exec->state; 3448 3449 /* 3450 * The transition is acceptable save it 3451 */ 3452 exec->transno = -1; /* trick */ 3453 exec->state = to; 3454 if (exec->inputStackNr <= 0) { 3455 xmlFARegExecSaveInputString(exec, value, data); 3456 } 3457 xmlFARegExecSave(exec); 3458 exec->transno = transno; 3459 exec->state = state; 3460 } 3461 ret = xmlStrEqual(value, atom->valuep); 3462 exec->transcount++; 3463 } while (ret == 1); 3464 if (exec->transcount < atom->min) 3465 ret = 0; 3466 3467 /* 3468 * If the last check failed but one transition was found 3469 * possible, rollback 3470 */ 3471 if (ret < 0) 3472 ret = 0; 3473 if (ret == 0) { 3474 goto rollback; 3475 } 3476 } 3477 } 3478 if (ret == 1) { 3479 if ((exec->callback != NULL) && (atom != NULL) && 3480 (data != NULL)) { 3481 exec->callback(exec->data, atom->valuep, 3482 atom->data, data); 3483 } 3484 if (exec->state->nbTrans > exec->transno + 1) { 3485 if (exec->inputStackNr <= 0) { 3486 xmlFARegExecSaveInputString(exec, value, data); 3487 } 3488 xmlFARegExecSave(exec); 3489 } 3490 if (trans->counter >= 0) { 3491#ifdef DEBUG_PUSH 3492 printf("Increasing count %d\n", trans->counter); 3493#endif 3494 exec->counts[trans->counter]++; 3495 } 3496 if ((trans->count >= 0) && 3497 (trans->count < REGEXP_ALL_COUNTER)) { 3498#ifdef DEBUG_REGEXP_EXEC 3499 printf("resetting count %d on transition\n", 3500 trans->count); 3501#endif 3502 exec->counts[trans->count] = 0; 3503 } 3504#ifdef DEBUG_PUSH 3505 printf("entering state %d\n", trans->to); 3506#endif 3507 if ((exec->comp->states[trans->to] != NULL) && 3508 (exec->comp->states[trans->to]->type == 3509 XML_REGEXP_SINK_STATE)) { 3510 /* 3511 * entering a sink state, save the current state as error 3512 * state. 3513 */ 3514 if (exec->errString != NULL) 3515 xmlFree(exec->errString); 3516 exec->errString = xmlStrdup(value); 3517 exec->errState = exec->state; 3518 memcpy(exec->errCounts, exec->counts, 3519 exec->comp->nbCounters * sizeof(int)); 3520 } 3521 exec->state = exec->comp->states[trans->to]; 3522 exec->transno = 0; 3523 if (trans->atom != NULL) { 3524 if (exec->inputStack != NULL) { 3525 exec->index++; 3526 if (exec->index < exec->inputStackNr) { 3527 value = exec->inputStack[exec->index].value; 3528 data = exec->inputStack[exec->index].data; 3529#ifdef DEBUG_PUSH 3530 printf("value loaded: %s\n", value); 3531#endif 3532 } else { 3533 value = NULL; 3534 data = NULL; 3535#ifdef DEBUG_PUSH 3536 printf("end of input\n"); 3537#endif 3538 } 3539 } else { 3540 value = NULL; 3541 data = NULL; 3542#ifdef DEBUG_PUSH 3543 printf("end of input\n"); 3544#endif 3545 } 3546 } 3547 goto progress; 3548 } else if (ret < 0) { 3549 exec->status = -4; 3550 break; 3551 } 3552 } 3553 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3554rollback: 3555 /* 3556 * if we didn't yet rollback on the current input 3557 * store the current state as the error state. 3558 */ 3559 if ((progress) && (exec->state != NULL) && 3560 (exec->state->type != XML_REGEXP_SINK_STATE)) { 3561 progress = 0; 3562 if (exec->errString != NULL) 3563 xmlFree(exec->errString); 3564 exec->errString = xmlStrdup(value); 3565 exec->errState = exec->state; 3566 memcpy(exec->errCounts, exec->counts, 3567 exec->comp->nbCounters * sizeof(int)); 3568 } 3569 3570 /* 3571 * Failed to find a way out 3572 */ 3573 exec->determinist = 0; 3574 xmlFARegExecRollBack(exec); 3575 if (exec->status == 0) { 3576 value = exec->inputStack[exec->index].value; 3577 data = exec->inputStack[exec->index].data; 3578#ifdef DEBUG_PUSH 3579 printf("value loaded: %s\n", value); 3580#endif 3581 } 3582 } 3583 continue; 3584progress: 3585 progress = 1; 3586 continue; 3587 } 3588 if (exec->status == 0) { 3589 return(exec->state->type == XML_REGEXP_FINAL_STATE); 3590 } 3591#ifdef DEBUG_ERR 3592 if (exec->status < 0) { 3593 testerr(exec); 3594 } 3595#endif 3596 return(exec->status); 3597} 3598 3599/** 3600 * xmlRegExecPushString: 3601 * @exec: a regexp execution context or NULL to indicate the end 3602 * @value: a string token input 3603 * @data: data associated to the token to reuse in callbacks 3604 * 3605 * Push one input token in the execution context 3606 * 3607 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3608 * a negative value in case of error. 3609 */ 3610int 3611xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3612 void *data) { 3613 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 3614} 3615 3616/** 3617 * xmlRegExecPushString2: 3618 * @exec: a regexp execution context or NULL to indicate the end 3619 * @value: the first string token input 3620 * @value2: the second string token input 3621 * @data: data associated to the token to reuse in callbacks 3622 * 3623 * Push one input token in the execution context 3624 * 3625 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3626 * a negative value in case of error. 3627 */ 3628int 3629xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 3630 const xmlChar *value2, void *data) { 3631 xmlChar buf[150]; 3632 int lenn, lenp, ret; 3633 xmlChar *str; 3634 3635 if (exec == NULL) 3636 return(-1); 3637 if (exec->comp == NULL) 3638 return(-1); 3639 if (exec->status != 0) 3640 return(exec->status); 3641 3642 if (value2 == NULL) 3643 return(xmlRegExecPushString(exec, value, data)); 3644 3645 lenn = strlen((char *) value2); 3646 lenp = strlen((char *) value); 3647 3648 if (150 < lenn + lenp + 2) { 3649 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 3650 if (str == NULL) { 3651 exec->status = -1; 3652 return(-1); 3653 } 3654 } else { 3655 str = buf; 3656 } 3657 memcpy(&str[0], value, lenp); 3658 str[lenp] = XML_REG_STRING_SEPARATOR; 3659 memcpy(&str[lenp + 1], value2, lenn); 3660 str[lenn + lenp + 1] = 0; 3661 3662 if (exec->comp->compact != NULL) 3663 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 3664 else 3665 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 3666 3667 if (str != buf) 3668 xmlFree(buf); 3669 return(ret); 3670} 3671 3672/** 3673 * xmlRegExecGetValues: 3674 * @exec: a regexp execution context 3675 * @err: error extraction or normal one 3676 * @nbval: pointer to the number of accepted values IN/OUT 3677 * @nbneg: return number of negative transitions 3678 * @values: pointer to the array of acceptable values 3679 * @terminal: return value if this was a terminal state 3680 * 3681 * Extract informations from the regexp execution, internal routine to 3682 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 3683 * 3684 * Returns: 0 in case of success or -1 in case of error. 3685 */ 3686static int 3687xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 3688 int *nbval, int *nbneg, 3689 xmlChar **values, int *terminal) { 3690 int maxval; 3691 int nb = 0; 3692 3693 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 3694 (values == NULL) || (*nbval <= 0)) 3695 return(-1); 3696 3697 maxval = *nbval; 3698 *nbval = 0; 3699 *nbneg = 0; 3700 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 3701 xmlRegexpPtr comp; 3702 int target, i, state; 3703 3704 comp = exec->comp; 3705 3706 if (err) { 3707 if (exec->errStateNo == -1) return(-1); 3708 state = exec->errStateNo; 3709 } else { 3710 state = exec->index; 3711 } 3712 if (terminal != NULL) { 3713 if (comp->compact[state * (comp->nbstrings + 1)] == 3714 XML_REGEXP_FINAL_STATE) 3715 *terminal = 1; 3716 else 3717 *terminal = 0; 3718 } 3719 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 3720 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3721 if ((target > 0) && (target <= comp->nbstates) && 3722 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 3723 XML_REGEXP_SINK_STATE)) { 3724 values[nb++] = comp->stringMap[i]; 3725 (*nbval)++; 3726 } 3727 } 3728 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 3729 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3730 if ((target > 0) && (target <= comp->nbstates) && 3731 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 3732 XML_REGEXP_SINK_STATE)) { 3733 values[nb++] = comp->stringMap[i]; 3734 (*nbneg)++; 3735 } 3736 } 3737 } else { 3738 int transno; 3739 xmlRegTransPtr trans; 3740 xmlRegAtomPtr atom; 3741 xmlRegStatePtr state; 3742 3743 if (terminal != NULL) { 3744 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3745 *terminal = 1; 3746 else 3747 *terminal = 0; 3748 } 3749 3750 if (err) { 3751 if (exec->errState == NULL) return(-1); 3752 state = exec->errState; 3753 } else { 3754 if (exec->state == NULL) return(-1); 3755 state = exec->state; 3756 } 3757 for (transno = 0; 3758 (transno < state->nbTrans) && (nb < maxval); 3759 transno++) { 3760 trans = &state->trans[transno]; 3761 if (trans->to < 0) 3762 continue; 3763 atom = trans->atom; 3764 if ((atom == NULL) || (atom->valuep == NULL)) 3765 continue; 3766 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3767 /* this should not be reached but ... */ 3768 TODO; 3769 } else if (trans->count == REGEXP_ALL_COUNTER) { 3770 /* this should not be reached but ... */ 3771 TODO; 3772 } else if (trans->counter >= 0) { 3773 xmlRegCounterPtr counter; 3774 int count; 3775 3776 if (err) 3777 count = exec->errCounts[trans->counter]; 3778 else 3779 count = exec->counts[trans->counter]; 3780 counter = &exec->comp->counters[trans->counter]; 3781 if (count < counter->max) { 3782 if (atom->neg) 3783 values[nb++] = (xmlChar *) atom->valuep2; 3784 else 3785 values[nb++] = (xmlChar *) atom->valuep; 3786 (*nbval)++; 3787 } 3788 } else { 3789 if ((exec->comp->states[trans->to] != NULL) && 3790 (exec->comp->states[trans->to]->type != 3791 XML_REGEXP_SINK_STATE)) { 3792 if (atom->neg) 3793 values[nb++] = (xmlChar *) atom->valuep2; 3794 else 3795 values[nb++] = (xmlChar *) atom->valuep; 3796 (*nbval)++; 3797 } 3798 } 3799 } 3800 for (transno = 0; 3801 (transno < state->nbTrans) && (nb < maxval); 3802 transno++) { 3803 trans = &state->trans[transno]; 3804 if (trans->to < 0) 3805 continue; 3806 atom = trans->atom; 3807 if ((atom == NULL) || (atom->valuep == NULL)) 3808 continue; 3809 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3810 continue; 3811 } else if (trans->count == REGEXP_ALL_COUNTER) { 3812 continue; 3813 } else if (trans->counter >= 0) { 3814 continue; 3815 } else { 3816 if ((exec->comp->states[trans->to] != NULL) && 3817 (exec->comp->states[trans->to]->type == 3818 XML_REGEXP_SINK_STATE)) { 3819 if (atom->neg) 3820 values[nb++] = (xmlChar *) atom->valuep2; 3821 else 3822 values[nb++] = (xmlChar *) atom->valuep; 3823 (*nbneg)++; 3824 } 3825 } 3826 } 3827 } 3828 return(0); 3829} 3830 3831/** 3832 * xmlRegExecNextValues: 3833 * @exec: a regexp execution context 3834 * @nbval: pointer to the number of accepted values IN/OUT 3835 * @nbneg: return number of negative transitions 3836 * @values: pointer to the array of acceptable values 3837 * @terminal: return value if this was a terminal state 3838 * 3839 * Extract informations from the regexp execution, 3840 * the parameter @values must point to an array of @nbval string pointers 3841 * on return nbval will contain the number of possible strings in that 3842 * state and the @values array will be updated with them. The string values 3843 * returned will be freed with the @exec context and don't need to be 3844 * deallocated. 3845 * 3846 * Returns: 0 in case of success or -1 in case of error. 3847 */ 3848int 3849xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 3850 xmlChar **values, int *terminal) { 3851 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 3852} 3853 3854/** 3855 * xmlRegExecErrInfo: 3856 * @exec: a regexp execution context generating an error 3857 * @string: return value for the error string 3858 * @nbval: pointer to the number of accepted values IN/OUT 3859 * @nbneg: return number of negative transitions 3860 * @values: pointer to the array of acceptable values 3861 * @terminal: return value if this was a terminal state 3862 * 3863 * Extract error informations from the regexp execution, the parameter 3864 * @string will be updated with the value pushed and not accepted, 3865 * the parameter @values must point to an array of @nbval string pointers 3866 * on return nbval will contain the number of possible strings in that 3867 * state and the @values array will be updated with them. The string values 3868 * returned will be freed with the @exec context and don't need to be 3869 * deallocated. 3870 * 3871 * Returns: 0 in case of success or -1 in case of error. 3872 */ 3873int 3874xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 3875 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 3876 if (exec == NULL) 3877 return(-1); 3878 if (string != NULL) { 3879 if (exec->status != 0) 3880 *string = exec->errString; 3881 else 3882 *string = NULL; 3883 } 3884 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 3885} 3886 3887#ifdef DEBUG_ERR 3888static void testerr(xmlRegExecCtxtPtr exec) { 3889 const xmlChar *string; 3890 xmlChar *values[5]; 3891 int nb = 5; 3892 int nbneg; 3893 int terminal; 3894 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 3895} 3896#endif 3897 3898#if 0 3899static int 3900xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 3901 xmlRegTransPtr trans; 3902 xmlRegAtomPtr atom; 3903 int ret; 3904 int codepoint, len; 3905 3906 if (exec == NULL) 3907 return(-1); 3908 if (exec->status != 0) 3909 return(exec->status); 3910 3911 while ((exec->status == 0) && 3912 ((exec->inputString[exec->index] != 0) || 3913 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 3914 3915 /* 3916 * End of input on non-terminal state, rollback, however we may 3917 * still have epsilon like transition for counted transitions 3918 * on counters, in that case don't break too early. 3919 */ 3920 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 3921 goto rollback; 3922 3923 exec->transcount = 0; 3924 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3925 trans = &exec->state->trans[exec->transno]; 3926 if (trans->to < 0) 3927 continue; 3928 atom = trans->atom; 3929 ret = 0; 3930 if (trans->count >= 0) { 3931 int count; 3932 xmlRegCounterPtr counter; 3933 3934 /* 3935 * A counted transition. 3936 */ 3937 3938 count = exec->counts[trans->count]; 3939 counter = &exec->comp->counters[trans->count]; 3940#ifdef DEBUG_REGEXP_EXEC 3941 printf("testing count %d: val %d, min %d, max %d\n", 3942 trans->count, count, counter->min, counter->max); 3943#endif 3944 ret = ((count >= counter->min) && (count <= counter->max)); 3945 } else if (atom == NULL) { 3946 fprintf(stderr, "epsilon transition left at runtime\n"); 3947 exec->status = -2; 3948 break; 3949 } else if (exec->inputString[exec->index] != 0) { 3950 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3951 ret = xmlRegCheckCharacter(atom, codepoint); 3952 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3953 xmlRegStatePtr to = exec->comp->states[trans->to]; 3954 3955 /* 3956 * this is a multiple input sequence 3957 */ 3958 if (exec->state->nbTrans > exec->transno + 1) { 3959 xmlFARegExecSave(exec); 3960 } 3961 exec->transcount = 1; 3962 do { 3963 /* 3964 * Try to progress as much as possible on the input 3965 */ 3966 if (exec->transcount == atom->max) { 3967 break; 3968 } 3969 exec->index += len; 3970 /* 3971 * End of input: stop here 3972 */ 3973 if (exec->inputString[exec->index] == 0) { 3974 exec->index -= len; 3975 break; 3976 } 3977 if (exec->transcount >= atom->min) { 3978 int transno = exec->transno; 3979 xmlRegStatePtr state = exec->state; 3980 3981 /* 3982 * The transition is acceptable save it 3983 */ 3984 exec->transno = -1; /* trick */ 3985 exec->state = to; 3986 xmlFARegExecSave(exec); 3987 exec->transno = transno; 3988 exec->state = state; 3989 } 3990 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3991 len); 3992 ret = xmlRegCheckCharacter(atom, codepoint); 3993 exec->transcount++; 3994 } while (ret == 1); 3995 if (exec->transcount < atom->min) 3996 ret = 0; 3997 3998 /* 3999 * If the last check failed but one transition was found 4000 * possible, rollback 4001 */ 4002 if (ret < 0) 4003 ret = 0; 4004 if (ret == 0) { 4005 goto rollback; 4006 } 4007 } 4008 } 4009 if (ret == 1) { 4010 if (exec->state->nbTrans > exec->transno + 1) { 4011 xmlFARegExecSave(exec); 4012 } 4013 if (trans->counter >= 0) { 4014#ifdef DEBUG_REGEXP_EXEC 4015 printf("Increasing count %d\n", trans->counter); 4016#endif 4017 exec->counts[trans->counter]++; 4018 } 4019#ifdef DEBUG_REGEXP_EXEC 4020 printf("entering state %d\n", trans->to); 4021#endif 4022 exec->state = exec->comp->states[trans->to]; 4023 exec->transno = 0; 4024 if (trans->atom != NULL) { 4025 exec->index += len; 4026 } 4027 goto progress; 4028 } else if (ret < 0) { 4029 exec->status = -4; 4030 break; 4031 } 4032 } 4033 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4034rollback: 4035 /* 4036 * Failed to find a way out 4037 */ 4038 exec->determinist = 0; 4039 xmlFARegExecRollBack(exec); 4040 } 4041progress: 4042 continue; 4043 } 4044} 4045#endif 4046/************************************************************************ 4047 * * 4048 * Parser for the Schemas Datatype Regular Expressions * 4049 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4050 * * 4051 ************************************************************************/ 4052 4053/** 4054 * xmlFAIsChar: 4055 * @ctxt: a regexp parser context 4056 * 4057 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4058 */ 4059static int 4060xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4061 int cur; 4062 int len; 4063 4064 cur = CUR_SCHAR(ctxt->cur, len); 4065 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4066 (cur == '*') || (cur == '+') || (cur == '(') || 4067 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4068 (cur == 0x5D) || (cur == 0)) 4069 return(-1); 4070 return(cur); 4071} 4072 4073/** 4074 * xmlFAParseCharProp: 4075 * @ctxt: a regexp parser context 4076 * 4077 * [27] charProp ::= IsCategory | IsBlock 4078 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4079 * Separators | Symbols | Others 4080 * [29] Letters ::= 'L' [ultmo]? 4081 * [30] Marks ::= 'M' [nce]? 4082 * [31] Numbers ::= 'N' [dlo]? 4083 * [32] Punctuation ::= 'P' [cdseifo]? 4084 * [33] Separators ::= 'Z' [slp]? 4085 * [34] Symbols ::= 'S' [mcko]? 4086 * [35] Others ::= 'C' [cfon]? 4087 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4088 */ 4089static void 4090xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4091 int cur; 4092 xmlRegAtomType type = (xmlRegAtomType) 0; 4093 xmlChar *blockName = NULL; 4094 4095 cur = CUR; 4096 if (cur == 'L') { 4097 NEXT; 4098 cur = CUR; 4099 if (cur == 'u') { 4100 NEXT; 4101 type = XML_REGEXP_LETTER_UPPERCASE; 4102 } else if (cur == 'l') { 4103 NEXT; 4104 type = XML_REGEXP_LETTER_LOWERCASE; 4105 } else if (cur == 't') { 4106 NEXT; 4107 type = XML_REGEXP_LETTER_TITLECASE; 4108 } else if (cur == 'm') { 4109 NEXT; 4110 type = XML_REGEXP_LETTER_MODIFIER; 4111 } else if (cur == 'o') { 4112 NEXT; 4113 type = XML_REGEXP_LETTER_OTHERS; 4114 } else { 4115 type = XML_REGEXP_LETTER; 4116 } 4117 } else if (cur == 'M') { 4118 NEXT; 4119 cur = CUR; 4120 if (cur == 'n') { 4121 NEXT; 4122 /* nonspacing */ 4123 type = XML_REGEXP_MARK_NONSPACING; 4124 } else if (cur == 'c') { 4125 NEXT; 4126 /* spacing combining */ 4127 type = XML_REGEXP_MARK_SPACECOMBINING; 4128 } else if (cur == 'e') { 4129 NEXT; 4130 /* enclosing */ 4131 type = XML_REGEXP_MARK_ENCLOSING; 4132 } else { 4133 /* all marks */ 4134 type = XML_REGEXP_MARK; 4135 } 4136 } else if (cur == 'N') { 4137 NEXT; 4138 cur = CUR; 4139 if (cur == 'd') { 4140 NEXT; 4141 /* digital */ 4142 type = XML_REGEXP_NUMBER_DECIMAL; 4143 } else if (cur == 'l') { 4144 NEXT; 4145 /* letter */ 4146 type = XML_REGEXP_NUMBER_LETTER; 4147 } else if (cur == 'o') { 4148 NEXT; 4149 /* other */ 4150 type = XML_REGEXP_NUMBER_OTHERS; 4151 } else { 4152 /* all numbers */ 4153 type = XML_REGEXP_NUMBER; 4154 } 4155 } else if (cur == 'P') { 4156 NEXT; 4157 cur = CUR; 4158 if (cur == 'c') { 4159 NEXT; 4160 /* connector */ 4161 type = XML_REGEXP_PUNCT_CONNECTOR; 4162 } else if (cur == 'd') { 4163 NEXT; 4164 /* dash */ 4165 type = XML_REGEXP_PUNCT_DASH; 4166 } else if (cur == 's') { 4167 NEXT; 4168 /* open */ 4169 type = XML_REGEXP_PUNCT_OPEN; 4170 } else if (cur == 'e') { 4171 NEXT; 4172 /* close */ 4173 type = XML_REGEXP_PUNCT_CLOSE; 4174 } else if (cur == 'i') { 4175 NEXT; 4176 /* initial quote */ 4177 type = XML_REGEXP_PUNCT_INITQUOTE; 4178 } else if (cur == 'f') { 4179 NEXT; 4180 /* final quote */ 4181 type = XML_REGEXP_PUNCT_FINQUOTE; 4182 } else if (cur == 'o') { 4183 NEXT; 4184 /* other */ 4185 type = XML_REGEXP_PUNCT_OTHERS; 4186 } else { 4187 /* all punctuation */ 4188 type = XML_REGEXP_PUNCT; 4189 } 4190 } else if (cur == 'Z') { 4191 NEXT; 4192 cur = CUR; 4193 if (cur == 's') { 4194 NEXT; 4195 /* space */ 4196 type = XML_REGEXP_SEPAR_SPACE; 4197 } else if (cur == 'l') { 4198 NEXT; 4199 /* line */ 4200 type = XML_REGEXP_SEPAR_LINE; 4201 } else if (cur == 'p') { 4202 NEXT; 4203 /* paragraph */ 4204 type = XML_REGEXP_SEPAR_PARA; 4205 } else { 4206 /* all separators */ 4207 type = XML_REGEXP_SEPAR; 4208 } 4209 } else if (cur == 'S') { 4210 NEXT; 4211 cur = CUR; 4212 if (cur == 'm') { 4213 NEXT; 4214 type = XML_REGEXP_SYMBOL_MATH; 4215 /* math */ 4216 } else if (cur == 'c') { 4217 NEXT; 4218 type = XML_REGEXP_SYMBOL_CURRENCY; 4219 /* currency */ 4220 } else if (cur == 'k') { 4221 NEXT; 4222 type = XML_REGEXP_SYMBOL_MODIFIER; 4223 /* modifiers */ 4224 } else if (cur == 'o') { 4225 NEXT; 4226 type = XML_REGEXP_SYMBOL_OTHERS; 4227 /* other */ 4228 } else { 4229 /* all symbols */ 4230 type = XML_REGEXP_SYMBOL; 4231 } 4232 } else if (cur == 'C') { 4233 NEXT; 4234 cur = CUR; 4235 if (cur == 'c') { 4236 NEXT; 4237 /* control */ 4238 type = XML_REGEXP_OTHER_CONTROL; 4239 } else if (cur == 'f') { 4240 NEXT; 4241 /* format */ 4242 type = XML_REGEXP_OTHER_FORMAT; 4243 } else if (cur == 'o') { 4244 NEXT; 4245 /* private use */ 4246 type = XML_REGEXP_OTHER_PRIVATE; 4247 } else if (cur == 'n') { 4248 NEXT; 4249 /* not assigned */ 4250 type = XML_REGEXP_OTHER_NA; 4251 } else { 4252 /* all others */ 4253 type = XML_REGEXP_OTHER; 4254 } 4255 } else if (cur == 'I') { 4256 const xmlChar *start; 4257 NEXT; 4258 cur = CUR; 4259 if (cur != 's') { 4260 ERROR("IsXXXX expected"); 4261 return; 4262 } 4263 NEXT; 4264 start = ctxt->cur; 4265 cur = CUR; 4266 if (((cur >= 'a') && (cur <= 'z')) || 4267 ((cur >= 'A') && (cur <= 'Z')) || 4268 ((cur >= '0') && (cur <= '9')) || 4269 (cur == 0x2D)) { 4270 NEXT; 4271 cur = CUR; 4272 while (((cur >= 'a') && (cur <= 'z')) || 4273 ((cur >= 'A') && (cur <= 'Z')) || 4274 ((cur >= '0') && (cur <= '9')) || 4275 (cur == 0x2D)) { 4276 NEXT; 4277 cur = CUR; 4278 } 4279 } 4280 type = XML_REGEXP_BLOCK_NAME; 4281 blockName = xmlStrndup(start, ctxt->cur - start); 4282 } else { 4283 ERROR("Unknown char property"); 4284 return; 4285 } 4286 if (ctxt->atom == NULL) { 4287 ctxt->atom = xmlRegNewAtom(ctxt, type); 4288 if (ctxt->atom != NULL) 4289 ctxt->atom->valuep = blockName; 4290 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4291 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4292 type, 0, 0, blockName); 4293 } 4294} 4295 4296/** 4297 * xmlFAParseCharClassEsc: 4298 * @ctxt: a regexp parser context 4299 * 4300 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4301 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4302 * [25] catEsc ::= '\p{' charProp '}' 4303 * [26] complEsc ::= '\P{' charProp '}' 4304 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4305 */ 4306static void 4307xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4308 int cur; 4309 4310 if (CUR == '.') { 4311 if (ctxt->atom == NULL) { 4312 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4313 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4314 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4315 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4316 } 4317 NEXT; 4318 return; 4319 } 4320 if (CUR != '\\') { 4321 ERROR("Escaped sequence: expecting \\"); 4322 return; 4323 } 4324 NEXT; 4325 cur = CUR; 4326 if (cur == 'p') { 4327 NEXT; 4328 if (CUR != '{') { 4329 ERROR("Expecting '{'"); 4330 return; 4331 } 4332 NEXT; 4333 xmlFAParseCharProp(ctxt); 4334 if (CUR != '}') { 4335 ERROR("Expecting '}'"); 4336 return; 4337 } 4338 NEXT; 4339 } else if (cur == 'P') { 4340 NEXT; 4341 if (CUR != '{') { 4342 ERROR("Expecting '{'"); 4343 return; 4344 } 4345 NEXT; 4346 xmlFAParseCharProp(ctxt); 4347 ctxt->atom->neg = 1; 4348 if (CUR != '}') { 4349 ERROR("Expecting '}'"); 4350 return; 4351 } 4352 NEXT; 4353 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4354 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4355 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4356 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4357 (cur == 0x5E)) { 4358 if (ctxt->atom == NULL) { 4359 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4360 if (ctxt->atom != NULL) { 4361 switch (cur) { 4362 case 'n': 4363 ctxt->atom->codepoint = '\n'; 4364 break; 4365 case 'r': 4366 ctxt->atom->codepoint = '\r'; 4367 break; 4368 case 't': 4369 ctxt->atom->codepoint = '\t'; 4370 break; 4371 default: 4372 ctxt->atom->codepoint = cur; 4373 } 4374 } 4375 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4376 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4377 XML_REGEXP_CHARVAL, cur, cur, NULL); 4378 } 4379 NEXT; 4380 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4381 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4382 (cur == 'w') || (cur == 'W')) { 4383 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4384 4385 switch (cur) { 4386 case 's': 4387 type = XML_REGEXP_ANYSPACE; 4388 break; 4389 case 'S': 4390 type = XML_REGEXP_NOTSPACE; 4391 break; 4392 case 'i': 4393 type = XML_REGEXP_INITNAME; 4394 break; 4395 case 'I': 4396 type = XML_REGEXP_NOTINITNAME; 4397 break; 4398 case 'c': 4399 type = XML_REGEXP_NAMECHAR; 4400 break; 4401 case 'C': 4402 type = XML_REGEXP_NOTNAMECHAR; 4403 break; 4404 case 'd': 4405 type = XML_REGEXP_DECIMAL; 4406 break; 4407 case 'D': 4408 type = XML_REGEXP_NOTDECIMAL; 4409 break; 4410 case 'w': 4411 type = XML_REGEXP_REALCHAR; 4412 break; 4413 case 'W': 4414 type = XML_REGEXP_NOTREALCHAR; 4415 break; 4416 } 4417 NEXT; 4418 if (ctxt->atom == NULL) { 4419 ctxt->atom = xmlRegNewAtom(ctxt, type); 4420 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4421 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4422 type, 0, 0, NULL); 4423 } 4424 } 4425} 4426 4427/** 4428 * xmlFAParseCharRef: 4429 * @ctxt: a regexp parser context 4430 * 4431 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 4432 */ 4433static int 4434xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 4435 int ret = 0, cur; 4436 4437 if ((CUR != '&') || (NXT(1) != '#')) 4438 return(-1); 4439 NEXT; 4440 NEXT; 4441 cur = CUR; 4442 if (cur == 'x') { 4443 NEXT; 4444 cur = CUR; 4445 if (((cur >= '0') && (cur <= '9')) || 4446 ((cur >= 'a') && (cur <= 'f')) || 4447 ((cur >= 'A') && (cur <= 'F'))) { 4448 while (((cur >= '0') && (cur <= '9')) || 4449 ((cur >= 'A') && (cur <= 'F'))) { 4450 if ((cur >= '0') && (cur <= '9')) 4451 ret = ret * 16 + cur - '0'; 4452 else if ((cur >= 'a') && (cur <= 'f')) 4453 ret = ret * 16 + 10 + (cur - 'a'); 4454 else 4455 ret = ret * 16 + 10 + (cur - 'A'); 4456 NEXT; 4457 cur = CUR; 4458 } 4459 } else { 4460 ERROR("Char ref: expecting [0-9A-F]"); 4461 return(-1); 4462 } 4463 } else { 4464 if ((cur >= '0') && (cur <= '9')) { 4465 while ((cur >= '0') && (cur <= '9')) { 4466 ret = ret * 10 + cur - '0'; 4467 NEXT; 4468 cur = CUR; 4469 } 4470 } else { 4471 ERROR("Char ref: expecting [0-9]"); 4472 return(-1); 4473 } 4474 } 4475 if (cur != ';') { 4476 ERROR("Char ref: expecting ';'"); 4477 return(-1); 4478 } else { 4479 NEXT; 4480 } 4481 return(ret); 4482} 4483 4484/** 4485 * xmlFAParseCharRange: 4486 * @ctxt: a regexp parser context 4487 * 4488 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4489 * [18] seRange ::= charOrEsc '-' charOrEsc 4490 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4491 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4492 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4493 */ 4494static void 4495xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4496 int cur, len; 4497 int start = -1; 4498 int end = -1; 4499 4500 if ((CUR == '&') && (NXT(1) == '#')) { 4501 end = start = xmlFAParseCharRef(ctxt); 4502 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4503 XML_REGEXP_CHARVAL, start, end, NULL); 4504 return; 4505 } 4506 cur = CUR; 4507 if (cur == '\\') { 4508 NEXT; 4509 cur = CUR; 4510 switch (cur) { 4511 case 'n': start = 0xA; break; 4512 case 'r': start = 0xD; break; 4513 case 't': start = 0x9; break; 4514 case '\\': case '|': case '.': case '-': case '^': case '?': 4515 case '*': case '+': case '{': case '}': case '(': case ')': 4516 case '[': case ']': 4517 start = cur; break; 4518 default: 4519 ERROR("Invalid escape value"); 4520 return; 4521 } 4522 end = start; 4523 len = 1; 4524 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4525 end = start = CUR_SCHAR(ctxt->cur, len); 4526 } else { 4527 ERROR("Expecting a char range"); 4528 return; 4529 } 4530 NEXTL(len); 4531 if (start == '-') { 4532 return; 4533 } 4534 cur = CUR; 4535 if ((cur != '-') || (NXT(1) == ']')) { 4536 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4537 XML_REGEXP_CHARVAL, start, end, NULL); 4538 return; 4539 } 4540 NEXT; 4541 cur = CUR; 4542 if (cur == '\\') { 4543 NEXT; 4544 cur = CUR; 4545 switch (cur) { 4546 case 'n': end = 0xA; break; 4547 case 'r': end = 0xD; break; 4548 case 't': end = 0x9; break; 4549 case '\\': case '|': case '.': case '-': case '^': case '?': 4550 case '*': case '+': case '{': case '}': case '(': case ')': 4551 case '[': case ']': 4552 end = cur; break; 4553 default: 4554 ERROR("Invalid escape value"); 4555 return; 4556 } 4557 len = 1; 4558 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4559 end = CUR_SCHAR(ctxt->cur, len); 4560 } else { 4561 ERROR("Expecting the end of a char range"); 4562 return; 4563 } 4564 NEXTL(len); 4565 /* TODO check that the values are acceptable character ranges for XML */ 4566 if (end < start) { 4567 ERROR("End of range is before start of range"); 4568 } else { 4569 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4570 XML_REGEXP_CHARVAL, start, end, NULL); 4571 } 4572 return; 4573} 4574 4575/** 4576 * xmlFAParsePosCharGroup: 4577 * @ctxt: a regexp parser context 4578 * 4579 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 4580 */ 4581static void 4582xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 4583 do { 4584 if ((CUR == '\\') || (CUR == '.')) { 4585 xmlFAParseCharClassEsc(ctxt); 4586 } else { 4587 xmlFAParseCharRange(ctxt); 4588 } 4589 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 4590 (ctxt->error == 0)); 4591} 4592 4593/** 4594 * xmlFAParseCharGroup: 4595 * @ctxt: a regexp parser context 4596 * 4597 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 4598 * [15] negCharGroup ::= '^' posCharGroup 4599 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 4600 * [12] charClassExpr ::= '[' charGroup ']' 4601 */ 4602static void 4603xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 4604 int n = ctxt->neg; 4605 while ((CUR != ']') && (ctxt->error == 0)) { 4606 if (CUR == '^') { 4607 int neg = ctxt->neg; 4608 4609 NEXT; 4610 ctxt->neg = !ctxt->neg; 4611 xmlFAParsePosCharGroup(ctxt); 4612 ctxt->neg = neg; 4613 } else if ((CUR == '-') && (NXT(1) == '[')) { 4614 int neg = ctxt->neg; 4615 ctxt->neg = 2; 4616 NEXT; /* eat the '-' */ 4617 NEXT; /* eat the '[' */ 4618 xmlFAParseCharGroup(ctxt); 4619 if (CUR == ']') { 4620 NEXT; 4621 } else { 4622 ERROR("charClassExpr: ']' expected"); 4623 break; 4624 } 4625 ctxt->neg = neg; 4626 break; 4627 } else if (CUR != ']') { 4628 xmlFAParsePosCharGroup(ctxt); 4629 } 4630 } 4631 ctxt->neg = n; 4632} 4633 4634/** 4635 * xmlFAParseCharClass: 4636 * @ctxt: a regexp parser context 4637 * 4638 * [11] charClass ::= charClassEsc | charClassExpr 4639 * [12] charClassExpr ::= '[' charGroup ']' 4640 */ 4641static void 4642xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 4643 if (CUR == '[') { 4644 NEXT; 4645 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 4646 if (ctxt->atom == NULL) 4647 return; 4648 xmlFAParseCharGroup(ctxt); 4649 if (CUR == ']') { 4650 NEXT; 4651 } else { 4652 ERROR("xmlFAParseCharClass: ']' expected"); 4653 } 4654 } else { 4655 xmlFAParseCharClassEsc(ctxt); 4656 } 4657} 4658 4659/** 4660 * xmlFAParseQuantExact: 4661 * @ctxt: a regexp parser context 4662 * 4663 * [8] QuantExact ::= [0-9]+ 4664 * 4665 * Returns 0 if success or -1 in case of error 4666 */ 4667static int 4668xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 4669 int ret = 0; 4670 int ok = 0; 4671 4672 while ((CUR >= '0') && (CUR <= '9')) { 4673 ret = ret * 10 + (CUR - '0'); 4674 ok = 1; 4675 NEXT; 4676 } 4677 if (ok != 1) { 4678 return(-1); 4679 } 4680 return(ret); 4681} 4682 4683/** 4684 * xmlFAParseQuantifier: 4685 * @ctxt: a regexp parser context 4686 * 4687 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 4688 * [5] quantity ::= quantRange | quantMin | QuantExact 4689 * [6] quantRange ::= QuantExact ',' QuantExact 4690 * [7] quantMin ::= QuantExact ',' 4691 * [8] QuantExact ::= [0-9]+ 4692 */ 4693static int 4694xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 4695 int cur; 4696 4697 cur = CUR; 4698 if ((cur == '?') || (cur == '*') || (cur == '+')) { 4699 if (ctxt->atom != NULL) { 4700 if (cur == '?') 4701 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 4702 else if (cur == '*') 4703 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 4704 else if (cur == '+') 4705 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 4706 } 4707 NEXT; 4708 return(1); 4709 } 4710 if (cur == '{') { 4711 int min = 0, max = 0; 4712 4713 NEXT; 4714 cur = xmlFAParseQuantExact(ctxt); 4715 if (cur >= 0) 4716 min = cur; 4717 if (CUR == ',') { 4718 NEXT; 4719 if (CUR == '}') 4720 max = INT_MAX; 4721 else { 4722 cur = xmlFAParseQuantExact(ctxt); 4723 if (cur >= 0) 4724 max = cur; 4725 else { 4726 ERROR("Improper quantifier"); 4727 } 4728 } 4729 } 4730 if (CUR == '}') { 4731 NEXT; 4732 } else { 4733 ERROR("Unterminated quantifier"); 4734 } 4735 if (max == 0) 4736 max = min; 4737 if (ctxt->atom != NULL) { 4738 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 4739 ctxt->atom->min = min; 4740 ctxt->atom->max = max; 4741 } 4742 return(1); 4743 } 4744 return(0); 4745} 4746 4747/** 4748 * xmlFAParseAtom: 4749 * @ctxt: a regexp parser context 4750 * 4751 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 4752 */ 4753static int 4754xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 4755 int codepoint, len; 4756 4757 codepoint = xmlFAIsChar(ctxt); 4758 if (codepoint > 0) { 4759 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4760 if (ctxt->atom == NULL) 4761 return(-1); 4762 codepoint = CUR_SCHAR(ctxt->cur, len); 4763 ctxt->atom->codepoint = codepoint; 4764 NEXTL(len); 4765 return(1); 4766 } else if (CUR == '|') { 4767 return(0); 4768 } else if (CUR == 0) { 4769 return(0); 4770 } else if (CUR == ')') { 4771 return(0); 4772 } else if (CUR == '(') { 4773 xmlRegStatePtr start, oldend; 4774 4775 NEXT; 4776 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 4777 start = ctxt->state; 4778 oldend = ctxt->end; 4779 ctxt->end = NULL; 4780 ctxt->atom = NULL; 4781 xmlFAParseRegExp(ctxt, 0); 4782 if (CUR == ')') { 4783 NEXT; 4784 } else { 4785 ERROR("xmlFAParseAtom: expecting ')'"); 4786 } 4787 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 4788 if (ctxt->atom == NULL) 4789 return(-1); 4790 ctxt->atom->start = start; 4791 ctxt->atom->stop = ctxt->state; 4792 ctxt->end = oldend; 4793 return(1); 4794 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 4795 xmlFAParseCharClass(ctxt); 4796 return(1); 4797 } 4798 return(0); 4799} 4800 4801/** 4802 * xmlFAParsePiece: 4803 * @ctxt: a regexp parser context 4804 * 4805 * [3] piece ::= atom quantifier? 4806 */ 4807static int 4808xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 4809 int ret; 4810 4811 ctxt->atom = NULL; 4812 ret = xmlFAParseAtom(ctxt); 4813 if (ret == 0) 4814 return(0); 4815 if (ctxt->atom == NULL) { 4816 ERROR("internal: no atom generated"); 4817 } 4818 xmlFAParseQuantifier(ctxt); 4819 return(1); 4820} 4821 4822/** 4823 * xmlFAParseBranch: 4824 * @ctxt: a regexp parser context 4825 * 4826 * [2] branch ::= piece* 4827 8 4828 */ 4829static int 4830xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) { 4831 xmlRegStatePtr previous; 4832 int ret; 4833 4834 previous = ctxt->state; 4835 ret = xmlFAParsePiece(ctxt); 4836 if (ret != 0) { 4837 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0) 4838 return(-1); 4839 previous = ctxt->state; 4840 ctxt->atom = NULL; 4841 } 4842 while ((ret != 0) && (ctxt->error == 0)) { 4843 ret = xmlFAParsePiece(ctxt); 4844 if (ret != 0) { 4845 if (xmlFAGenerateTransitions(ctxt, previous, NULL, 4846 ctxt->atom) < 0) 4847 return(-1); 4848 previous = ctxt->state; 4849 ctxt->atom = NULL; 4850 } 4851 } 4852 return(0); 4853} 4854 4855/** 4856 * xmlFAParseRegExp: 4857 * @ctxt: a regexp parser context 4858 * @top: is this the top-level expression ? 4859 * 4860 * [1] regExp ::= branch ( '|' branch )* 4861 */ 4862static void 4863xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 4864 xmlRegStatePtr start, end; 4865 4866 /* if not top start should have been generated by an epsilon trans */ 4867 start = ctxt->state; 4868 ctxt->end = NULL; 4869 xmlFAParseBranch(ctxt); 4870 if (top) { 4871#ifdef DEBUG_REGEXP_GRAPH 4872 printf("State %d is final\n", ctxt->state->no); 4873#endif 4874 ctxt->state->type = XML_REGEXP_FINAL_STATE; 4875 } 4876 if (CUR != '|') { 4877 ctxt->end = ctxt->state; 4878 return; 4879 } 4880 end = ctxt->state; 4881 while ((CUR == '|') && (ctxt->error == 0)) { 4882 NEXT; 4883 ctxt->state = start; 4884 ctxt->end = NULL; 4885 xmlFAParseBranch(ctxt); 4886 if (top) { 4887 ctxt->state->type = XML_REGEXP_FINAL_STATE; 4888#ifdef DEBUG_REGEXP_GRAPH 4889 printf("State %d is final\n", ctxt->state->no); 4890#endif 4891 } else { 4892 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end); 4893 } 4894 } 4895 if (!top) { 4896 ctxt->state = end; 4897 ctxt->end = end; 4898 } 4899} 4900 4901/************************************************************************ 4902 * * 4903 * The basic API * 4904 * * 4905 ************************************************************************/ 4906 4907/** 4908 * xmlRegexpPrint: 4909 * @output: the file for the output debug 4910 * @regexp: the compiled regexp 4911 * 4912 * Print the content of the compiled regular expression 4913 */ 4914void 4915xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 4916 int i; 4917 4918 if (output == NULL) 4919 return; 4920 fprintf(output, " regexp: "); 4921 if (regexp == NULL) { 4922 fprintf(output, "NULL\n"); 4923 return; 4924 } 4925 fprintf(output, "'%s' ", regexp->string); 4926 fprintf(output, "\n"); 4927 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 4928 for (i = 0;i < regexp->nbAtoms; i++) { 4929 fprintf(output, " %02d ", i); 4930 xmlRegPrintAtom(output, regexp->atoms[i]); 4931 } 4932 fprintf(output, "%d states:", regexp->nbStates); 4933 fprintf(output, "\n"); 4934 for (i = 0;i < regexp->nbStates; i++) { 4935 xmlRegPrintState(output, regexp->states[i]); 4936 } 4937 fprintf(output, "%d counters:\n", regexp->nbCounters); 4938 for (i = 0;i < regexp->nbCounters; i++) { 4939 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 4940 regexp->counters[i].max); 4941 } 4942} 4943 4944/** 4945 * xmlRegexpCompile: 4946 * @regexp: a regular expression string 4947 * 4948 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 4949 * Appendix F and builds an automata suitable for testing strings against 4950 * that regular expression 4951 * 4952 * Returns the compiled expression or NULL in case of error 4953 */ 4954xmlRegexpPtr 4955xmlRegexpCompile(const xmlChar *regexp) { 4956 xmlRegexpPtr ret; 4957 xmlRegParserCtxtPtr ctxt; 4958 4959 ctxt = xmlRegNewParserCtxt(regexp); 4960 if (ctxt == NULL) 4961 return(NULL); 4962 4963 /* initialize the parser */ 4964 ctxt->end = NULL; 4965 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 4966 xmlRegStatePush(ctxt, ctxt->start); 4967 4968 /* parse the expression building an automata */ 4969 xmlFAParseRegExp(ctxt, 1); 4970 if (CUR != 0) { 4971 ERROR("xmlFAParseRegExp: extra characters"); 4972 } 4973 ctxt->end = ctxt->state; 4974 ctxt->start->type = XML_REGEXP_START_STATE; 4975 ctxt->end->type = XML_REGEXP_FINAL_STATE; 4976 4977 /* remove the Epsilon except for counted transitions */ 4978 xmlFAEliminateEpsilonTransitions(ctxt); 4979 4980 4981 if (ctxt->error != 0) { 4982 xmlRegFreeParserCtxt(ctxt); 4983 return(NULL); 4984 } 4985 ret = xmlRegEpxFromParse(ctxt); 4986 xmlRegFreeParserCtxt(ctxt); 4987 return(ret); 4988} 4989 4990/** 4991 * xmlRegexpExec: 4992 * @comp: the compiled regular expression 4993 * @content: the value to check against the regular expression 4994 * 4995 * Check if the regular expression generates the value 4996 * 4997 * Returns 1 if it matches, 0 if not and a negative value in case of error 4998 */ 4999int 5000xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5001 if ((comp == NULL) || (content == NULL)) 5002 return(-1); 5003 return(xmlFARegExec(comp, content)); 5004} 5005 5006/** 5007 * xmlRegexpIsDeterminist: 5008 * @comp: the compiled regular expression 5009 * 5010 * Check if the regular expression is determinist 5011 * 5012 * Returns 1 if it yes, 0 if not and a negative value in case of error 5013 */ 5014int 5015xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5016 xmlAutomataPtr am; 5017 int ret; 5018 5019 if (comp == NULL) 5020 return(-1); 5021 if (comp->determinist != -1) 5022 return(comp->determinist); 5023 5024 am = xmlNewAutomata(); 5025 if (am->states != NULL) { 5026 int i; 5027 5028 for (i = 0;i < am->nbStates;i++) 5029 xmlRegFreeState(am->states[i]); 5030 xmlFree(am->states); 5031 } 5032 am->nbAtoms = comp->nbAtoms; 5033 am->atoms = comp->atoms; 5034 am->nbStates = comp->nbStates; 5035 am->states = comp->states; 5036 am->determinist = -1; 5037 ret = xmlFAComputesDeterminism(am); 5038 am->atoms = NULL; 5039 am->states = NULL; 5040 xmlFreeAutomata(am); 5041 return(ret); 5042} 5043 5044/** 5045 * xmlRegFreeRegexp: 5046 * @regexp: the regexp 5047 * 5048 * Free a regexp 5049 */ 5050void 5051xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5052 int i; 5053 if (regexp == NULL) 5054 return; 5055 5056 if (regexp->string != NULL) 5057 xmlFree(regexp->string); 5058 if (regexp->states != NULL) { 5059 for (i = 0;i < regexp->nbStates;i++) 5060 xmlRegFreeState(regexp->states[i]); 5061 xmlFree(regexp->states); 5062 } 5063 if (regexp->atoms != NULL) { 5064 for (i = 0;i < regexp->nbAtoms;i++) 5065 xmlRegFreeAtom(regexp->atoms[i]); 5066 xmlFree(regexp->atoms); 5067 } 5068 if (regexp->counters != NULL) 5069 xmlFree(regexp->counters); 5070 if (regexp->compact != NULL) 5071 xmlFree(regexp->compact); 5072 if (regexp->transdata != NULL) 5073 xmlFree(regexp->transdata); 5074 if (regexp->stringMap != NULL) { 5075 for (i = 0; i < regexp->nbstrings;i++) 5076 xmlFree(regexp->stringMap[i]); 5077 xmlFree(regexp->stringMap); 5078 } 5079 5080 xmlFree(regexp); 5081} 5082 5083#ifdef LIBXML_AUTOMATA_ENABLED 5084/************************************************************************ 5085 * * 5086 * The Automata interface * 5087 * * 5088 ************************************************************************/ 5089 5090/** 5091 * xmlNewAutomata: 5092 * 5093 * Create a new automata 5094 * 5095 * Returns the new object or NULL in case of failure 5096 */ 5097xmlAutomataPtr 5098xmlNewAutomata(void) { 5099 xmlAutomataPtr ctxt; 5100 5101 ctxt = xmlRegNewParserCtxt(NULL); 5102 if (ctxt == NULL) 5103 return(NULL); 5104 5105 /* initialize the parser */ 5106 ctxt->end = NULL; 5107 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5108 ctxt->start->type = XML_REGEXP_START_STATE; 5109 if (ctxt->start == NULL) { 5110 xmlFreeAutomata(ctxt); 5111 return(NULL); 5112 } 5113 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5114 xmlRegFreeState(ctxt->start); 5115 xmlFreeAutomata(ctxt); 5116 return(NULL); 5117 } 5118 5119 return(ctxt); 5120} 5121 5122/** 5123 * xmlFreeAutomata: 5124 * @am: an automata 5125 * 5126 * Free an automata 5127 */ 5128void 5129xmlFreeAutomata(xmlAutomataPtr am) { 5130 if (am == NULL) 5131 return; 5132 xmlRegFreeParserCtxt(am); 5133} 5134 5135/** 5136 * xmlAutomataGetInitState: 5137 * @am: an automata 5138 * 5139 * Initial state lookup 5140 * 5141 * Returns the initial state of the automata 5142 */ 5143xmlAutomataStatePtr 5144xmlAutomataGetInitState(xmlAutomataPtr am) { 5145 if (am == NULL) 5146 return(NULL); 5147 return(am->start); 5148} 5149 5150/** 5151 * xmlAutomataSetFinalState: 5152 * @am: an automata 5153 * @state: a state in this automata 5154 * 5155 * Makes that state a final state 5156 * 5157 * Returns 0 or -1 in case of error 5158 */ 5159int 5160xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5161 if ((am == NULL) || (state == NULL)) 5162 return(-1); 5163 state->type = XML_REGEXP_FINAL_STATE; 5164 return(0); 5165} 5166 5167/** 5168 * xmlAutomataNewTransition: 5169 * @am: an automata 5170 * @from: the starting point of the transition 5171 * @to: the target point of the transition or NULL 5172 * @token: the input string associated to that transition 5173 * @data: data passed to the callback function if the transition is activated 5174 * 5175 * If @to is NULL, this creates first a new target state in the automata 5176 * and then adds a transition from the @from state to the target state 5177 * activated by the value of @token 5178 * 5179 * Returns the target state or NULL in case of error 5180 */ 5181xmlAutomataStatePtr 5182xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5183 xmlAutomataStatePtr to, const xmlChar *token, 5184 void *data) { 5185 xmlRegAtomPtr atom; 5186 5187 if ((am == NULL) || (from == NULL) || (token == NULL)) 5188 return(NULL); 5189 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5190 if (atom == NULL) 5191 return(NULL); 5192 atom->data = data; 5193 if (atom == NULL) 5194 return(NULL); 5195 atom->valuep = xmlStrdup(token); 5196 5197 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5198 xmlRegFreeAtom(atom); 5199 return(NULL); 5200 } 5201 if (to == NULL) 5202 return(am->state); 5203 return(to); 5204} 5205 5206/** 5207 * xmlAutomataNewTransition2: 5208 * @am: an automata 5209 * @from: the starting point of the transition 5210 * @to: the target point of the transition or NULL 5211 * @token: the first input string associated to that transition 5212 * @token2: the second input string associated to that transition 5213 * @data: data passed to the callback function if the transition is activated 5214 * 5215 * If @to is NULL, this creates first a new target state in the automata 5216 * and then adds a transition from the @from state to the target state 5217 * activated by the value of @token 5218 * 5219 * Returns the target state or NULL in case of error 5220 */ 5221xmlAutomataStatePtr 5222xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5223 xmlAutomataStatePtr to, const xmlChar *token, 5224 const xmlChar *token2, void *data) { 5225 xmlRegAtomPtr atom; 5226 5227 if ((am == NULL) || (from == NULL) || (token == NULL)) 5228 return(NULL); 5229 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5230 atom->data = data; 5231 if (atom == NULL) 5232 return(NULL); 5233 if ((token2 == NULL) || (*token2 == 0)) { 5234 atom->valuep = xmlStrdup(token); 5235 } else { 5236 int lenn, lenp; 5237 xmlChar *str; 5238 5239 lenn = strlen((char *) token2); 5240 lenp = strlen((char *) token); 5241 5242 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5243 if (str == NULL) { 5244 xmlRegFreeAtom(atom); 5245 return(NULL); 5246 } 5247 memcpy(&str[0], token, lenp); 5248 str[lenp] = '|'; 5249 memcpy(&str[lenp + 1], token2, lenn); 5250 str[lenn + lenp + 1] = 0; 5251 5252 atom->valuep = str; 5253 } 5254 5255 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5256 xmlRegFreeAtom(atom); 5257 return(NULL); 5258 } 5259 if (to == NULL) 5260 return(am->state); 5261 return(to); 5262} 5263 5264/** 5265 * xmlAutomataNewNegTrans: 5266 * @am: an automata 5267 * @from: the starting point of the transition 5268 * @to: the target point of the transition or NULL 5269 * @token: the first input string associated to that transition 5270 * @token2: the second input string associated to that transition 5271 * @data: data passed to the callback function if the transition is activated 5272 * 5273 * If @to is NULL, this creates first a new target state in the automata 5274 * and then adds a transition from the @from state to the target state 5275 * activated by any value except (@token,@token2) 5276 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5277 # the semantic of XSD ##other 5278 * 5279 * Returns the target state or NULL in case of error 5280 */ 5281xmlAutomataStatePtr 5282xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5283 xmlAutomataStatePtr to, const xmlChar *token, 5284 const xmlChar *token2, void *data) { 5285 xmlRegAtomPtr atom; 5286 xmlChar err_msg[200]; 5287 5288 if ((am == NULL) || (from == NULL) || (token == NULL)) 5289 return(NULL); 5290 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5291 if (atom == NULL) 5292 return(NULL); 5293 atom->data = data; 5294 atom->neg = 1; 5295 if ((token2 == NULL) || (*token2 == 0)) { 5296 atom->valuep = xmlStrdup(token); 5297 } else { 5298 int lenn, lenp; 5299 xmlChar *str; 5300 5301 lenn = strlen((char *) token2); 5302 lenp = strlen((char *) token); 5303 5304 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5305 if (str == NULL) { 5306 xmlRegFreeAtom(atom); 5307 return(NULL); 5308 } 5309 memcpy(&str[0], token, lenp); 5310 str[lenp] = '|'; 5311 memcpy(&str[lenp + 1], token2, lenn); 5312 str[lenn + lenp + 1] = 0; 5313 5314 atom->valuep = str; 5315 } 5316 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5317 err_msg[199] = 0; 5318 atom->valuep2 = xmlStrdup(err_msg); 5319 5320 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5321 xmlRegFreeAtom(atom); 5322 return(NULL); 5323 } 5324 am->negs++; 5325 if (to == NULL) 5326 return(am->state); 5327 return(to); 5328} 5329 5330/** 5331 * xmlAutomataNewCountTrans2: 5332 * @am: an automata 5333 * @from: the starting point of the transition 5334 * @to: the target point of the transition or NULL 5335 * @token: the input string associated to that transition 5336 * @token2: the second input string associated to that transition 5337 * @min: the minimum successive occurences of token 5338 * @max: the maximum successive occurences of token 5339 * @data: data associated to the transition 5340 * 5341 * If @to is NULL, this creates first a new target state in the automata 5342 * and then adds a transition from the @from state to the target state 5343 * activated by a succession of input of value @token and @token2 and 5344 * whose number is between @min and @max 5345 * 5346 * Returns the target state or NULL in case of error 5347 */ 5348xmlAutomataStatePtr 5349xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5350 xmlAutomataStatePtr to, const xmlChar *token, 5351 const xmlChar *token2, 5352 int min, int max, void *data) { 5353 xmlRegAtomPtr atom; 5354 int counter; 5355 5356 if ((am == NULL) || (from == NULL) || (token == NULL)) 5357 return(NULL); 5358 if (min < 0) 5359 return(NULL); 5360 if ((max < min) || (max < 1)) 5361 return(NULL); 5362 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5363 if (atom == NULL) 5364 return(NULL); 5365 if ((token2 == NULL) || (*token2 == 0)) { 5366 atom->valuep = xmlStrdup(token); 5367 } else { 5368 int lenn, lenp; 5369 xmlChar *str; 5370 5371 lenn = strlen((char *) token2); 5372 lenp = strlen((char *) token); 5373 5374 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5375 if (str == NULL) { 5376 xmlRegFreeAtom(atom); 5377 return(NULL); 5378 } 5379 memcpy(&str[0], token, lenp); 5380 str[lenp] = '|'; 5381 memcpy(&str[lenp + 1], token2, lenn); 5382 str[lenn + lenp + 1] = 0; 5383 5384 atom->valuep = str; 5385 } 5386 atom->data = data; 5387 if (min == 0) 5388 atom->min = 1; 5389 else 5390 atom->min = min; 5391 atom->max = max; 5392 5393 /* 5394 * associate a counter to the transition. 5395 */ 5396 counter = xmlRegGetCounter(am); 5397 am->counters[counter].min = min; 5398 am->counters[counter].max = max; 5399 5400 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5401 if (to == NULL) { 5402 to = xmlRegNewState(am); 5403 xmlRegStatePush(am, to); 5404 } 5405 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5406 xmlRegAtomPush(am, atom); 5407 am->state = to; 5408 5409 if (to == NULL) 5410 to = am->state; 5411 if (to == NULL) 5412 return(NULL); 5413 if (min == 0) 5414 xmlFAGenerateEpsilonTransition(am, from, to); 5415 return(to); 5416} 5417 5418/** 5419 * xmlAutomataNewCountTrans: 5420 * @am: an automata 5421 * @from: the starting point of the transition 5422 * @to: the target point of the transition or NULL 5423 * @token: the input string associated to that transition 5424 * @min: the minimum successive occurences of token 5425 * @max: the maximum successive occurences of token 5426 * @data: data associated to the transition 5427 * 5428 * If @to is NULL, this creates first a new target state in the automata 5429 * and then adds a transition from the @from state to the target state 5430 * activated by a succession of input of value @token and whose number 5431 * is between @min and @max 5432 * 5433 * Returns the target state or NULL in case of error 5434 */ 5435xmlAutomataStatePtr 5436xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5437 xmlAutomataStatePtr to, const xmlChar *token, 5438 int min, int max, void *data) { 5439 xmlRegAtomPtr atom; 5440 int counter; 5441 5442 if ((am == NULL) || (from == NULL) || (token == NULL)) 5443 return(NULL); 5444 if (min < 0) 5445 return(NULL); 5446 if ((max < min) || (max < 1)) 5447 return(NULL); 5448 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5449 if (atom == NULL) 5450 return(NULL); 5451 atom->valuep = xmlStrdup(token); 5452 atom->data = data; 5453 if (min == 0) 5454 atom->min = 1; 5455 else 5456 atom->min = min; 5457 atom->max = max; 5458 5459 /* 5460 * associate a counter to the transition. 5461 */ 5462 counter = xmlRegGetCounter(am); 5463 am->counters[counter].min = min; 5464 am->counters[counter].max = max; 5465 5466 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5467 if (to == NULL) { 5468 to = xmlRegNewState(am); 5469 xmlRegStatePush(am, to); 5470 } 5471 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5472 xmlRegAtomPush(am, atom); 5473 am->state = to; 5474 5475 if (to == NULL) 5476 to = am->state; 5477 if (to == NULL) 5478 return(NULL); 5479 if (min == 0) 5480 xmlFAGenerateEpsilonTransition(am, from, to); 5481 return(to); 5482} 5483 5484/** 5485 * xmlAutomataNewOnceTrans2: 5486 * @am: an automata 5487 * @from: the starting point of the transition 5488 * @to: the target point of the transition or NULL 5489 * @token: the input string associated to that transition 5490 * @token2: the second input string associated to that transition 5491 * @min: the minimum successive occurences of token 5492 * @max: the maximum successive occurences of token 5493 * @data: data associated to the transition 5494 * 5495 * If @to is NULL, this creates first a new target state in the automata 5496 * and then adds a transition from the @from state to the target state 5497 * activated by a succession of input of value @token and @token2 and whose 5498 * number is between @min and @max, moreover that transition can only be 5499 * crossed once. 5500 * 5501 * Returns the target state or NULL in case of error 5502 */ 5503xmlAutomataStatePtr 5504xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5505 xmlAutomataStatePtr to, const xmlChar *token, 5506 const xmlChar *token2, 5507 int min, int max, void *data) { 5508 xmlRegAtomPtr atom; 5509 int counter; 5510 5511 if ((am == NULL) || (from == NULL) || (token == NULL)) 5512 return(NULL); 5513 if (min < 1) 5514 return(NULL); 5515 if ((max < min) || (max < 1)) 5516 return(NULL); 5517 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5518 if (atom == NULL) 5519 return(NULL); 5520 if ((token2 == NULL) || (*token2 == 0)) { 5521 atom->valuep = xmlStrdup(token); 5522 } else { 5523 int lenn, lenp; 5524 xmlChar *str; 5525 5526 lenn = strlen((char *) token2); 5527 lenp = strlen((char *) token); 5528 5529 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5530 if (str == NULL) { 5531 xmlRegFreeAtom(atom); 5532 return(NULL); 5533 } 5534 memcpy(&str[0], token, lenp); 5535 str[lenp] = '|'; 5536 memcpy(&str[lenp + 1], token2, lenn); 5537 str[lenn + lenp + 1] = 0; 5538 5539 atom->valuep = str; 5540 } 5541 atom->data = data; 5542 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5543 if (min == 0) 5544 atom->min = 1; 5545 else 5546 atom->min = min; 5547 atom->max = max; 5548 /* 5549 * associate a counter to the transition. 5550 */ 5551 counter = xmlRegGetCounter(am); 5552 am->counters[counter].min = 1; 5553 am->counters[counter].max = 1; 5554 5555 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5556 if (to == NULL) { 5557 to = xmlRegNewState(am); 5558 xmlRegStatePush(am, to); 5559 } 5560 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5561 xmlRegAtomPush(am, atom); 5562 am->state = to; 5563 return(to); 5564} 5565 5566 5567 5568/** 5569 * xmlAutomataNewOnceTrans: 5570 * @am: an automata 5571 * @from: the starting point of the transition 5572 * @to: the target point of the transition or NULL 5573 * @token: the input string associated to that transition 5574 * @min: the minimum successive occurences of token 5575 * @max: the maximum successive occurences of token 5576 * @data: data associated to the transition 5577 * 5578 * If @to is NULL, this creates first a new target state in the automata 5579 * and then adds a transition from the @from state to the target state 5580 * activated by a succession of input of value @token and whose number 5581 * is between @min and @max, moreover that transition can only be crossed 5582 * once. 5583 * 5584 * Returns the target state or NULL in case of error 5585 */ 5586xmlAutomataStatePtr 5587xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5588 xmlAutomataStatePtr to, const xmlChar *token, 5589 int min, int max, void *data) { 5590 xmlRegAtomPtr atom; 5591 int counter; 5592 5593 if ((am == NULL) || (from == NULL) || (token == NULL)) 5594 return(NULL); 5595 if (min < 1) 5596 return(NULL); 5597 if ((max < min) || (max < 1)) 5598 return(NULL); 5599 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5600 if (atom == NULL) 5601 return(NULL); 5602 atom->valuep = xmlStrdup(token); 5603 atom->data = data; 5604 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5605 if (min == 0) 5606 atom->min = 1; 5607 else 5608 atom->min = min; 5609 atom->max = max; 5610 /* 5611 * associate a counter to the transition. 5612 */ 5613 counter = xmlRegGetCounter(am); 5614 am->counters[counter].min = 1; 5615 am->counters[counter].max = 1; 5616 5617 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5618 if (to == NULL) { 5619 to = xmlRegNewState(am); 5620 xmlRegStatePush(am, to); 5621 } 5622 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5623 xmlRegAtomPush(am, atom); 5624 am->state = to; 5625 return(to); 5626} 5627 5628/** 5629 * xmlAutomataNewState: 5630 * @am: an automata 5631 * 5632 * Create a new disconnected state in the automata 5633 * 5634 * Returns the new state or NULL in case of error 5635 */ 5636xmlAutomataStatePtr 5637xmlAutomataNewState(xmlAutomataPtr am) { 5638 xmlAutomataStatePtr to; 5639 5640 if (am == NULL) 5641 return(NULL); 5642 to = xmlRegNewState(am); 5643 xmlRegStatePush(am, to); 5644 return(to); 5645} 5646 5647/** 5648 * xmlAutomataNewEpsilon: 5649 * @am: an automata 5650 * @from: the starting point of the transition 5651 * @to: the target point of the transition or NULL 5652 * 5653 * If @to is NULL, this creates first a new target state in the automata 5654 * and then adds an epsilon transition from the @from state to the 5655 * target state 5656 * 5657 * Returns the target state or NULL in case of error 5658 */ 5659xmlAutomataStatePtr 5660xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 5661 xmlAutomataStatePtr to) { 5662 if ((am == NULL) || (from == NULL)) 5663 return(NULL); 5664 xmlFAGenerateEpsilonTransition(am, from, to); 5665 if (to == NULL) 5666 return(am->state); 5667 return(to); 5668} 5669 5670/** 5671 * xmlAutomataNewAllTrans: 5672 * @am: an automata 5673 * @from: the starting point of the transition 5674 * @to: the target point of the transition or NULL 5675 * @lax: allow to transition if not all all transitions have been activated 5676 * 5677 * If @to is NULL, this creates first a new target state in the automata 5678 * and then adds a an ALL transition from the @from state to the 5679 * target state. That transition is an epsilon transition allowed only when 5680 * all transitions from the @from node have been activated. 5681 * 5682 * Returns the target state or NULL in case of error 5683 */ 5684xmlAutomataStatePtr 5685xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5686 xmlAutomataStatePtr to, int lax) { 5687 if ((am == NULL) || (from == NULL)) 5688 return(NULL); 5689 xmlFAGenerateAllTransition(am, from, to, lax); 5690 if (to == NULL) 5691 return(am->state); 5692 return(to); 5693} 5694 5695/** 5696 * xmlAutomataNewCounter: 5697 * @am: an automata 5698 * @min: the minimal value on the counter 5699 * @max: the maximal value on the counter 5700 * 5701 * Create a new counter 5702 * 5703 * Returns the counter number or -1 in case of error 5704 */ 5705int 5706xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 5707 int ret; 5708 5709 if (am == NULL) 5710 return(-1); 5711 5712 ret = xmlRegGetCounter(am); 5713 if (ret < 0) 5714 return(-1); 5715 am->counters[ret].min = min; 5716 am->counters[ret].max = max; 5717 return(ret); 5718} 5719 5720/** 5721 * xmlAutomataNewCountedTrans: 5722 * @am: an automata 5723 * @from: the starting point of the transition 5724 * @to: the target point of the transition or NULL 5725 * @counter: the counter associated to that transition 5726 * 5727 * If @to is NULL, this creates first a new target state in the automata 5728 * and then adds an epsilon transition from the @from state to the target state 5729 * which will increment the counter provided 5730 * 5731 * Returns the target state or NULL in case of error 5732 */ 5733xmlAutomataStatePtr 5734xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5735 xmlAutomataStatePtr to, int counter) { 5736 if ((am == NULL) || (from == NULL) || (counter < 0)) 5737 return(NULL); 5738 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 5739 if (to == NULL) 5740 return(am->state); 5741 return(to); 5742} 5743 5744/** 5745 * xmlAutomataNewCounterTrans: 5746 * @am: an automata 5747 * @from: the starting point of the transition 5748 * @to: the target point of the transition or NULL 5749 * @counter: the counter associated to that transition 5750 * 5751 * If @to is NULL, this creates first a new target state in the automata 5752 * and then adds an epsilon transition from the @from state to the target state 5753 * which will be allowed only if the counter is within the right range. 5754 * 5755 * Returns the target state or NULL in case of error 5756 */ 5757xmlAutomataStatePtr 5758xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5759 xmlAutomataStatePtr to, int counter) { 5760 if ((am == NULL) || (from == NULL) || (counter < 0)) 5761 return(NULL); 5762 xmlFAGenerateCountedTransition(am, from, to, counter); 5763 if (to == NULL) 5764 return(am->state); 5765 return(to); 5766} 5767 5768/** 5769 * xmlAutomataCompile: 5770 * @am: an automata 5771 * 5772 * Compile the automata into a Reg Exp ready for being executed. 5773 * The automata should be free after this point. 5774 * 5775 * Returns the compiled regexp or NULL in case of error 5776 */ 5777xmlRegexpPtr 5778xmlAutomataCompile(xmlAutomataPtr am) { 5779 xmlRegexpPtr ret; 5780 5781 if ((am == NULL) || (am->error != 0)) return(NULL); 5782 xmlFAEliminateEpsilonTransitions(am); 5783 /* xmlFAComputesDeterminism(am); */ 5784 ret = xmlRegEpxFromParse(am); 5785 5786 return(ret); 5787} 5788 5789/** 5790 * xmlAutomataIsDeterminist: 5791 * @am: an automata 5792 * 5793 * Checks if an automata is determinist. 5794 * 5795 * Returns 1 if true, 0 if not, and -1 in case of error 5796 */ 5797int 5798xmlAutomataIsDeterminist(xmlAutomataPtr am) { 5799 int ret; 5800 5801 if (am == NULL) 5802 return(-1); 5803 5804 ret = xmlFAComputesDeterminism(am); 5805 return(ret); 5806} 5807#endif /* LIBXML_AUTOMATA_ENABLED */ 5808 5809#ifdef LIBXML_EXPR_ENABLED 5810/************************************************************************ 5811 * * 5812 * Formal Expression handling code * 5813 * * 5814 ************************************************************************/ 5815/************************************************************************ 5816 * * 5817 * Expression handling context * 5818 * * 5819 ************************************************************************/ 5820 5821struct _xmlExpCtxt { 5822 xmlDictPtr dict; 5823 xmlExpNodePtr *table; 5824 int size; 5825 int nbElems; 5826 int nb_nodes; 5827 const char *expr; 5828 const char *cur; 5829 int nb_cons; 5830 int tabSize; 5831}; 5832 5833/** 5834 * xmlExpNewCtxt: 5835 * @maxNodes: the maximum number of nodes 5836 * @dict: optional dictionnary to use internally 5837 * 5838 * Creates a new context for manipulating expressions 5839 * 5840 * Returns the context or NULL in case of error 5841 */ 5842xmlExpCtxtPtr 5843xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 5844 xmlExpCtxtPtr ret; 5845 int size = 256; 5846 5847 if (maxNodes <= 4096) 5848 maxNodes = 4096; 5849 5850 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 5851 if (ret == NULL) 5852 return(NULL); 5853 memset(ret, 0, sizeof(xmlExpCtxt)); 5854 ret->size = size; 5855 ret->nbElems = 0; 5856 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 5857 if (ret->table == NULL) { 5858 xmlFree(ret); 5859 return(NULL); 5860 } 5861 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 5862 if (dict == NULL) { 5863 ret->dict = xmlDictCreate(); 5864 if (ret->dict == NULL) { 5865 xmlFree(ret->table); 5866 xmlFree(ret); 5867 return(NULL); 5868 } 5869 } else { 5870 ret->dict = dict; 5871 xmlDictReference(ret->dict); 5872 } 5873 return(ret); 5874} 5875 5876/** 5877 * xmlExpFreeCtxt: 5878 * @ctxt: an expression context 5879 * 5880 * Free an expression context 5881 */ 5882void 5883xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 5884 if (ctxt == NULL) 5885 return; 5886 xmlDictFree(ctxt->dict); 5887 if (ctxt->table != NULL) 5888 xmlFree(ctxt->table); 5889 xmlFree(ctxt); 5890} 5891 5892/************************************************************************ 5893 * * 5894 * Structure associated to an expression node * 5895 * * 5896 ************************************************************************/ 5897#define MAX_NODES 10000 5898 5899/* #define DEBUG_DERIV */ 5900 5901/* 5902 * TODO: 5903 * - Wildcards 5904 * - public API for creation 5905 * 5906 * Started 5907 * - regression testing 5908 * 5909 * Done 5910 * - split into module and test tool 5911 * - memleaks 5912 */ 5913 5914typedef enum { 5915 XML_EXP_NILABLE = (1 << 0) 5916} xmlExpNodeInfo; 5917 5918#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 5919 5920struct _xmlExpNode { 5921 unsigned char type;/* xmlExpNodeType */ 5922 unsigned char info;/* OR of xmlExpNodeInfo */ 5923 unsigned short key; /* the hash key */ 5924 unsigned int ref; /* The number of references */ 5925 int c_max; /* the maximum length it can consume */ 5926 xmlExpNodePtr exp_left; 5927 xmlExpNodePtr next;/* the next node in the hash table or free list */ 5928 union { 5929 struct { 5930 int f_min; 5931 int f_max; 5932 } count; 5933 struct { 5934 xmlExpNodePtr f_right; 5935 } children; 5936 const xmlChar *f_str; 5937 } field; 5938}; 5939 5940#define exp_min field.count.f_min 5941#define exp_max field.count.f_max 5942/* #define exp_left field.children.f_left */ 5943#define exp_right field.children.f_right 5944#define exp_str field.f_str 5945 5946static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 5947static xmlExpNode forbiddenExpNode = { 5948 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 5949}; 5950xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 5951static xmlExpNode emptyExpNode = { 5952 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 5953}; 5954xmlExpNodePtr emptyExp = &emptyExpNode; 5955 5956/************************************************************************ 5957 * * 5958 * The custom hash table for unicity and canonicalization * 5959 * of sub-expressions pointers * 5960 * * 5961 ************************************************************************/ 5962/* 5963 * xmlExpHashNameComputeKey: 5964 * Calculate the hash key for a token 5965 */ 5966static unsigned short 5967xmlExpHashNameComputeKey(const xmlChar *name) { 5968 unsigned short value = 0L; 5969 char ch; 5970 5971 if (name != NULL) { 5972 value += 30 * (*name); 5973 while ((ch = *name++) != 0) { 5974 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 5975 } 5976 } 5977 return (value); 5978} 5979 5980/* 5981 * xmlExpHashComputeKey: 5982 * Calculate the hash key for a compound expression 5983 */ 5984static unsigned short 5985xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 5986 xmlExpNodePtr right) { 5987 unsigned long value; 5988 unsigned short ret; 5989 5990 switch (type) { 5991 case XML_EXP_SEQ: 5992 value = left->key; 5993 value += right->key; 5994 value *= 3; 5995 ret = (unsigned short) value; 5996 break; 5997 case XML_EXP_OR: 5998 value = left->key; 5999 value += right->key; 6000 value *= 7; 6001 ret = (unsigned short) value; 6002 break; 6003 case XML_EXP_COUNT: 6004 value = left->key; 6005 value += right->key; 6006 ret = (unsigned short) value; 6007 break; 6008 default: 6009 ret = 0; 6010 } 6011 return(ret); 6012} 6013 6014 6015static xmlExpNodePtr 6016xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6017 xmlExpNodePtr ret; 6018 6019 if (ctxt->nb_nodes >= MAX_NODES) 6020 return(NULL); 6021 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6022 if (ret == NULL) 6023 return(NULL); 6024 memset(ret, 0, sizeof(xmlExpNode)); 6025 ret->type = type; 6026 ret->next = NULL; 6027 ctxt->nb_nodes++; 6028 ctxt->nb_cons++; 6029 return(ret); 6030} 6031 6032/** 6033 * xmlExpHashGetEntry: 6034 * @table: the hash table 6035 * 6036 * Get the unique entry from the hash table. The entry is created if 6037 * needed. @left and @right are consumed, i.e. their ref count will 6038 * be decremented by the operation. 6039 * 6040 * Returns the pointer or NULL in case of error 6041 */ 6042static xmlExpNodePtr 6043xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6044 xmlExpNodePtr left, xmlExpNodePtr right, 6045 const xmlChar *name, int min, int max) { 6046 unsigned short kbase, key; 6047 xmlExpNodePtr entry; 6048 xmlExpNodePtr insert; 6049 6050 if (ctxt == NULL) 6051 return(NULL); 6052 6053 /* 6054 * Check for duplicate and insertion location. 6055 */ 6056 if (type == XML_EXP_ATOM) { 6057 kbase = xmlExpHashNameComputeKey(name); 6058 } else if (type == XML_EXP_COUNT) { 6059 /* COUNT reduction rule 1 */ 6060 /* a{1} -> a */ 6061 if (min == max) { 6062 if (min == 1) { 6063 return(left); 6064 } 6065 if (min == 0) { 6066 xmlExpFree(ctxt, left); 6067 return(emptyExp); 6068 } 6069 } 6070 if (min < 0) { 6071 xmlExpFree(ctxt, left); 6072 return(forbiddenExp); 6073 } 6074 if (max == -1) 6075 kbase = min + 79; 6076 else 6077 kbase = max - min; 6078 kbase += left->key; 6079 } else if (type == XML_EXP_OR) { 6080 /* Forbid reduction rules */ 6081 if (left->type == XML_EXP_FORBID) { 6082 xmlExpFree(ctxt, left); 6083 return(right); 6084 } 6085 if (right->type == XML_EXP_FORBID) { 6086 xmlExpFree(ctxt, right); 6087 return(left); 6088 } 6089 6090 /* OR reduction rule 1 */ 6091 /* a | a reduced to a */ 6092 if (left == right) { 6093 left->ref--; 6094 return(left); 6095 } 6096 /* OR canonicalization rule 1 */ 6097 /* linearize (a | b) | c into a | (b | c) */ 6098 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6099 xmlExpNodePtr tmp = left; 6100 left = right; 6101 right = tmp; 6102 } 6103 /* OR reduction rule 2 */ 6104 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6105 if (right->type == XML_EXP_OR) { 6106 if ((left == right->exp_left) || 6107 (left == right->exp_right)) { 6108 xmlExpFree(ctxt, left); 6109 return(right); 6110 } 6111 } 6112 /* OR canonicalization rule 2 */ 6113 /* linearize (a | b) | c into a | (b | c) */ 6114 if (left->type == XML_EXP_OR) { 6115 xmlExpNodePtr tmp; 6116 6117 /* OR canonicalization rule 2 */ 6118 if ((left->exp_right->type != XML_EXP_OR) && 6119 (left->exp_right->key < left->exp_left->key)) { 6120 tmp = left->exp_right; 6121 left->exp_right = left->exp_left; 6122 left->exp_left = tmp; 6123 } 6124 left->exp_right->ref++; 6125 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6126 NULL, 0, 0); 6127 left->exp_left->ref++; 6128 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6129 NULL, 0, 0); 6130 6131 xmlExpFree(ctxt, left); 6132 return(tmp); 6133 } 6134 if (right->type == XML_EXP_OR) { 6135 /* Ordering in the tree */ 6136 /* C | (A | B) -> A | (B | C) */ 6137 if (left->key > right->exp_right->key) { 6138 xmlExpNodePtr tmp; 6139 right->exp_right->ref++; 6140 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6141 left, NULL, 0, 0); 6142 right->exp_left->ref++; 6143 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6144 tmp, NULL, 0, 0); 6145 xmlExpFree(ctxt, right); 6146 return(tmp); 6147 } 6148 /* Ordering in the tree */ 6149 /* B | (A | C) -> A | (B | C) */ 6150 if (left->key > right->exp_left->key) { 6151 xmlExpNodePtr tmp; 6152 right->exp_right->ref++; 6153 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6154 right->exp_right, NULL, 0, 0); 6155 right->exp_left->ref++; 6156 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6157 tmp, NULL, 0, 0); 6158 xmlExpFree(ctxt, right); 6159 return(tmp); 6160 } 6161 } 6162 /* we know both types are != XML_EXP_OR here */ 6163 else if (left->key > right->key) { 6164 xmlExpNodePtr tmp = left; 6165 left = right; 6166 right = tmp; 6167 } 6168 kbase = xmlExpHashComputeKey(type, left, right); 6169 } else if (type == XML_EXP_SEQ) { 6170 /* Forbid reduction rules */ 6171 if (left->type == XML_EXP_FORBID) { 6172 xmlExpFree(ctxt, right); 6173 return(left); 6174 } 6175 if (right->type == XML_EXP_FORBID) { 6176 xmlExpFree(ctxt, left); 6177 return(right); 6178 } 6179 /* Empty reduction rules */ 6180 if (right->type == XML_EXP_EMPTY) { 6181 return(left); 6182 } 6183 if (left->type == XML_EXP_EMPTY) { 6184 return(right); 6185 } 6186 kbase = xmlExpHashComputeKey(type, left, right); 6187 } else 6188 return(NULL); 6189 6190 key = kbase % ctxt->size; 6191 if (ctxt->table[key] != NULL) { 6192 for (insert = ctxt->table[key]; insert != NULL; 6193 insert = insert->next) { 6194 if ((insert->key == kbase) && 6195 (insert->type == type)) { 6196 if (type == XML_EXP_ATOM) { 6197 if (name == insert->exp_str) { 6198 insert->ref++; 6199 return(insert); 6200 } 6201 } else if (type == XML_EXP_COUNT) { 6202 if ((insert->exp_min == min) && (insert->exp_max == max) && 6203 (insert->exp_left == left)) { 6204 insert->ref++; 6205 left->ref--; 6206 return(insert); 6207 } 6208 } else if ((insert->exp_left == left) && 6209 (insert->exp_right == right)) { 6210 insert->ref++; 6211 left->ref--; 6212 right->ref--; 6213 return(insert); 6214 } 6215 } 6216 } 6217 } 6218 6219 entry = xmlExpNewNode(ctxt, type); 6220 if (entry == NULL) 6221 return(NULL); 6222 entry->key = kbase; 6223 if (type == XML_EXP_ATOM) { 6224 entry->exp_str = name; 6225 entry->c_max = 1; 6226 } else if (type == XML_EXP_COUNT) { 6227 entry->exp_min = min; 6228 entry->exp_max = max; 6229 entry->exp_left = left; 6230 if ((min == 0) || (IS_NILLABLE(left))) 6231 entry->info |= XML_EXP_NILABLE; 6232 if (max < 0) 6233 entry->c_max = -1; 6234 else 6235 entry->c_max = max * entry->exp_left->c_max; 6236 } else { 6237 entry->exp_left = left; 6238 entry->exp_right = right; 6239 if (type == XML_EXP_OR) { 6240 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6241 entry->info |= XML_EXP_NILABLE; 6242 if ((entry->exp_left->c_max == -1) || 6243 (entry->exp_right->c_max == -1)) 6244 entry->c_max = -1; 6245 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6246 entry->c_max = entry->exp_left->c_max; 6247 else 6248 entry->c_max = entry->exp_right->c_max; 6249 } else { 6250 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6251 entry->info |= XML_EXP_NILABLE; 6252 if ((entry->exp_left->c_max == -1) || 6253 (entry->exp_right->c_max == -1)) 6254 entry->c_max = -1; 6255 else 6256 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6257 } 6258 } 6259 entry->ref = 1; 6260 if (ctxt->table[key] != NULL) 6261 entry->next = ctxt->table[key]; 6262 6263 ctxt->table[key] = entry; 6264 ctxt->nbElems++; 6265 6266 return(entry); 6267} 6268 6269/** 6270 * xmlExpFree: 6271 * @ctxt: the expression context 6272 * @exp: the expression 6273 * 6274 * Dereference the expression 6275 */ 6276void 6277xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6278 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6279 return; 6280 exp->ref--; 6281 if (exp->ref == 0) { 6282 unsigned short key; 6283 6284 /* Unlink it first from the hash table */ 6285 key = exp->key % ctxt->size; 6286 if (ctxt->table[key] == exp) { 6287 ctxt->table[key] = exp->next; 6288 } else { 6289 xmlExpNodePtr tmp; 6290 6291 tmp = ctxt->table[key]; 6292 while (tmp != NULL) { 6293 if (tmp->next == exp) { 6294 tmp->next = exp->next; 6295 break; 6296 } 6297 tmp = tmp->next; 6298 } 6299 } 6300 6301 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6302 xmlExpFree(ctxt, exp->exp_left); 6303 xmlExpFree(ctxt, exp->exp_right); 6304 } else if (exp->type == XML_EXP_COUNT) { 6305 xmlExpFree(ctxt, exp->exp_left); 6306 } 6307 xmlFree(exp); 6308 ctxt->nb_nodes--; 6309 } 6310} 6311 6312/** 6313 * xmlExpRef: 6314 * @exp: the expression 6315 * 6316 * Increase the reference count of the expression 6317 */ 6318void 6319xmlExpRef(xmlExpNodePtr exp) { 6320 if (exp != NULL) 6321 exp->ref++; 6322} 6323 6324/** 6325 * xmlExpNewAtom: 6326 * @ctxt: the expression context 6327 * @name: the atom name 6328 * @len: the atom name lenght in byte (or -1); 6329 * 6330 * Get the atom associated to this name from that context 6331 * 6332 * Returns the node or NULL in case of error 6333 */ 6334xmlExpNodePtr 6335xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6336 if ((ctxt == NULL) || (name == NULL)) 6337 return(NULL); 6338 name = xmlDictLookup(ctxt->dict, name, len); 6339 if (name == NULL) 6340 return(NULL); 6341 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6342} 6343 6344/** 6345 * xmlExpNewOr: 6346 * @ctxt: the expression context 6347 * @left: left expression 6348 * @right: right expression 6349 * 6350 * Get the atom associated to the choice @left | @right 6351 * Note that @left and @right are consumed in the operation, to keep 6352 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6353 * this is true even in case of failure (unless ctxt == NULL). 6354 * 6355 * Returns the node or NULL in case of error 6356 */ 6357xmlExpNodePtr 6358xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6359 if ((ctxt == NULL) || (left == NULL) || (right == NULL)) { 6360 xmlExpFree(ctxt, left); 6361 xmlExpFree(ctxt, right); 6362 return(NULL); 6363 } 6364 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6365} 6366 6367/** 6368 * xmlExpNewSeq: 6369 * @ctxt: the expression context 6370 * @left: left expression 6371 * @right: right expression 6372 * 6373 * Get the atom associated to the sequence @left , @right 6374 * Note that @left and @right are consumed in the operation, to keep 6375 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6376 * this is true even in case of failure (unless ctxt == NULL). 6377 * 6378 * Returns the node or NULL in case of error 6379 */ 6380xmlExpNodePtr 6381xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6382 if ((ctxt == NULL) || (left == NULL) || (right == NULL)) { 6383 xmlExpFree(ctxt, left); 6384 xmlExpFree(ctxt, right); 6385 return(NULL); 6386 } 6387 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6388} 6389 6390/** 6391 * xmlExpNewRange: 6392 * @ctxt: the expression context 6393 * @subset: the expression to be repeated 6394 * @min: the lower bound for the repetition 6395 * @max: the upper bound for the repetition, -1 means infinite 6396 * 6397 * Get the atom associated to the range (@subset){@min, @max} 6398 * Note that @subset is consumed in the operation, to keep 6399 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6400 * this is true even in case of failure (unless ctxt == NULL). 6401 * 6402 * Returns the node or NULL in case of error 6403 */ 6404xmlExpNodePtr 6405xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6406 if ((ctxt == NULL) || (subset == NULL) || (min < 0) || (max < -1) || 6407 ((max >= 0) && (min > max))) { 6408 xmlExpFree(ctxt, subset); 6409 return(NULL); 6410 } 6411 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6412 NULL, NULL, min, max)); 6413} 6414 6415/************************************************************************ 6416 * * 6417 * Public API for operations on expressions * 6418 * * 6419 ************************************************************************/ 6420 6421static int 6422xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6423 const xmlChar**list, int len, int nb) { 6424 int tmp, tmp2; 6425tail: 6426 switch (exp->type) { 6427 case XML_EXP_EMPTY: 6428 return(0); 6429 case XML_EXP_ATOM: 6430 for (tmp = 0;tmp < nb;tmp++) 6431 if (list[tmp] == exp->exp_str) 6432 return(0); 6433 if (nb >= len) 6434 return(-2); 6435 list[nb++] = exp->exp_str; 6436 return(1); 6437 case XML_EXP_COUNT: 6438 exp = exp->exp_left; 6439 goto tail; 6440 case XML_EXP_SEQ: 6441 case XML_EXP_OR: 6442 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6443 if (tmp < 0) 6444 return(tmp); 6445 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6446 nb + tmp); 6447 if (tmp2 < 0) 6448 return(tmp2); 6449 return(tmp + tmp2); 6450 } 6451 return(-1); 6452} 6453 6454/** 6455 * xmlExpGetLanguage: 6456 * @ctxt: the expression context 6457 * @exp: the expression 6458 * @list: where to store the tokens 6459 * @len: the allocated lenght of @list 6460 * 6461 * Find all the strings used in @exp and store them in @list 6462 * 6463 * Returns the number of unique strings found, -1 in case of errors and 6464 * -2 if there is more than @len strings 6465 */ 6466int 6467xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6468 const xmlChar**list, int len) { 6469 if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0)) 6470 return(-1); 6471 return(xmlExpGetLanguageInt(ctxt, exp, list, len, 0)); 6472} 6473 6474static int 6475xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6476 const xmlChar**list, int len, int nb) { 6477 int tmp, tmp2; 6478tail: 6479 switch (exp->type) { 6480 case XML_EXP_FORBID: 6481 return(0); 6482 case XML_EXP_EMPTY: 6483 return(0); 6484 case XML_EXP_ATOM: 6485 for (tmp = 0;tmp < nb;tmp++) 6486 if (list[tmp] == exp->exp_str) 6487 return(0); 6488 if (nb >= len) 6489 return(-2); 6490 list[nb++] = exp->exp_str; 6491 return(1); 6492 case XML_EXP_COUNT: 6493 exp = exp->exp_left; 6494 goto tail; 6495 case XML_EXP_SEQ: 6496 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6497 if (tmp < 0) 6498 return(tmp); 6499 if (IS_NILLABLE(exp->exp_left)) { 6500 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6501 nb + tmp); 6502 if (tmp2 < 0) 6503 return(tmp2); 6504 tmp += tmp2; 6505 } 6506 return(tmp); 6507 case XML_EXP_OR: 6508 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6509 if (tmp < 0) 6510 return(tmp); 6511 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6512 nb + tmp); 6513 if (tmp2 < 0) 6514 return(tmp2); 6515 return(tmp + tmp2); 6516 } 6517 return(-1); 6518} 6519 6520/** 6521 * xmlExpGetStart: 6522 * @ctxt: the expression context 6523 * @exp: the expression 6524 * @list: where to store the tokens 6525 * @len: the allocated lenght of @list 6526 * 6527 * Find all the strings that appears at the start of the languages 6528 * accepted by @exp and store them in @list. E.g. for (a, b) | c 6529 * it will return the list [a, c] 6530 * 6531 * Returns the number of unique strings found, -1 in case of errors and 6532 * -2 if there is more than @len strings 6533 */ 6534int 6535xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6536 const xmlChar**list, int len) { 6537 if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0)) 6538 return(-1); 6539 return(xmlExpGetStartInt(ctxt, exp, list, len, 0)); 6540} 6541 6542/** 6543 * xmlExpIsNillable: 6544 * @exp: the expression 6545 * 6546 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 6547 * 6548 * Returns 1 if nillable, 0 if not and -1 in case of error 6549 */ 6550int 6551xmlExpIsNillable(xmlExpNodePtr exp) { 6552 if (exp == NULL) 6553 return(-1); 6554 return(IS_NILLABLE(exp) != 0); 6555} 6556 6557static xmlExpNodePtr 6558xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 6559{ 6560 xmlExpNodePtr ret; 6561 6562 switch (exp->type) { 6563 case XML_EXP_EMPTY: 6564 return(forbiddenExp); 6565 case XML_EXP_FORBID: 6566 return(forbiddenExp); 6567 case XML_EXP_ATOM: 6568 if (exp->exp_str == str) { 6569#ifdef DEBUG_DERIV 6570 printf("deriv atom: equal => Empty\n"); 6571#endif 6572 ret = emptyExp; 6573 } else { 6574#ifdef DEBUG_DERIV 6575 printf("deriv atom: mismatch => forbid\n"); 6576#endif 6577 /* TODO wildcards here */ 6578 ret = forbiddenExp; 6579 } 6580 return(ret); 6581 case XML_EXP_OR: { 6582 xmlExpNodePtr tmp; 6583 6584#ifdef DEBUG_DERIV 6585 printf("deriv or: => or(derivs)\n"); 6586#endif 6587 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6588 if (tmp == NULL) { 6589 return(NULL); 6590 } 6591 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 6592 if (ret == NULL) { 6593 xmlExpFree(ctxt, tmp); 6594 return(NULL); 6595 } 6596 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 6597 NULL, 0, 0); 6598 return(ret); 6599 } 6600 case XML_EXP_SEQ: 6601#ifdef DEBUG_DERIV 6602 printf("deriv seq: starting with left\n"); 6603#endif 6604 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6605 if (ret == NULL) { 6606 return(NULL); 6607 } else if (ret == forbiddenExp) { 6608 if (IS_NILLABLE(exp->exp_left)) { 6609#ifdef DEBUG_DERIV 6610 printf("deriv seq: left failed but nillable\n"); 6611#endif 6612 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 6613 } 6614 } else { 6615#ifdef DEBUG_DERIV 6616 printf("deriv seq: left match => sequence\n"); 6617#endif 6618 exp->exp_right->ref++; 6619 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 6620 NULL, 0, 0); 6621 } 6622 return(ret); 6623 case XML_EXP_COUNT: { 6624 int min, max; 6625 xmlExpNodePtr tmp; 6626 6627 if (exp->exp_max == 0) 6628 return(forbiddenExp); 6629 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6630 if (ret == NULL) 6631 return(NULL); 6632 if (ret == forbiddenExp) { 6633#ifdef DEBUG_DERIV 6634 printf("deriv count: pattern mismatch => forbid\n"); 6635#endif 6636 return(ret); 6637 } 6638 if (exp->exp_max == 1) 6639 return(ret); 6640 if (exp->exp_max < 0) /* unbounded */ 6641 max = -1; 6642 else 6643 max = exp->exp_max - 1; 6644 if (exp->exp_min > 0) 6645 min = exp->exp_min - 1; 6646 else 6647 min = 0; 6648 exp->exp_left->ref++; 6649 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 6650 NULL, min, max); 6651 if (ret == emptyExp) { 6652#ifdef DEBUG_DERIV 6653 printf("deriv count: match to empty => new count\n"); 6654#endif 6655 return(tmp); 6656 } 6657#ifdef DEBUG_DERIV 6658 printf("deriv count: match => sequence with new count\n"); 6659#endif 6660 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 6661 NULL, 0, 0)); 6662 } 6663 } 6664 return(NULL); 6665} 6666 6667/** 6668 * xmlExpStringDerive: 6669 * @ctxt: the expression context 6670 * @exp: the expression 6671 * @str: the string 6672 * @len: the string len in bytes if available 6673 * 6674 * Do one step of Brzozowski derivation of the expression @exp with 6675 * respect to the input string 6676 * 6677 * Returns the resulting expression or NULL in case of internal error 6678 */ 6679xmlExpNodePtr 6680xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6681 const xmlChar *str, int len) { 6682 const xmlChar *input; 6683 6684 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 6685 return(NULL); 6686 } 6687 /* 6688 * check the string is in the dictionnary, if yes use an interned 6689 * copy, otherwise we know it's not an acceptable input 6690 */ 6691 input = xmlDictExists(ctxt->dict, str, len); 6692 if (input == NULL) { 6693 return(forbiddenExp); 6694 } 6695 return(xmlExpStringDeriveInt(ctxt, exp, input)); 6696} 6697 6698static int 6699xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 6700 int ret = 1; 6701 6702 if (sub->c_max == -1) { 6703 if (exp->c_max != -1) 6704 ret = 0; 6705 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 6706 ret = 0; 6707 } 6708#if 0 6709 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 6710 ret = 0; 6711#endif 6712 return(ret); 6713} 6714 6715static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6716 xmlExpNodePtr sub); 6717/** 6718 * xmlExpDivide: 6719 * @ctxt: the expressions context 6720 * @exp: the englobing expression 6721 * @sub: the subexpression 6722 * @mult: the multiple expression 6723 * @remain: the remain from the derivation of the multiple 6724 * 6725 * Check if exp is a multiple of sub, i.e. if there is a finite number n 6726 * so that sub{n} subsume exp 6727 * 6728 * Returns the multiple value if successful, 0 if it is not a multiple 6729 * and -1 in case of internel error. 6730 */ 6731 6732static int 6733xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 6734 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 6735 int i; 6736 xmlExpNodePtr tmp, tmp2; 6737 6738 if (mult != NULL) *mult = NULL; 6739 if (remain != NULL) *remain = NULL; 6740 if (exp->c_max == -1) return(0); 6741 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 6742 6743 for (i = 1;i <= exp->c_max;i++) { 6744 sub->ref++; 6745 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 6746 sub, NULL, NULL, i, i); 6747 if (tmp == NULL) { 6748 return(-1); 6749 } 6750 if (!xmlExpCheckCard(tmp, exp)) { 6751 xmlExpFree(ctxt, tmp); 6752 continue; 6753 } 6754 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 6755 if (tmp2 == NULL) { 6756 xmlExpFree(ctxt, tmp); 6757 return(-1); 6758 } 6759 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 6760 if (remain != NULL) 6761 *remain = tmp2; 6762 else 6763 xmlExpFree(ctxt, tmp2); 6764 if (mult != NULL) 6765 *mult = tmp; 6766 else 6767 xmlExpFree(ctxt, tmp); 6768#ifdef DEBUG_DERIV 6769 printf("Divide succeeded %d\n", i); 6770#endif 6771 return(i); 6772 } 6773 xmlExpFree(ctxt, tmp); 6774 xmlExpFree(ctxt, tmp2); 6775 } 6776#ifdef DEBUG_DERIV 6777 printf("Divide failed\n"); 6778#endif 6779 return(0); 6780} 6781 6782/** 6783 * xmlExpExpDeriveInt: 6784 * @ctxt: the expressions context 6785 * @exp: the englobing expression 6786 * @sub: the subexpression 6787 * 6788 * Try to do a step of Brzozowski derivation but at a higher level 6789 * the input being a subexpression. 6790 * 6791 * Returns the resulting expression or NULL in case of internal error 6792 */ 6793static xmlExpNodePtr 6794xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 6795 xmlExpNodePtr ret, tmp, tmp2, tmp3; 6796 const xmlChar **tab; 6797 int len, i; 6798 6799 /* 6800 * In case of equality and if the expression can only consume a finite 6801 * amount, then the derivation is empty 6802 */ 6803 if ((exp == sub) && (exp->c_max >= 0)) { 6804#ifdef DEBUG_DERIV 6805 printf("Equal(exp, sub) and finite -> Empty\n"); 6806#endif 6807 return(emptyExp); 6808 } 6809 /* 6810 * decompose sub sequence first 6811 */ 6812 if (sub->type == XML_EXP_EMPTY) { 6813#ifdef DEBUG_DERIV 6814 printf("Empty(sub) -> Empty\n"); 6815#endif 6816 exp->ref++; 6817 return(exp); 6818 } 6819 if (sub->type == XML_EXP_SEQ) { 6820#ifdef DEBUG_DERIV 6821 printf("Seq(sub) -> decompose\n"); 6822#endif 6823 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 6824 if (tmp == NULL) 6825 return(NULL); 6826 if (tmp == forbiddenExp) 6827 return(tmp); 6828 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 6829 xmlExpFree(ctxt, tmp); 6830 return(ret); 6831 } 6832 if (sub->type == XML_EXP_OR) { 6833#ifdef DEBUG_DERIV 6834 printf("Or(sub) -> decompose\n"); 6835#endif 6836 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 6837 if (tmp == forbiddenExp) 6838 return(tmp); 6839 if (tmp == NULL) 6840 return(NULL); 6841 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 6842 if ((ret == NULL) || (ret == forbiddenExp)) { 6843 xmlExpFree(ctxt, tmp); 6844 return(ret); 6845 } 6846 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 6847 } 6848 if (!xmlExpCheckCard(exp, sub)) { 6849#ifdef DEBUG_DERIV 6850 printf("CheckCard(exp, sub) failed -> Forbid\n"); 6851#endif 6852 return(forbiddenExp); 6853 } 6854 switch (exp->type) { 6855 case XML_EXP_EMPTY: 6856 if (sub == emptyExp) 6857 return(emptyExp); 6858#ifdef DEBUG_DERIV 6859 printf("Empty(exp) -> Forbid\n"); 6860#endif 6861 return(forbiddenExp); 6862 case XML_EXP_FORBID: 6863#ifdef DEBUG_DERIV 6864 printf("Forbid(exp) -> Forbid\n"); 6865#endif 6866 return(forbiddenExp); 6867 case XML_EXP_ATOM: 6868 if (sub->type == XML_EXP_ATOM) { 6869 /* TODO: handle wildcards */ 6870 if (exp->exp_str == sub->exp_str) { 6871#ifdef DEBUG_DERIV 6872 printf("Atom match -> Empty\n"); 6873#endif 6874 return(emptyExp); 6875 } 6876#ifdef DEBUG_DERIV 6877 printf("Atom mismatch -> Forbid\n"); 6878#endif 6879 return(forbiddenExp); 6880 } 6881 if ((sub->type == XML_EXP_COUNT) && 6882 (sub->exp_max == 1) && 6883 (sub->exp_left->type == XML_EXP_ATOM)) { 6884 /* TODO: handle wildcards */ 6885 if (exp->exp_str == sub->exp_left->exp_str) { 6886#ifdef DEBUG_DERIV 6887 printf("Atom match -> Empty\n"); 6888#endif 6889 return(emptyExp); 6890 } 6891#ifdef DEBUG_DERIV 6892 printf("Atom mismatch -> Forbid\n"); 6893#endif 6894 return(forbiddenExp); 6895 } 6896#ifdef DEBUG_DERIV 6897 printf("Compex exp vs Atom -> Forbid\n"); 6898#endif 6899 return(forbiddenExp); 6900 case XML_EXP_SEQ: 6901 /* try to get the sequence consumed only if possible */ 6902 if (xmlExpCheckCard(exp->exp_left, sub)) { 6903 /* See if the sequence can be consumed directly */ 6904#ifdef DEBUG_DERIV 6905 printf("Seq trying left only\n"); 6906#endif 6907 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 6908 if ((ret != forbiddenExp) && (ret != NULL)) { 6909#ifdef DEBUG_DERIV 6910 printf("Seq trying left only worked\n"); 6911#endif 6912 /* 6913 * TODO: assumption here that we are determinist 6914 * i.e. we won't get to a nillable exp left 6915 * subset which could be matched by the right 6916 * part too. 6917 * e.g.: (a | b)+,(a | c) and 'a+,a' 6918 */ 6919 exp->exp_right->ref++; 6920 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 6921 exp->exp_right, NULL, 0, 0)); 6922 } 6923#ifdef DEBUG_DERIV 6924 } else { 6925 printf("Seq: left too short\n"); 6926#endif 6927 } 6928 /* Try instead to decompose */ 6929 if (sub->type == XML_EXP_COUNT) { 6930 int min, max; 6931 6932#ifdef DEBUG_DERIV 6933 printf("Seq: sub is a count\n"); 6934#endif 6935 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 6936 if (ret == NULL) 6937 return(NULL); 6938 if (ret != forbiddenExp) { 6939#ifdef DEBUG_DERIV 6940 printf("Seq , Count match on left\n"); 6941#endif 6942 if (sub->exp_max < 0) 6943 max = -1; 6944 else 6945 max = sub->exp_max -1; 6946 if (sub->exp_min > 0) 6947 min = sub->exp_min -1; 6948 else 6949 min = 0; 6950 exp->exp_right->ref++; 6951 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 6952 exp->exp_right, NULL, 0, 0); 6953 if (tmp == NULL) 6954 return(NULL); 6955 6956 sub->exp_left->ref++; 6957 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 6958 sub->exp_left, NULL, NULL, min, max); 6959 if (tmp2 == NULL) { 6960 xmlExpFree(ctxt, tmp); 6961 return(NULL); 6962 } 6963 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 6964 xmlExpFree(ctxt, tmp); 6965 xmlExpFree(ctxt, tmp2); 6966 return(ret); 6967 } 6968 } 6969 /* we made no progress on structured operations */ 6970 break; 6971 case XML_EXP_OR: 6972#ifdef DEBUG_DERIV 6973 printf("Or , trying both side\n"); 6974#endif 6975 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 6976 if (ret == NULL) 6977 return(NULL); 6978 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 6979 if (tmp == NULL) { 6980 xmlExpFree(ctxt, ret); 6981 return(NULL); 6982 } 6983 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 6984 case XML_EXP_COUNT: { 6985 int min, max; 6986 6987 if (sub->type == XML_EXP_COUNT) { 6988 /* 6989 * Try to see if the loop is completely subsumed 6990 */ 6991 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 6992 if (tmp == NULL) 6993 return(NULL); 6994 if (tmp == forbiddenExp) { 6995 int mult; 6996 6997#ifdef DEBUG_DERIV 6998 printf("Count, Count inner don't subsume\n"); 6999#endif 7000 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7001 NULL, &tmp); 7002 if (mult <= 0) { 7003#ifdef DEBUG_DERIV 7004 printf("Count, Count not multiple => forbidden\n"); 7005#endif 7006 return(forbiddenExp); 7007 } 7008 if (sub->exp_max == -1) { 7009 max = -1; 7010 if (exp->exp_max == -1) { 7011 if (exp->exp_min <= sub->exp_min * mult) 7012 min = 0; 7013 else 7014 min = exp->exp_min - sub->exp_min * mult; 7015 } else { 7016#ifdef DEBUG_DERIV 7017 printf("Count, Count finite can't subsume infinite\n"); 7018#endif 7019 xmlExpFree(ctxt, tmp); 7020 return(forbiddenExp); 7021 } 7022 } else { 7023 if (exp->exp_max == -1) { 7024#ifdef DEBUG_DERIV 7025 printf("Infinite loop consume mult finite loop\n"); 7026#endif 7027 if (exp->exp_min > sub->exp_min * mult) { 7028 max = -1; 7029 min = exp->exp_min - sub->exp_min * mult; 7030 } else { 7031 max = -1; 7032 min = 0; 7033 } 7034 } else { 7035 if (exp->exp_max < sub->exp_max * mult) { 7036#ifdef DEBUG_DERIV 7037 printf("loops max mult mismatch => forbidden\n"); 7038#endif 7039 xmlExpFree(ctxt, tmp); 7040 return(forbiddenExp); 7041 } 7042 if (sub->exp_max * mult > exp->exp_min) 7043 min = 0; 7044 else 7045 min = exp->exp_min - sub->exp_max * mult; 7046 max = exp->exp_max - sub->exp_max * mult; 7047 } 7048 } 7049 } else if (!IS_NILLABLE(tmp)) { 7050 /* 7051 * TODO: loop here to try to grow if working on finite 7052 * blocks. 7053 */ 7054#ifdef DEBUG_DERIV 7055 printf("Count, Count remain not nillable => forbidden\n"); 7056#endif 7057 xmlExpFree(ctxt, tmp); 7058 return(forbiddenExp); 7059 } else if (sub->exp_max == -1) { 7060 if (exp->exp_max == -1) { 7061 if (exp->exp_min <= sub->exp_min) { 7062#ifdef DEBUG_DERIV 7063 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7064#endif 7065 max = -1; 7066 min = 0; 7067 } else { 7068#ifdef DEBUG_DERIV 7069 printf("Infinite loops min => Count(X,Inf)\n"); 7070#endif 7071 max = -1; 7072 min = exp->exp_min - sub->exp_min; 7073 } 7074 } else if (exp->exp_min > sub->exp_min) { 7075#ifdef DEBUG_DERIV 7076 printf("loops min mismatch 1 => forbidden ???\n"); 7077#endif 7078 xmlExpFree(ctxt, tmp); 7079 return(forbiddenExp); 7080 } else { 7081 max = -1; 7082 min = 0; 7083 } 7084 } else { 7085 if (exp->exp_max == -1) { 7086#ifdef DEBUG_DERIV 7087 printf("Infinite loop consume finite loop\n"); 7088#endif 7089 if (exp->exp_min > sub->exp_min) { 7090 max = -1; 7091 min = exp->exp_min - sub->exp_min; 7092 } else { 7093 max = -1; 7094 min = 0; 7095 } 7096 } else { 7097 if (exp->exp_max < sub->exp_max) { 7098#ifdef DEBUG_DERIV 7099 printf("loops max mismatch => forbidden\n"); 7100#endif 7101 xmlExpFree(ctxt, tmp); 7102 return(forbiddenExp); 7103 } 7104 if (sub->exp_max > exp->exp_min) 7105 min = 0; 7106 else 7107 min = exp->exp_min - sub->exp_max; 7108 max = exp->exp_max - sub->exp_max; 7109 } 7110 } 7111#ifdef DEBUG_DERIV 7112 printf("loops match => SEQ(COUNT())\n"); 7113#endif 7114 exp->exp_left->ref++; 7115 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7116 NULL, NULL, min, max); 7117 if (tmp2 == NULL) { 7118 return(NULL); 7119 } 7120 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7121 NULL, 0, 0); 7122 return(ret); 7123 } 7124 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7125 if (tmp == NULL) 7126 return(NULL); 7127 if (tmp == forbiddenExp) { 7128#ifdef DEBUG_DERIV 7129 printf("loop mismatch => forbidden\n"); 7130#endif 7131 return(forbiddenExp); 7132 } 7133 if (exp->exp_min > 0) 7134 min = exp->exp_min - 1; 7135 else 7136 min = 0; 7137 if (exp->exp_max < 0) 7138 max = -1; 7139 else 7140 max = exp->exp_max - 1; 7141 7142#ifdef DEBUG_DERIV 7143 printf("loop match => SEQ(COUNT())\n"); 7144#endif 7145 exp->exp_left->ref++; 7146 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7147 NULL, NULL, min, max); 7148 if (tmp2 == NULL) 7149 return(NULL); 7150 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7151 NULL, 0, 0); 7152 return(ret); 7153 } 7154 } 7155 7156#ifdef DEBUG_DERIV 7157 printf("Fallback to derivative\n"); 7158#endif 7159 if (IS_NILLABLE(sub)) { 7160 if (!(IS_NILLABLE(exp))) 7161 return(forbiddenExp); 7162 else 7163 ret = emptyExp; 7164 } else 7165 ret = NULL; 7166 /* 7167 * here the structured derivation made no progress so 7168 * we use the default token based derivation to force one more step 7169 */ 7170 if (ctxt->tabSize == 0) 7171 ctxt->tabSize = 40; 7172 7173 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7174 sizeof(const xmlChar *)); 7175 if (tab == NULL) { 7176 return(NULL); 7177 } 7178 7179 /* 7180 * collect all the strings accepted by the subexpression on input 7181 */ 7182 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7183 while (len < 0) { 7184 const xmlChar **temp; 7185 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7186 sizeof(const xmlChar *)); 7187 if (temp == NULL) { 7188 xmlFree((xmlChar **) tab); 7189 return(NULL); 7190 } 7191 tab = temp; 7192 ctxt->tabSize *= 2; 7193 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7194 } 7195 for (i = 0;i < len;i++) { 7196 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7197 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7198 xmlExpFree(ctxt, ret); 7199 xmlFree((xmlChar **) tab); 7200 return(tmp); 7201 } 7202 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7203 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7204 xmlExpFree(ctxt, tmp); 7205 xmlExpFree(ctxt, ret); 7206 xmlFree((xmlChar **) tab); 7207 return(tmp); 7208 } 7209 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7210 xmlExpFree(ctxt, tmp); 7211 xmlExpFree(ctxt, tmp2); 7212 7213 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7214 xmlExpFree(ctxt, ret); 7215 xmlFree((xmlChar **) tab); 7216 return(tmp3); 7217 } 7218 7219 if (ret == NULL) 7220 ret = tmp3; 7221 else { 7222 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7223 if (ret == NULL) { 7224 xmlFree((xmlChar **) tab); 7225 return(NULL); 7226 } 7227 } 7228 } 7229 xmlFree((xmlChar **) tab); 7230 return(ret); 7231} 7232 7233/** 7234 * xmlExpExpDerive: 7235 * @ctxt: the expressions context 7236 * @exp: the englobing expression 7237 * @sub: the subexpression 7238 * 7239 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7240 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7241 * it usually tatkes less than linear time and can handle expressions generating 7242 * infinite languages. 7243 * 7244 * Returns the resulting expression or NULL in case of internal error, the 7245 * result must be freed 7246 */ 7247xmlExpNodePtr 7248xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7249 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7250 return(NULL); 7251 7252 /* 7253 * O(1) speedups 7254 */ 7255 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7256#ifdef DEBUG_DERIV 7257 printf("Sub nillable and not exp : can't subsume\n"); 7258#endif 7259 return(forbiddenExp); 7260 } 7261 if (xmlExpCheckCard(exp, sub) == 0) { 7262#ifdef DEBUG_DERIV 7263 printf("sub generate longuer sequances than exp : can't subsume\n"); 7264#endif 7265 return(forbiddenExp); 7266 } 7267 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7268} 7269 7270/** 7271 * xmlExpSubsume: 7272 * @ctxt: the expressions context 7273 * @exp: the englobing expression 7274 * @sub: the subexpression 7275 * 7276 * Check whether @exp accepts all the languages accexpted by @sub 7277 * the input being a subexpression. 7278 * 7279 * Returns 1 if true 0 if false and -1 in case of failure. 7280 */ 7281int 7282xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7283 xmlExpNodePtr tmp; 7284 7285 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7286 return(-1); 7287 7288 /* 7289 * TODO: speedup by checking the language of sub is a subset of the 7290 * language of exp 7291 */ 7292 /* 7293 * O(1) speedups 7294 */ 7295 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7296#ifdef DEBUG_DERIV 7297 printf("Sub nillable and not exp : can't subsume\n"); 7298#endif 7299 return(0); 7300 } 7301 if (xmlExpCheckCard(exp, sub) == 0) { 7302#ifdef DEBUG_DERIV 7303 printf("sub generate longuer sequances than exp : can't subsume\n"); 7304#endif 7305 return(0); 7306 } 7307 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7308#ifdef DEBUG_DERIV 7309 printf("Result derivation :\n"); 7310 PRINT_EXP(tmp); 7311#endif 7312 if (tmp == NULL) 7313 return(-1); 7314 if (tmp == forbiddenExp) 7315 return(0); 7316 if (tmp == emptyExp) 7317 return(1); 7318 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7319 xmlExpFree(ctxt, tmp); 7320 return(1); 7321 } 7322 xmlExpFree(ctxt, tmp); 7323 return(0); 7324} 7325 7326/************************************************************************ 7327 * * 7328 * Parsing expression * 7329 * * 7330 ************************************************************************/ 7331 7332static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7333 7334#undef CUR 7335#define CUR (*ctxt->cur) 7336#undef NEXT 7337#define NEXT ctxt->cur++; 7338#undef IS_BLANK 7339#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7340#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7341 7342static int 7343xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7344 int ret = 0; 7345 7346 SKIP_BLANKS 7347 if (CUR == '*') { 7348 NEXT 7349 return(-1); 7350 } 7351 if ((CUR < '0') || (CUR > '9')) 7352 return(-1); 7353 while ((CUR >= '0') && (CUR <= '9')) { 7354 ret = ret * 10 + (CUR - '0'); 7355 NEXT 7356 } 7357 return(ret); 7358} 7359 7360static xmlExpNodePtr 7361xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7362 const char *base; 7363 xmlExpNodePtr ret; 7364 const xmlChar *val; 7365 7366 SKIP_BLANKS 7367 base = ctxt->cur; 7368 if (*ctxt->cur == '(') { 7369 NEXT 7370 ret = xmlExpParseExpr(ctxt); 7371 SKIP_BLANKS 7372 if (*ctxt->cur != ')') { 7373 fprintf(stderr, "unbalanced '(' : %s\n", base); 7374 xmlExpFree(ctxt, ret); 7375 return(NULL); 7376 } 7377 NEXT; 7378 SKIP_BLANKS 7379 goto parse_quantifier; 7380 } 7381 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7382 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7383 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7384 NEXT; 7385 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7386 if (val == NULL) 7387 return(NULL); 7388 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7389 if (ret == NULL) 7390 return(NULL); 7391 SKIP_BLANKS 7392parse_quantifier: 7393 if (CUR == '{') { 7394 int min, max; 7395 7396 NEXT 7397 min = xmlExpParseNumber(ctxt); 7398 if (min < 0) { 7399 xmlExpFree(ctxt, ret); 7400 return(NULL); 7401 } 7402 SKIP_BLANKS 7403 if (CUR == ',') { 7404 NEXT 7405 max = xmlExpParseNumber(ctxt); 7406 SKIP_BLANKS 7407 } else 7408 max = min; 7409 if (CUR != '}') { 7410 xmlExpFree(ctxt, ret); 7411 return(NULL); 7412 } 7413 NEXT 7414 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7415 min, max); 7416 SKIP_BLANKS 7417 } else if (CUR == '?') { 7418 NEXT 7419 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7420 0, 1); 7421 SKIP_BLANKS 7422 } else if (CUR == '+') { 7423 NEXT 7424 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7425 1, -1); 7426 SKIP_BLANKS 7427 } else if (CUR == '*') { 7428 NEXT 7429 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7430 0, -1); 7431 SKIP_BLANKS 7432 } 7433 return(ret); 7434} 7435 7436 7437static xmlExpNodePtr 7438xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7439 xmlExpNodePtr ret, right; 7440 7441 ret = xmlExpParseOr(ctxt); 7442 SKIP_BLANKS 7443 while (CUR == '|') { 7444 NEXT 7445 right = xmlExpParseOr(ctxt); 7446 if (right == NULL) { 7447 xmlExpFree(ctxt, ret); 7448 return(NULL); 7449 } 7450 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7451 if (ret == NULL) 7452 return(NULL); 7453 } 7454 return(ret); 7455} 7456 7457static xmlExpNodePtr 7458xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7459 xmlExpNodePtr ret, right; 7460 7461 ret = xmlExpParseSeq(ctxt); 7462 SKIP_BLANKS 7463 while (CUR == ',') { 7464 NEXT 7465 right = xmlExpParseSeq(ctxt); 7466 if (right == NULL) { 7467 xmlExpFree(ctxt, ret); 7468 return(NULL); 7469 } 7470 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7471 if (ret == NULL) 7472 return(NULL); 7473 } 7474 return(ret); 7475} 7476 7477/** 7478 * xmlExpParse: 7479 * @ctxt: the expressions context 7480 * @expr: the 0 terminated string 7481 * 7482 * Minimal parser for regexps, it understand the following constructs 7483 * - string terminals 7484 * - choice operator | 7485 * - sequence operator , 7486 * - subexpressions (...) 7487 * - usual cardinality operators + * and ? 7488 * - finite sequences { min, max } 7489 * - infinite sequences { min, * } 7490 * There is minimal checkings made especially no checking on strings values 7491 * 7492 * Returns a new expression or NULL in case of failure 7493 */ 7494xmlExpNodePtr 7495xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 7496 xmlExpNodePtr ret; 7497 7498 ctxt->expr = expr; 7499 ctxt->cur = expr; 7500 7501 ret = xmlExpParseExpr(ctxt); 7502 SKIP_BLANKS 7503 if (*ctxt->cur != 0) { 7504 xmlExpFree(ctxt, ret); 7505 return(NULL); 7506 } 7507 return(ret); 7508} 7509 7510static void 7511xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 7512 xmlExpNodePtr c; 7513 7514 if (expr == NULL) return; 7515 if (glob) xmlBufferWriteChar(buf, "("); 7516 switch (expr->type) { 7517 case XML_EXP_EMPTY: 7518 xmlBufferWriteChar(buf, "empty"); 7519 break; 7520 case XML_EXP_FORBID: 7521 xmlBufferWriteChar(buf, "forbidden"); 7522 break; 7523 case XML_EXP_ATOM: 7524 xmlBufferWriteCHAR(buf, expr->exp_str); 7525 break; 7526 case XML_EXP_SEQ: 7527 c = expr->exp_left; 7528 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7529 xmlExpDumpInt(buf, c, 1); 7530 else 7531 xmlExpDumpInt(buf, c, 0); 7532 xmlBufferWriteChar(buf, " , "); 7533 c = expr->exp_right; 7534 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7535 xmlExpDumpInt(buf, c, 1); 7536 else 7537 xmlExpDumpInt(buf, c, 0); 7538 break; 7539 case XML_EXP_OR: 7540 c = expr->exp_left; 7541 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7542 xmlExpDumpInt(buf, c, 1); 7543 else 7544 xmlExpDumpInt(buf, c, 0); 7545 xmlBufferWriteChar(buf, " | "); 7546 c = expr->exp_right; 7547 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7548 xmlExpDumpInt(buf, c, 1); 7549 else 7550 xmlExpDumpInt(buf, c, 0); 7551 break; 7552 case XML_EXP_COUNT: { 7553 char rep[40]; 7554 7555 c = expr->exp_left; 7556 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7557 xmlExpDumpInt(buf, c, 1); 7558 else 7559 xmlExpDumpInt(buf, c, 0); 7560 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 7561 rep[0] = '?'; 7562 rep[1] = 0; 7563 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 7564 rep[0] = '*'; 7565 rep[1] = 0; 7566 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 7567 rep[0] = '+'; 7568 rep[1] = 0; 7569 } else if (expr->exp_max == expr->exp_min) { 7570 snprintf(rep, 39, "{%d}", expr->exp_min); 7571 } else if (expr->exp_max < 0) { 7572 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 7573 } else { 7574 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 7575 } 7576 rep[39] = 0; 7577 xmlBufferWriteChar(buf, rep); 7578 break; 7579 } 7580 default: 7581 fprintf(stderr, "Error in tree\n"); 7582 } 7583 if (glob) 7584 xmlBufferWriteChar(buf, ")"); 7585} 7586/** 7587 * xmlExpDump: 7588 * @buf: a buffer to receive the output 7589 * @expr: the compiled expression 7590 * 7591 * Serialize the expression as compiled to the buffer 7592 */ 7593void 7594xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 7595 if ((buf == NULL) || (expr == NULL)) 7596 return; 7597 xmlExpDumpInt(buf, expr, 0); 7598} 7599 7600/** 7601 * xmlExpMaxToken: 7602 * @expr: a compiled expression 7603 * 7604 * Indicate the maximum number of input a expression can accept 7605 * 7606 * Returns the maximum length or -1 in case of error 7607 */ 7608int 7609xmlExpMaxToken(xmlExpNodePtr expr) { 7610 if (expr == NULL) 7611 return(-1); 7612 return(expr->c_max); 7613} 7614 7615/** 7616 * xmlExpCtxtNbNodes: 7617 * @ctxt: an expression context 7618 * 7619 * Debugging facility provides the number of allocated nodes at a that point 7620 * 7621 * Returns the number of nodes in use or -1 in case of error 7622 */ 7623int 7624xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 7625 if (ctxt == NULL) 7626 return(-1); 7627 return(ctxt->nb_nodes); 7628} 7629 7630/** 7631 * xmlExpCtxtNbCons: 7632 * @ctxt: an expression context 7633 * 7634 * Debugging facility provides the number of allocated nodes over lifetime 7635 * 7636 * Returns the number of nodes ever allocated or -1 in case of error 7637 */ 7638int 7639xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 7640 if (ctxt == NULL) 7641 return(-1); 7642 return(ctxt->nb_cons); 7643} 7644 7645#endif /* LIBXML_EXPR_ENABLED */ 7646#define bottom_xmlregexp 7647#include "elfgcchack.h" 7648#endif /* LIBXML_REGEXP_ENABLED */ 7649