xmlregexp.c revision 0ddb21c46ca6ac5297ff5f6537480de8463223ea
1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications thise includes: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22#include <stdio.h> 23#include <string.h> 24#ifdef HAVE_LIMITS_H 25#include <limits.h> 26#endif 27 28#include <libxml/tree.h> 29#include <libxml/parserInternals.h> 30#include <libxml/xmlregexp.h> 31#include <libxml/xmlautomata.h> 32#include <libxml/xmlunicode.h> 33 34#ifndef INT_MAX 35#define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 36#endif 37 38/* #define DEBUG_REGEXP_GRAPH */ 39/* #define DEBUG_REGEXP_EXEC */ 40/* #define DEBUG_PUSH */ 41/* #define DEBUG_COMPACTION */ 42 43#define ERROR(str) \ 44 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 45 xmlRegexpErrCompile(ctxt, str); 46#define NEXT ctxt->cur++ 47#define CUR (*(ctxt->cur)) 48#define NXT(index) (ctxt->cur[index]) 49 50#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 51#define NEXTL(l) ctxt->cur += l; 52 53/** 54 * TODO: 55 * 56 * macro to flag unimplemented blocks 57 */ 58#define TODO \ 59 xmlGenericError(xmlGenericErrorContext, \ 60 "Unimplemented block at %s:%d\n", \ 61 __FILE__, __LINE__); 62 63/************************************************************************ 64 * * 65 * Datatypes and structures * 66 * * 67 ************************************************************************/ 68 69typedef enum { 70 XML_REGEXP_EPSILON = 1, 71 XML_REGEXP_CHARVAL, 72 XML_REGEXP_RANGES, 73 XML_REGEXP_SUBREG, 74 XML_REGEXP_STRING, 75 XML_REGEXP_ANYCHAR, /* . */ 76 XML_REGEXP_ANYSPACE, /* \s */ 77 XML_REGEXP_NOTSPACE, /* \S */ 78 XML_REGEXP_INITNAME, /* \l */ 79 XML_REGEXP_NOTINITNAME, /* \l */ 80 XML_REGEXP_NAMECHAR, /* \c */ 81 XML_REGEXP_NOTNAMECHAR, /* \C */ 82 XML_REGEXP_DECIMAL, /* \d */ 83 XML_REGEXP_NOTDECIMAL, /* \d */ 84 XML_REGEXP_REALCHAR, /* \w */ 85 XML_REGEXP_NOTREALCHAR, /* \w */ 86 XML_REGEXP_LETTER, 87 XML_REGEXP_LETTER_UPPERCASE, 88 XML_REGEXP_LETTER_LOWERCASE, 89 XML_REGEXP_LETTER_TITLECASE, 90 XML_REGEXP_LETTER_MODIFIER, 91 XML_REGEXP_LETTER_OTHERS, 92 XML_REGEXP_MARK, 93 XML_REGEXP_MARK_NONSPACING, 94 XML_REGEXP_MARK_SPACECOMBINING, 95 XML_REGEXP_MARK_ENCLOSING, 96 XML_REGEXP_NUMBER, 97 XML_REGEXP_NUMBER_DECIMAL, 98 XML_REGEXP_NUMBER_LETTER, 99 XML_REGEXP_NUMBER_OTHERS, 100 XML_REGEXP_PUNCT, 101 XML_REGEXP_PUNCT_CONNECTOR, 102 XML_REGEXP_PUNCT_DASH, 103 XML_REGEXP_PUNCT_OPEN, 104 XML_REGEXP_PUNCT_CLOSE, 105 XML_REGEXP_PUNCT_INITQUOTE, 106 XML_REGEXP_PUNCT_FINQUOTE, 107 XML_REGEXP_PUNCT_OTHERS, 108 XML_REGEXP_SEPAR, 109 XML_REGEXP_SEPAR_SPACE, 110 XML_REGEXP_SEPAR_LINE, 111 XML_REGEXP_SEPAR_PARA, 112 XML_REGEXP_SYMBOL, 113 XML_REGEXP_SYMBOL_MATH, 114 XML_REGEXP_SYMBOL_CURRENCY, 115 XML_REGEXP_SYMBOL_MODIFIER, 116 XML_REGEXP_SYMBOL_OTHERS, 117 XML_REGEXP_OTHER, 118 XML_REGEXP_OTHER_CONTROL, 119 XML_REGEXP_OTHER_FORMAT, 120 XML_REGEXP_OTHER_PRIVATE, 121 XML_REGEXP_OTHER_NA, 122 XML_REGEXP_BLOCK_NAME 123} xmlRegAtomType; 124 125typedef enum { 126 XML_REGEXP_QUANT_EPSILON = 1, 127 XML_REGEXP_QUANT_ONCE, 128 XML_REGEXP_QUANT_OPT, 129 XML_REGEXP_QUANT_MULT, 130 XML_REGEXP_QUANT_PLUS, 131 XML_REGEXP_QUANT_ONCEONLY, 132 XML_REGEXP_QUANT_ALL, 133 XML_REGEXP_QUANT_RANGE 134} xmlRegQuantType; 135 136typedef enum { 137 XML_REGEXP_START_STATE = 1, 138 XML_REGEXP_FINAL_STATE, 139 XML_REGEXP_TRANS_STATE 140} xmlRegStateType; 141 142typedef enum { 143 XML_REGEXP_MARK_NORMAL = 0, 144 XML_REGEXP_MARK_START, 145 XML_REGEXP_MARK_VISITED 146} xmlRegMarkedType; 147 148typedef struct _xmlRegRange xmlRegRange; 149typedef xmlRegRange *xmlRegRangePtr; 150 151struct _xmlRegRange { 152 int neg; /* 0 normal, 1 not, 2 exclude */ 153 xmlRegAtomType type; 154 int start; 155 int end; 156 xmlChar *blockName; 157}; 158 159typedef struct _xmlRegAtom xmlRegAtom; 160typedef xmlRegAtom *xmlRegAtomPtr; 161 162typedef struct _xmlAutomataState xmlRegState; 163typedef xmlRegState *xmlRegStatePtr; 164 165struct _xmlRegAtom { 166 int no; 167 xmlRegAtomType type; 168 xmlRegQuantType quant; 169 int min; 170 int max; 171 172 void *valuep; 173 void *valuep2; 174 int neg; 175 int codepoint; 176 xmlRegStatePtr start; 177 xmlRegStatePtr stop; 178 int maxRanges; 179 int nbRanges; 180 xmlRegRangePtr *ranges; 181 void *data; 182}; 183 184typedef struct _xmlRegCounter xmlRegCounter; 185typedef xmlRegCounter *xmlRegCounterPtr; 186 187struct _xmlRegCounter { 188 int min; 189 int max; 190}; 191 192typedef struct _xmlRegTrans xmlRegTrans; 193typedef xmlRegTrans *xmlRegTransPtr; 194 195struct _xmlRegTrans { 196 xmlRegAtomPtr atom; 197 int to; 198 int counter; 199 int count; 200}; 201 202struct _xmlAutomataState { 203 xmlRegStateType type; 204 xmlRegMarkedType mark; 205 xmlRegMarkedType reached; 206 int no; 207 208 int maxTrans; 209 int nbTrans; 210 xmlRegTrans *trans; 211}; 212 213typedef struct _xmlAutomata xmlRegParserCtxt; 214typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 215 216struct _xmlAutomata { 217 xmlChar *string; 218 xmlChar *cur; 219 220 int error; 221 int neg; 222 223 xmlRegStatePtr start; 224 xmlRegStatePtr end; 225 xmlRegStatePtr state; 226 227 xmlRegAtomPtr atom; 228 229 int maxAtoms; 230 int nbAtoms; 231 xmlRegAtomPtr *atoms; 232 233 int maxStates; 234 int nbStates; 235 xmlRegStatePtr *states; 236 237 int maxCounters; 238 int nbCounters; 239 xmlRegCounter *counters; 240 241 int determinist; 242}; 243 244struct _xmlRegexp { 245 xmlChar *string; 246 int nbStates; 247 xmlRegStatePtr *states; 248 int nbAtoms; 249 xmlRegAtomPtr *atoms; 250 int nbCounters; 251 xmlRegCounter *counters; 252 int determinist; 253 /* 254 * That's the compact form for determinists automatas 255 */ 256 int nbstates; 257 int *compact; 258 void **transdata; 259 int nbstrings; 260 xmlChar **stringMap; 261}; 262 263typedef struct _xmlRegExecRollback xmlRegExecRollback; 264typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 265 266struct _xmlRegExecRollback { 267 xmlRegStatePtr state;/* the current state */ 268 int index; /* the index in the input stack */ 269 int nextbranch; /* the next transition to explore in that state */ 270 int *counts; /* save the automate state if it has some */ 271}; 272 273typedef struct _xmlRegInputToken xmlRegInputToken; 274typedef xmlRegInputToken *xmlRegInputTokenPtr; 275 276struct _xmlRegInputToken { 277 xmlChar *value; 278 void *data; 279}; 280 281struct _xmlRegExecCtxt { 282 int status; /* execution status != 0 indicate an error */ 283 int determinist; /* did we found an inderterministic behaviour */ 284 xmlRegexpPtr comp; /* the compiled regexp */ 285 xmlRegExecCallbacks callback; 286 void *data; 287 288 xmlRegStatePtr state;/* the current state */ 289 int transno; /* the current transition on that state */ 290 int transcount; /* the number of char in char counted transitions */ 291 292 /* 293 * A stack of rollback states 294 */ 295 int maxRollbacks; 296 int nbRollbacks; 297 xmlRegExecRollback *rollbacks; 298 299 /* 300 * The state of the automata if any 301 */ 302 int *counts; 303 304 /* 305 * The input stack 306 */ 307 int inputStackMax; 308 int inputStackNr; 309 int index; 310 int *charStack; 311 const xmlChar *inputString; /* when operating on characters */ 312 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 313 314}; 315 316#define REGEXP_ALL_COUNTER 0x123456 317#define REGEXP_ALL_LAX_COUNTER 0x123457 318 319static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 320static void xmlRegFreeState(xmlRegStatePtr state); 321static void xmlRegFreeAtom(xmlRegAtomPtr atom); 322 323/************************************************************************ 324 * * 325 * Regexp memory error handler * 326 * * 327 ************************************************************************/ 328/** 329 * xmlRegexpErrMemory: 330 * @extra: extra informations 331 * 332 * Handle an out of memory condition 333 */ 334static void 335xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 336{ 337 const char *regexp = NULL; 338 if (ctxt != NULL) { 339 regexp = (const char *) ctxt->string; 340 ctxt->error = XML_ERR_NO_MEMORY; 341 } 342 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 343 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 344 regexp, NULL, 0, 0, 345 "Memory allocation failed : %s\n", extra); 346} 347 348/** 349 * xmlRegexpErrCompile: 350 * @extra: extra informations 351 * 352 * Handle an compilation failure 353 */ 354static void 355xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 356{ 357 const char *regexp = NULL; 358 int idx = 0; 359 360 if (ctxt != NULL) { 361 regexp = (const char *) ctxt->string; 362 idx = ctxt->cur - ctxt->string; 363 ctxt->error = XML_REGEXP_COMPILE_ERROR; 364 } 365 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 366 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 367 regexp, NULL, idx, 0, 368 "failed to compile: %s\n", extra); 369} 370 371/************************************************************************ 372 * * 373 * Allocation/Deallocation * 374 * * 375 ************************************************************************/ 376 377static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 378/** 379 * xmlRegEpxFromParse: 380 * @ctxt: the parser context used to build it 381 * 382 * Allocate a new regexp and fill it with the reult from the parser 383 * 384 * Returns the new regexp or NULL in case of error 385 */ 386static xmlRegexpPtr 387xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 388 xmlRegexpPtr ret; 389 390 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 391 if (ret == NULL) { 392 xmlRegexpErrMemory(ctxt, "compiling regexp"); 393 return(NULL); 394 } 395 memset(ret, 0, sizeof(xmlRegexp)); 396 ret->string = ctxt->string; 397 ret->nbStates = ctxt->nbStates; 398 ret->states = ctxt->states; 399 ret->nbAtoms = ctxt->nbAtoms; 400 ret->atoms = ctxt->atoms; 401 ret->nbCounters = ctxt->nbCounters; 402 ret->counters = ctxt->counters; 403 ret->determinist = ctxt->determinist; 404 405 if ((ret->determinist != 0) && 406 (ret->nbCounters == 0) && 407 (ret->atoms != NULL) && 408 (ret->atoms[0] != NULL) && 409 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 410 int i, j, nbstates = 0, nbatoms = 0; 411 int *stateRemap; 412 int *stringRemap; 413 int *transitions; 414 void **transdata; 415 xmlChar **stringMap; 416 xmlChar *value; 417 418 /* 419 * Switch to a compact representation 420 * 1/ counting the effective number of states left 421 * 2/ conting the unique number of atoms, and check that 422 * they are all of the string type 423 * 3/ build a table state x atom for the transitions 424 */ 425 426 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 427 if (stateRemap == NULL) { 428 xmlRegexpErrMemory(ctxt, "compiling regexp"); 429 xmlFree(ret); 430 return(NULL); 431 } 432 for (i = 0;i < ret->nbStates;i++) { 433 if (ret->states[i] != NULL) { 434 stateRemap[i] = nbstates; 435 nbstates++; 436 } else { 437 stateRemap[i] = -1; 438 } 439 } 440#ifdef DEBUG_COMPACTION 441 printf("Final: %d states\n", nbstates); 442#endif 443 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 444 if (stringMap == NULL) { 445 xmlRegexpErrMemory(ctxt, "compiling regexp"); 446 xmlFree(stateRemap); 447 xmlFree(ret); 448 return(NULL); 449 } 450 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 451 if (stringRemap == NULL) { 452 xmlRegexpErrMemory(ctxt, "compiling regexp"); 453 xmlFree(stringMap); 454 xmlFree(stateRemap); 455 xmlFree(ret); 456 return(NULL); 457 } 458 for (i = 0;i < ret->nbAtoms;i++) { 459 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 460 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 461 value = ret->atoms[i]->valuep; 462 for (j = 0;j < nbatoms;j++) { 463 if (xmlStrEqual(stringMap[j], value)) { 464 stringRemap[i] = j; 465 break; 466 } 467 } 468 if (j >= nbatoms) { 469 stringRemap[i] = nbatoms; 470 stringMap[nbatoms] = xmlStrdup(value); 471 if (stringMap[nbatoms] == NULL) { 472 for (i = 0;i < nbatoms;i++) 473 xmlFree(stringMap[i]); 474 xmlFree(stringRemap); 475 xmlFree(stringMap); 476 xmlFree(stateRemap); 477 xmlFree(ret); 478 return(NULL); 479 } 480 nbatoms++; 481 } 482 } else { 483 xmlFree(stateRemap); 484 xmlFree(stringRemap); 485 for (i = 0;i < nbatoms;i++) 486 xmlFree(stringMap[i]); 487 xmlFree(stringMap); 488 xmlFree(ret); 489 return(NULL); 490 } 491 } 492#ifdef DEBUG_COMPACTION 493 printf("Final: %d atoms\n", nbatoms); 494#endif 495 transitions = (int *) xmlMalloc((nbstates + 1) * 496 (nbatoms + 1) * sizeof(int)); 497 if (transitions == NULL) { 498 xmlFree(stateRemap); 499 xmlFree(stringRemap); 500 xmlFree(stringMap); 501 xmlFree(ret); 502 return(NULL); 503 } 504 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 505 506 /* 507 * Allocate the transition table. The first entry for each 508 * state correspond to the state type. 509 */ 510 transdata = NULL; 511 512 for (i = 0;i < ret->nbStates;i++) { 513 int stateno, atomno, targetno, prev; 514 xmlRegStatePtr state; 515 xmlRegTransPtr trans; 516 517 stateno = stateRemap[i]; 518 if (stateno == -1) 519 continue; 520 state = ret->states[i]; 521 522 transitions[stateno * (nbatoms + 1)] = state->type; 523 524 for (j = 0;j < state->nbTrans;j++) { 525 trans = &(state->trans[j]); 526 if ((trans->to == -1) || (trans->atom == NULL)) 527 continue; 528 atomno = stringRemap[trans->atom->no]; 529 if ((trans->atom->data != NULL) && (transdata == NULL)) { 530 transdata = (void **) xmlMalloc(nbstates * nbatoms * 531 sizeof(void *)); 532 if (transdata != NULL) 533 memset(transdata, 0, 534 nbstates * nbatoms * sizeof(void *)); 535 else { 536 xmlRegexpErrMemory(ctxt, "compiling regexp"); 537 break; 538 } 539 } 540 targetno = stateRemap[trans->to]; 541 /* 542 * if the same atome can generate transition to 2 different 543 * states then it means the automata is not determinist and 544 * the compact form can't be used ! 545 */ 546 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 547 if (prev != 0) { 548 if (prev != targetno + 1) { 549 ret->determinist = 0; 550#ifdef DEBUG_COMPACTION 551 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 552 i, j, trans->atom->no, trans->to, atomno, targetno); 553 printf(" previous to is %d\n", prev); 554#endif 555 ret->determinist = 0; 556 if (transdata != NULL) 557 xmlFree(transdata); 558 xmlFree(transitions); 559 xmlFree(stateRemap); 560 xmlFree(stringRemap); 561 for (i = 0;i < nbatoms;i++) 562 xmlFree(stringMap[i]); 563 xmlFree(stringMap); 564 goto not_determ; 565 } 566 } else { 567#if 0 568 printf("State %d trans %d: atom %d to %d : %d to %d\n", 569 i, j, trans->atom->no, trans->to, atomno, targetno); 570#endif 571 transitions[stateno * (nbatoms + 1) + atomno + 1] = 572 targetno + 1; /* to avoid 0 */ 573 if (transdata != NULL) 574 transdata[stateno * nbatoms + atomno] = 575 trans->atom->data; 576 } 577 } 578 } 579 ret->determinist = 1; 580#ifdef DEBUG_COMPACTION 581 /* 582 * Debug 583 */ 584 for (i = 0;i < nbstates;i++) { 585 for (j = 0;j < nbatoms + 1;j++) { 586 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 587 } 588 printf("\n"); 589 } 590 printf("\n"); 591#endif 592 /* 593 * Cleanup of the old data 594 */ 595 if (ret->states != NULL) { 596 for (i = 0;i < ret->nbStates;i++) 597 xmlRegFreeState(ret->states[i]); 598 xmlFree(ret->states); 599 } 600 ret->states = NULL; 601 ret->nbStates = 0; 602 if (ret->atoms != NULL) { 603 for (i = 0;i < ret->nbAtoms;i++) 604 xmlRegFreeAtom(ret->atoms[i]); 605 xmlFree(ret->atoms); 606 } 607 ret->atoms = NULL; 608 ret->nbAtoms = 0; 609 610 ret->compact = transitions; 611 ret->transdata = transdata; 612 ret->stringMap = stringMap; 613 ret->nbstrings = nbatoms; 614 ret->nbstates = nbstates; 615 xmlFree(stateRemap); 616 xmlFree(stringRemap); 617 } 618not_determ: 619 ctxt->string = NULL; 620 ctxt->nbStates = 0; 621 ctxt->states = NULL; 622 ctxt->nbAtoms = 0; 623 ctxt->atoms = NULL; 624 ctxt->nbCounters = 0; 625 ctxt->counters = NULL; 626 return(ret); 627} 628 629/** 630 * xmlRegNewParserCtxt: 631 * @string: the string to parse 632 * 633 * Allocate a new regexp parser context 634 * 635 * Returns the new context or NULL in case of error 636 */ 637static xmlRegParserCtxtPtr 638xmlRegNewParserCtxt(const xmlChar *string) { 639 xmlRegParserCtxtPtr ret; 640 641 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 642 if (ret == NULL) 643 return(NULL); 644 memset(ret, 0, sizeof(xmlRegParserCtxt)); 645 if (string != NULL) 646 ret->string = xmlStrdup(string); 647 ret->cur = ret->string; 648 ret->neg = 0; 649 ret->error = 0; 650 ret->determinist = -1; 651 return(ret); 652} 653 654/** 655 * xmlRegNewRange: 656 * @ctxt: the regexp parser context 657 * @neg: is that negative 658 * @type: the type of range 659 * @start: the start codepoint 660 * @end: the end codepoint 661 * 662 * Allocate a new regexp range 663 * 664 * Returns the new range or NULL in case of error 665 */ 666static xmlRegRangePtr 667xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 668 int neg, xmlRegAtomType type, int start, int end) { 669 xmlRegRangePtr ret; 670 671 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 672 if (ret == NULL) { 673 xmlRegexpErrMemory(ctxt, "allocating range"); 674 return(NULL); 675 } 676 ret->neg = neg; 677 ret->type = type; 678 ret->start = start; 679 ret->end = end; 680 return(ret); 681} 682 683/** 684 * xmlRegFreeRange: 685 * @range: the regexp range 686 * 687 * Free a regexp range 688 */ 689static void 690xmlRegFreeRange(xmlRegRangePtr range) { 691 if (range == NULL) 692 return; 693 694 if (range->blockName != NULL) 695 xmlFree(range->blockName); 696 xmlFree(range); 697} 698 699/** 700 * xmlRegNewAtom: 701 * @ctxt: the regexp parser context 702 * @type: the type of atom 703 * 704 * Allocate a new regexp range 705 * 706 * Returns the new atom or NULL in case of error 707 */ 708static xmlRegAtomPtr 709xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 710 xmlRegAtomPtr ret; 711 712 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 713 if (ret == NULL) { 714 xmlRegexpErrMemory(ctxt, "allocating atom"); 715 return(NULL); 716 } 717 memset(ret, 0, sizeof(xmlRegAtom)); 718 ret->type = type; 719 ret->quant = XML_REGEXP_QUANT_ONCE; 720 ret->min = 0; 721 ret->max = 0; 722 return(ret); 723} 724 725/** 726 * xmlRegFreeAtom: 727 * @atom: the regexp atom 728 * 729 * Free a regexp atom 730 */ 731static void 732xmlRegFreeAtom(xmlRegAtomPtr atom) { 733 int i; 734 735 if (atom == NULL) 736 return; 737 738 for (i = 0;i < atom->nbRanges;i++) 739 xmlRegFreeRange(atom->ranges[i]); 740 if (atom->ranges != NULL) 741 xmlFree(atom->ranges); 742 if (atom->type == XML_REGEXP_STRING) 743 xmlFree(atom->valuep); 744 xmlFree(atom); 745} 746 747static xmlRegStatePtr 748xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 749 xmlRegStatePtr ret; 750 751 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 752 if (ret == NULL) { 753 xmlRegexpErrMemory(ctxt, "allocating state"); 754 return(NULL); 755 } 756 memset(ret, 0, sizeof(xmlRegState)); 757 ret->type = XML_REGEXP_TRANS_STATE; 758 ret->mark = XML_REGEXP_MARK_NORMAL; 759 return(ret); 760} 761 762/** 763 * xmlRegFreeState: 764 * @state: the regexp state 765 * 766 * Free a regexp state 767 */ 768static void 769xmlRegFreeState(xmlRegStatePtr state) { 770 if (state == NULL) 771 return; 772 773 if (state->trans != NULL) 774 xmlFree(state->trans); 775 xmlFree(state); 776} 777 778/** 779 * xmlRegFreeParserCtxt: 780 * @ctxt: the regexp parser context 781 * 782 * Free a regexp parser context 783 */ 784static void 785xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 786 int i; 787 if (ctxt == NULL) 788 return; 789 790 if (ctxt->string != NULL) 791 xmlFree(ctxt->string); 792 if (ctxt->states != NULL) { 793 for (i = 0;i < ctxt->nbStates;i++) 794 xmlRegFreeState(ctxt->states[i]); 795 xmlFree(ctxt->states); 796 } 797 if (ctxt->atoms != NULL) { 798 for (i = 0;i < ctxt->nbAtoms;i++) 799 xmlRegFreeAtom(ctxt->atoms[i]); 800 xmlFree(ctxt->atoms); 801 } 802 if (ctxt->counters != NULL) 803 xmlFree(ctxt->counters); 804 xmlFree(ctxt); 805} 806 807/************************************************************************ 808 * * 809 * Display of Data structures * 810 * * 811 ************************************************************************/ 812 813static void 814xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 815 switch (type) { 816 case XML_REGEXP_EPSILON: 817 fprintf(output, "epsilon "); break; 818 case XML_REGEXP_CHARVAL: 819 fprintf(output, "charval "); break; 820 case XML_REGEXP_RANGES: 821 fprintf(output, "ranges "); break; 822 case XML_REGEXP_SUBREG: 823 fprintf(output, "subexpr "); break; 824 case XML_REGEXP_STRING: 825 fprintf(output, "string "); break; 826 case XML_REGEXP_ANYCHAR: 827 fprintf(output, "anychar "); break; 828 case XML_REGEXP_ANYSPACE: 829 fprintf(output, "anyspace "); break; 830 case XML_REGEXP_NOTSPACE: 831 fprintf(output, "notspace "); break; 832 case XML_REGEXP_INITNAME: 833 fprintf(output, "initname "); break; 834 case XML_REGEXP_NOTINITNAME: 835 fprintf(output, "notinitname "); break; 836 case XML_REGEXP_NAMECHAR: 837 fprintf(output, "namechar "); break; 838 case XML_REGEXP_NOTNAMECHAR: 839 fprintf(output, "notnamechar "); break; 840 case XML_REGEXP_DECIMAL: 841 fprintf(output, "decimal "); break; 842 case XML_REGEXP_NOTDECIMAL: 843 fprintf(output, "notdecimal "); break; 844 case XML_REGEXP_REALCHAR: 845 fprintf(output, "realchar "); break; 846 case XML_REGEXP_NOTREALCHAR: 847 fprintf(output, "notrealchar "); break; 848 case XML_REGEXP_LETTER: 849 fprintf(output, "LETTER "); break; 850 case XML_REGEXP_LETTER_UPPERCASE: 851 fprintf(output, "LETTER_UPPERCASE "); break; 852 case XML_REGEXP_LETTER_LOWERCASE: 853 fprintf(output, "LETTER_LOWERCASE "); break; 854 case XML_REGEXP_LETTER_TITLECASE: 855 fprintf(output, "LETTER_TITLECASE "); break; 856 case XML_REGEXP_LETTER_MODIFIER: 857 fprintf(output, "LETTER_MODIFIER "); break; 858 case XML_REGEXP_LETTER_OTHERS: 859 fprintf(output, "LETTER_OTHERS "); break; 860 case XML_REGEXP_MARK: 861 fprintf(output, "MARK "); break; 862 case XML_REGEXP_MARK_NONSPACING: 863 fprintf(output, "MARK_NONSPACING "); break; 864 case XML_REGEXP_MARK_SPACECOMBINING: 865 fprintf(output, "MARK_SPACECOMBINING "); break; 866 case XML_REGEXP_MARK_ENCLOSING: 867 fprintf(output, "MARK_ENCLOSING "); break; 868 case XML_REGEXP_NUMBER: 869 fprintf(output, "NUMBER "); break; 870 case XML_REGEXP_NUMBER_DECIMAL: 871 fprintf(output, "NUMBER_DECIMAL "); break; 872 case XML_REGEXP_NUMBER_LETTER: 873 fprintf(output, "NUMBER_LETTER "); break; 874 case XML_REGEXP_NUMBER_OTHERS: 875 fprintf(output, "NUMBER_OTHERS "); break; 876 case XML_REGEXP_PUNCT: 877 fprintf(output, "PUNCT "); break; 878 case XML_REGEXP_PUNCT_CONNECTOR: 879 fprintf(output, "PUNCT_CONNECTOR "); break; 880 case XML_REGEXP_PUNCT_DASH: 881 fprintf(output, "PUNCT_DASH "); break; 882 case XML_REGEXP_PUNCT_OPEN: 883 fprintf(output, "PUNCT_OPEN "); break; 884 case XML_REGEXP_PUNCT_CLOSE: 885 fprintf(output, "PUNCT_CLOSE "); break; 886 case XML_REGEXP_PUNCT_INITQUOTE: 887 fprintf(output, "PUNCT_INITQUOTE "); break; 888 case XML_REGEXP_PUNCT_FINQUOTE: 889 fprintf(output, "PUNCT_FINQUOTE "); break; 890 case XML_REGEXP_PUNCT_OTHERS: 891 fprintf(output, "PUNCT_OTHERS "); break; 892 case XML_REGEXP_SEPAR: 893 fprintf(output, "SEPAR "); break; 894 case XML_REGEXP_SEPAR_SPACE: 895 fprintf(output, "SEPAR_SPACE "); break; 896 case XML_REGEXP_SEPAR_LINE: 897 fprintf(output, "SEPAR_LINE "); break; 898 case XML_REGEXP_SEPAR_PARA: 899 fprintf(output, "SEPAR_PARA "); break; 900 case XML_REGEXP_SYMBOL: 901 fprintf(output, "SYMBOL "); break; 902 case XML_REGEXP_SYMBOL_MATH: 903 fprintf(output, "SYMBOL_MATH "); break; 904 case XML_REGEXP_SYMBOL_CURRENCY: 905 fprintf(output, "SYMBOL_CURRENCY "); break; 906 case XML_REGEXP_SYMBOL_MODIFIER: 907 fprintf(output, "SYMBOL_MODIFIER "); break; 908 case XML_REGEXP_SYMBOL_OTHERS: 909 fprintf(output, "SYMBOL_OTHERS "); break; 910 case XML_REGEXP_OTHER: 911 fprintf(output, "OTHER "); break; 912 case XML_REGEXP_OTHER_CONTROL: 913 fprintf(output, "OTHER_CONTROL "); break; 914 case XML_REGEXP_OTHER_FORMAT: 915 fprintf(output, "OTHER_FORMAT "); break; 916 case XML_REGEXP_OTHER_PRIVATE: 917 fprintf(output, "OTHER_PRIVATE "); break; 918 case XML_REGEXP_OTHER_NA: 919 fprintf(output, "OTHER_NA "); break; 920 case XML_REGEXP_BLOCK_NAME: 921 fprintf(output, "BLOCK "); break; 922 } 923} 924 925static void 926xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 927 switch (type) { 928 case XML_REGEXP_QUANT_EPSILON: 929 fprintf(output, "epsilon "); break; 930 case XML_REGEXP_QUANT_ONCE: 931 fprintf(output, "once "); break; 932 case XML_REGEXP_QUANT_OPT: 933 fprintf(output, "? "); break; 934 case XML_REGEXP_QUANT_MULT: 935 fprintf(output, "* "); break; 936 case XML_REGEXP_QUANT_PLUS: 937 fprintf(output, "+ "); break; 938 case XML_REGEXP_QUANT_RANGE: 939 fprintf(output, "range "); break; 940 case XML_REGEXP_QUANT_ONCEONLY: 941 fprintf(output, "onceonly "); break; 942 case XML_REGEXP_QUANT_ALL: 943 fprintf(output, "all "); break; 944 } 945} 946static void 947xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 948 fprintf(output, " range: "); 949 if (range->neg) 950 fprintf(output, "negative "); 951 xmlRegPrintAtomType(output, range->type); 952 fprintf(output, "%c - %c\n", range->start, range->end); 953} 954 955static void 956xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 957 fprintf(output, " atom: "); 958 if (atom == NULL) { 959 fprintf(output, "NULL\n"); 960 return; 961 } 962 xmlRegPrintAtomType(output, atom->type); 963 xmlRegPrintQuantType(output, atom->quant); 964 if (atom->quant == XML_REGEXP_QUANT_RANGE) 965 fprintf(output, "%d-%d ", atom->min, atom->max); 966 if (atom->type == XML_REGEXP_STRING) 967 fprintf(output, "'%s' ", (char *) atom->valuep); 968 if (atom->type == XML_REGEXP_CHARVAL) 969 fprintf(output, "char %c\n", atom->codepoint); 970 else if (atom->type == XML_REGEXP_RANGES) { 971 int i; 972 fprintf(output, "%d entries\n", atom->nbRanges); 973 for (i = 0; i < atom->nbRanges;i++) 974 xmlRegPrintRange(output, atom->ranges[i]); 975 } else if (atom->type == XML_REGEXP_SUBREG) { 976 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 977 } else { 978 fprintf(output, "\n"); 979 } 980} 981 982static void 983xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 984 fprintf(output, " trans: "); 985 if (trans == NULL) { 986 fprintf(output, "NULL\n"); 987 return; 988 } 989 if (trans->to < 0) { 990 fprintf(output, "removed\n"); 991 return; 992 } 993 if (trans->counter >= 0) { 994 fprintf(output, "counted %d, ", trans->counter); 995 } 996 if (trans->count == REGEXP_ALL_COUNTER) { 997 fprintf(output, "all transition, "); 998 } else if (trans->count >= 0) { 999 fprintf(output, "count based %d, ", trans->count); 1000 } 1001 if (trans->atom == NULL) { 1002 fprintf(output, "epsilon to %d\n", trans->to); 1003 return; 1004 } 1005 if (trans->atom->type == XML_REGEXP_CHARVAL) 1006 fprintf(output, "char %c ", trans->atom->codepoint); 1007 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1008} 1009 1010static void 1011xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1012 int i; 1013 1014 fprintf(output, " state: "); 1015 if (state == NULL) { 1016 fprintf(output, "NULL\n"); 1017 return; 1018 } 1019 if (state->type == XML_REGEXP_START_STATE) 1020 fprintf(output, "START "); 1021 if (state->type == XML_REGEXP_FINAL_STATE) 1022 fprintf(output, "FINAL "); 1023 1024 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1025 for (i = 0;i < state->nbTrans; i++) { 1026 xmlRegPrintTrans(output, &(state->trans[i])); 1027 } 1028} 1029 1030#ifdef DEBUG_REGEXP_GRAPH 1031static void 1032xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1033 int i; 1034 1035 fprintf(output, " ctxt: "); 1036 if (ctxt == NULL) { 1037 fprintf(output, "NULL\n"); 1038 return; 1039 } 1040 fprintf(output, "'%s' ", ctxt->string); 1041 if (ctxt->error) 1042 fprintf(output, "error "); 1043 if (ctxt->neg) 1044 fprintf(output, "neg "); 1045 fprintf(output, "\n"); 1046 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1047 for (i = 0;i < ctxt->nbAtoms; i++) { 1048 fprintf(output, " %02d ", i); 1049 xmlRegPrintAtom(output, ctxt->atoms[i]); 1050 } 1051 if (ctxt->atom != NULL) { 1052 fprintf(output, "current atom:\n"); 1053 xmlRegPrintAtom(output, ctxt->atom); 1054 } 1055 fprintf(output, "%d states:", ctxt->nbStates); 1056 if (ctxt->start != NULL) 1057 fprintf(output, " start: %d", ctxt->start->no); 1058 if (ctxt->end != NULL) 1059 fprintf(output, " end: %d", ctxt->end->no); 1060 fprintf(output, "\n"); 1061 for (i = 0;i < ctxt->nbStates; i++) { 1062 xmlRegPrintState(output, ctxt->states[i]); 1063 } 1064 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1065 for (i = 0;i < ctxt->nbCounters; i++) { 1066 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1067 ctxt->counters[i].max); 1068 } 1069} 1070#endif 1071 1072/************************************************************************ 1073 * * 1074 * Finite Automata structures manipulations * 1075 * * 1076 ************************************************************************/ 1077 1078static void 1079xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1080 int neg, xmlRegAtomType type, int start, int end, 1081 xmlChar *blockName) { 1082 xmlRegRangePtr range; 1083 1084 if (atom == NULL) { 1085 ERROR("add range: atom is NULL"); 1086 return; 1087 } 1088 if (atom->type != XML_REGEXP_RANGES) { 1089 ERROR("add range: atom is not ranges"); 1090 return; 1091 } 1092 if (atom->maxRanges == 0) { 1093 atom->maxRanges = 4; 1094 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1095 sizeof(xmlRegRangePtr)); 1096 if (atom->ranges == NULL) { 1097 xmlRegexpErrMemory(ctxt, "adding ranges"); 1098 atom->maxRanges = 0; 1099 return; 1100 } 1101 } else if (atom->nbRanges >= atom->maxRanges) { 1102 xmlRegRangePtr *tmp; 1103 atom->maxRanges *= 2; 1104 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1105 sizeof(xmlRegRangePtr)); 1106 if (tmp == NULL) { 1107 xmlRegexpErrMemory(ctxt, "adding ranges"); 1108 atom->maxRanges /= 2; 1109 return; 1110 } 1111 atom->ranges = tmp; 1112 } 1113 range = xmlRegNewRange(ctxt, neg, type, start, end); 1114 if (range == NULL) 1115 return; 1116 range->blockName = blockName; 1117 atom->ranges[atom->nbRanges++] = range; 1118 1119} 1120 1121static int 1122xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1123 if (ctxt->maxCounters == 0) { 1124 ctxt->maxCounters = 4; 1125 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1126 sizeof(xmlRegCounter)); 1127 if (ctxt->counters == NULL) { 1128 xmlRegexpErrMemory(ctxt, "allocating counter"); 1129 ctxt->maxCounters = 0; 1130 return(-1); 1131 } 1132 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1133 xmlRegCounter *tmp; 1134 ctxt->maxCounters *= 2; 1135 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1136 sizeof(xmlRegCounter)); 1137 if (tmp == NULL) { 1138 xmlRegexpErrMemory(ctxt, "allocating counter"); 1139 ctxt->maxCounters /= 2; 1140 return(-1); 1141 } 1142 ctxt->counters = tmp; 1143 } 1144 ctxt->counters[ctxt->nbCounters].min = -1; 1145 ctxt->counters[ctxt->nbCounters].max = -1; 1146 return(ctxt->nbCounters++); 1147} 1148 1149static int 1150xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1151 if (atom == NULL) { 1152 ERROR("atom push: atom is NULL"); 1153 return(-1); 1154 } 1155 if (ctxt->maxAtoms == 0) { 1156 ctxt->maxAtoms = 4; 1157 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1158 sizeof(xmlRegAtomPtr)); 1159 if (ctxt->atoms == NULL) { 1160 xmlRegexpErrMemory(ctxt, "pushing atom"); 1161 ctxt->maxAtoms = 0; 1162 return(-1); 1163 } 1164 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1165 xmlRegAtomPtr *tmp; 1166 ctxt->maxAtoms *= 2; 1167 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1168 sizeof(xmlRegAtomPtr)); 1169 if (tmp == NULL) { 1170 xmlRegexpErrMemory(ctxt, "allocating counter"); 1171 ctxt->maxAtoms /= 2; 1172 return(-1); 1173 } 1174 ctxt->atoms = tmp; 1175 } 1176 atom->no = ctxt->nbAtoms; 1177 ctxt->atoms[ctxt->nbAtoms++] = atom; 1178 return(0); 1179} 1180 1181static void 1182xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1183 xmlRegAtomPtr atom, xmlRegStatePtr target, 1184 int counter, int count) { 1185 if (state == NULL) { 1186 ERROR("add state: state is NULL"); 1187 return; 1188 } 1189 if (target == NULL) { 1190 ERROR("add state: target is NULL"); 1191 return; 1192 } 1193 if (state->maxTrans == 0) { 1194 state->maxTrans = 4; 1195 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1196 sizeof(xmlRegTrans)); 1197 if (state->trans == NULL) { 1198 xmlRegexpErrMemory(ctxt, "adding transition"); 1199 state->maxTrans = 0; 1200 return; 1201 } 1202 } else if (state->nbTrans >= state->maxTrans) { 1203 xmlRegTrans *tmp; 1204 state->maxTrans *= 2; 1205 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1206 sizeof(xmlRegTrans)); 1207 if (tmp == NULL) { 1208 xmlRegexpErrMemory(ctxt, "adding transition"); 1209 state->maxTrans /= 2; 1210 return; 1211 } 1212 state->trans = tmp; 1213 } 1214#ifdef DEBUG_REGEXP_GRAPH 1215 printf("Add trans from %d to %d ", state->no, target->no); 1216 if (count == REGEXP_ALL_COUNTER) 1217 printf("all transition"); 1218 else if (count >= 0) 1219 printf("count based %d", count); 1220 else if (counter >= 0) 1221 printf("counted %d", counter); 1222 else if (atom == NULL) 1223 printf("epsilon transition"); 1224 printf("\n"); 1225#endif 1226 1227 state->trans[state->nbTrans].atom = atom; 1228 state->trans[state->nbTrans].to = target->no; 1229 state->trans[state->nbTrans].counter = counter; 1230 state->trans[state->nbTrans].count = count; 1231 state->nbTrans++; 1232} 1233 1234static int 1235xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1236 if (state == NULL) return(-1); 1237 if (ctxt->maxStates == 0) { 1238 ctxt->maxStates = 4; 1239 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1240 sizeof(xmlRegStatePtr)); 1241 if (ctxt->states == NULL) { 1242 xmlRegexpErrMemory(ctxt, "adding state"); 1243 ctxt->maxStates = 0; 1244 return(-1); 1245 } 1246 } else if (ctxt->nbStates >= ctxt->maxStates) { 1247 xmlRegStatePtr *tmp; 1248 ctxt->maxStates *= 2; 1249 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1250 sizeof(xmlRegStatePtr)); 1251 if (tmp == NULL) { 1252 xmlRegexpErrMemory(ctxt, "adding state"); 1253 ctxt->maxStates /= 2; 1254 return(-1); 1255 } 1256 ctxt->states = tmp; 1257 } 1258 state->no = ctxt->nbStates; 1259 ctxt->states[ctxt->nbStates++] = state; 1260 return(0); 1261} 1262 1263/** 1264 * xmlFAGenerateAllTransition: 1265 * @ctxt: a regexp parser context 1266 * @from: the from state 1267 * @to: the target state or NULL for building a new one 1268 * @lax: 1269 * 1270 */ 1271static void 1272xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1273 xmlRegStatePtr from, xmlRegStatePtr to, 1274 int lax) { 1275 if (to == NULL) { 1276 to = xmlRegNewState(ctxt); 1277 xmlRegStatePush(ctxt, to); 1278 ctxt->state = to; 1279 } 1280 if (lax) 1281 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1282 else 1283 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1284} 1285 1286/** 1287 * xmlFAGenerateEpsilonTransition: 1288 * @ctxt: a regexp parser context 1289 * @from: the from state 1290 * @to: the target state or NULL for building a new one 1291 * 1292 */ 1293static void 1294xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1295 xmlRegStatePtr from, xmlRegStatePtr to) { 1296 if (to == NULL) { 1297 to = xmlRegNewState(ctxt); 1298 xmlRegStatePush(ctxt, to); 1299 ctxt->state = to; 1300 } 1301 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1302} 1303 1304/** 1305 * xmlFAGenerateCountedEpsilonTransition: 1306 * @ctxt: a regexp parser context 1307 * @from: the from state 1308 * @to: the target state or NULL for building a new one 1309 * counter: the counter for that transition 1310 * 1311 */ 1312static void 1313xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1314 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1315 if (to == NULL) { 1316 to = xmlRegNewState(ctxt); 1317 xmlRegStatePush(ctxt, to); 1318 ctxt->state = to; 1319 } 1320 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1321} 1322 1323/** 1324 * xmlFAGenerateCountedTransition: 1325 * @ctxt: a regexp parser context 1326 * @from: the from state 1327 * @to: the target state or NULL for building a new one 1328 * counter: the counter for that transition 1329 * 1330 */ 1331static void 1332xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1333 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1334 if (to == NULL) { 1335 to = xmlRegNewState(ctxt); 1336 xmlRegStatePush(ctxt, to); 1337 ctxt->state = to; 1338 } 1339 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1340} 1341 1342/** 1343 * xmlFAGenerateTransitions: 1344 * @ctxt: a regexp parser context 1345 * @from: the from state 1346 * @to: the target state or NULL for building a new one 1347 * @atom: the atom generating the transition 1348 * 1349 * Returns 0 if succes and -1 in case of error. 1350 */ 1351static int 1352xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1353 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1354 if (atom == NULL) { 1355 ERROR("genrate transition: atom == NULL"); 1356 return(-1); 1357 } 1358 if (atom->type == XML_REGEXP_SUBREG) { 1359 /* 1360 * this is a subexpression handling one should not need to 1361 * create a new node excep for XML_REGEXP_QUANT_RANGE. 1362 */ 1363 if (xmlRegAtomPush(ctxt, atom) < 0) { 1364 return(-1); 1365 } 1366 if ((to != NULL) && (atom->stop != to) && 1367 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1368 /* 1369 * Generate an epsilon transition to link to the target 1370 */ 1371 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1372 } 1373 switch (atom->quant) { 1374 case XML_REGEXP_QUANT_OPT: 1375 atom->quant = XML_REGEXP_QUANT_ONCE; 1376 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1377 break; 1378 case XML_REGEXP_QUANT_MULT: 1379 atom->quant = XML_REGEXP_QUANT_ONCE; 1380 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1381 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1382 break; 1383 case XML_REGEXP_QUANT_PLUS: 1384 atom->quant = XML_REGEXP_QUANT_ONCE; 1385 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1386 break; 1387 case XML_REGEXP_QUANT_RANGE: { 1388 int counter; 1389 xmlRegStatePtr newstate; 1390 1391 /* 1392 * This one is nasty: 1393 * 1/ register a new counter 1394 * 2/ register an epsilon transition associated to 1395 * this counter going from atom->stop to atom->start 1396 * 3/ create a new state 1397 * 4/ generate a counted transition from atom->stop to 1398 * that state 1399 */ 1400 counter = xmlRegGetCounter(ctxt); 1401 ctxt->counters[counter].min = atom->min - 1; 1402 ctxt->counters[counter].max = atom->max - 1; 1403 atom->min = 0; 1404 atom->max = 0; 1405 atom->quant = XML_REGEXP_QUANT_ONCE; 1406 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1407 atom->start, counter); 1408 if (to != NULL) { 1409 newstate = to; 1410 } else { 1411 newstate = xmlRegNewState(ctxt); 1412 xmlRegStatePush(ctxt, newstate); 1413 ctxt->state = newstate; 1414 } 1415 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1416 newstate, counter); 1417 } 1418 default: 1419 break; 1420 } 1421 return(0); 1422 } else { 1423 if (to == NULL) { 1424 to = xmlRegNewState(ctxt); 1425 if (to != NULL) 1426 xmlRegStatePush(ctxt, to); 1427 else { 1428 return(-1); 1429 } 1430 } 1431 if (xmlRegAtomPush(ctxt, atom) < 0) { 1432 return(-1); 1433 } 1434 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1435 ctxt->state = to; 1436 } 1437 switch (atom->quant) { 1438 case XML_REGEXP_QUANT_OPT: 1439 atom->quant = XML_REGEXP_QUANT_ONCE; 1440 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1441 break; 1442 case XML_REGEXP_QUANT_MULT: 1443 atom->quant = XML_REGEXP_QUANT_ONCE; 1444 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1445 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1446 break; 1447 case XML_REGEXP_QUANT_PLUS: 1448 atom->quant = XML_REGEXP_QUANT_ONCE; 1449 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1450 break; 1451 default: 1452 break; 1453 } 1454 return(0); 1455} 1456 1457/** 1458 * xmlFAReduceEpsilonTransitions: 1459 * @ctxt: a regexp parser context 1460 * @fromnr: the from state 1461 * @tonr: the to state 1462 * @cpunter: should that transition be associted to a counted 1463 * 1464 */ 1465static void 1466xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1467 int tonr, int counter) { 1468 int transnr; 1469 xmlRegStatePtr from; 1470 xmlRegStatePtr to; 1471 1472#ifdef DEBUG_REGEXP_GRAPH 1473 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1474#endif 1475 from = ctxt->states[fromnr]; 1476 if (from == NULL) 1477 return; 1478 to = ctxt->states[tonr]; 1479 if (to == NULL) 1480 return; 1481 if ((to->mark == XML_REGEXP_MARK_START) || 1482 (to->mark == XML_REGEXP_MARK_VISITED)) 1483 return; 1484 1485 to->mark = XML_REGEXP_MARK_VISITED; 1486 if (to->type == XML_REGEXP_FINAL_STATE) { 1487#ifdef DEBUG_REGEXP_GRAPH 1488 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1489#endif 1490 from->type = XML_REGEXP_FINAL_STATE; 1491 } 1492 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1493 if (to->trans[transnr].atom == NULL) { 1494 /* 1495 * Don't remove counted transitions 1496 * Don't loop either 1497 */ 1498 if (to->trans[transnr].to != fromnr) { 1499 if (to->trans[transnr].count >= 0) { 1500 int newto = to->trans[transnr].to; 1501 1502 xmlRegStateAddTrans(ctxt, from, NULL, 1503 ctxt->states[newto], 1504 -1, to->trans[transnr].count); 1505 } else { 1506#ifdef DEBUG_REGEXP_GRAPH 1507 printf("Found epsilon trans %d from %d to %d\n", 1508 transnr, tonr, to->trans[transnr].to); 1509#endif 1510 if (to->trans[transnr].counter >= 0) { 1511 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1512 to->trans[transnr].to, 1513 to->trans[transnr].counter); 1514 } else { 1515 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1516 to->trans[transnr].to, 1517 counter); 1518 } 1519 } 1520 } 1521 } else { 1522 int newto = to->trans[transnr].to; 1523 1524 if (to->trans[transnr].counter >= 0) { 1525 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1526 ctxt->states[newto], 1527 to->trans[transnr].counter, -1); 1528 } else { 1529 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1530 ctxt->states[newto], counter, -1); 1531 } 1532 } 1533 } 1534 to->mark = XML_REGEXP_MARK_NORMAL; 1535} 1536 1537/** 1538 * xmlFAEliminateEpsilonTransitions: 1539 * @ctxt: a regexp parser context 1540 * 1541 */ 1542static void 1543xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1544 int statenr, transnr; 1545 xmlRegStatePtr state; 1546 1547 if (ctxt->states == NULL) return; 1548 1549 1550 /* 1551 * build the completed transitions bypassing the epsilons 1552 * Use a marking algorithm to avoid loops 1553 */ 1554 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1555 state = ctxt->states[statenr]; 1556 if (state == NULL) 1557 continue; 1558 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1559 if ((state->trans[transnr].atom == NULL) && 1560 (state->trans[transnr].to >= 0)) { 1561 if (state->trans[transnr].to == statenr) { 1562 state->trans[transnr].to = -1; 1563#ifdef DEBUG_REGEXP_GRAPH 1564 printf("Removed loopback epsilon trans %d on %d\n", 1565 transnr, statenr); 1566#endif 1567 } else if (state->trans[transnr].count < 0) { 1568 int newto = state->trans[transnr].to; 1569 1570#ifdef DEBUG_REGEXP_GRAPH 1571 printf("Found epsilon trans %d from %d to %d\n", 1572 transnr, statenr, newto); 1573#endif 1574 state->mark = XML_REGEXP_MARK_START; 1575 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1576 newto, state->trans[transnr].counter); 1577 state->mark = XML_REGEXP_MARK_NORMAL; 1578#ifdef DEBUG_REGEXP_GRAPH 1579 } else { 1580 printf("Found counted transition %d on %d\n", 1581 transnr, statenr); 1582#endif 1583 } 1584 } 1585 } 1586 } 1587 /* 1588 * Eliminate the epsilon transitions 1589 */ 1590 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1591 state = ctxt->states[statenr]; 1592 if (state == NULL) 1593 continue; 1594 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1595 if ((state->trans[transnr].atom == NULL) && 1596 (state->trans[transnr].count < 0) && 1597 (state->trans[transnr].to >= 0)) { 1598 state->trans[transnr].to = -1; 1599 } 1600 } 1601 } 1602 1603 /* 1604 * Use this pass to detect unreachable states too 1605 */ 1606 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1607 state = ctxt->states[statenr]; 1608 if (state != NULL) 1609 state->reached = XML_REGEXP_MARK_NORMAL; 1610 } 1611 state = ctxt->states[0]; 1612 if (state != NULL) 1613 state->reached = XML_REGEXP_MARK_START; 1614 while (state != NULL) { 1615 xmlRegStatePtr target = NULL; 1616 state->reached = XML_REGEXP_MARK_VISITED; 1617 /* 1618 * Mark all state reachable from the current reachable state 1619 */ 1620 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1621 if ((state->trans[transnr].to >= 0) && 1622 ((state->trans[transnr].atom != NULL) || 1623 (state->trans[transnr].count >= 0))) { 1624 int newto = state->trans[transnr].to; 1625 1626 if (ctxt->states[newto] == NULL) 1627 continue; 1628 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 1629 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 1630 target = ctxt->states[newto]; 1631 } 1632 } 1633 } 1634 /* 1635 * find the next accessible state not explored 1636 */ 1637 if (target == NULL) { 1638 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1639 state = ctxt->states[statenr]; 1640 if ((state != NULL) && (state->reached == 1641 XML_REGEXP_MARK_START)) { 1642 target = state; 1643 break; 1644 } 1645 } 1646 } 1647 state = target; 1648 } 1649 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1650 state = ctxt->states[statenr]; 1651 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 1652#ifdef DEBUG_REGEXP_GRAPH 1653 printf("Removed unreachable state %d\n", statenr); 1654#endif 1655 xmlRegFreeState(state); 1656 ctxt->states[statenr] = NULL; 1657 } 1658 } 1659 1660} 1661 1662/** 1663 * xmlFACompareAtoms: 1664 * @atom1: an atom 1665 * @atom2: an atom 1666 * 1667 * Compares two atoms to check whether they are equivatents 1668 * 1669 * Returns 1 if yes and 0 otherwise 1670 */ 1671static int 1672xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 1673 if (atom1 == atom2) 1674 return(1); 1675 if ((atom1 == NULL) || (atom2 == NULL)) 1676 return(0); 1677 1678 if (atom1->type != atom2->type) 1679 return(0); 1680 switch (atom1->type) { 1681 case XML_REGEXP_STRING: 1682 return(xmlStrEqual((xmlChar *)atom1->valuep, 1683 (xmlChar *)atom2->valuep)); 1684 case XML_REGEXP_EPSILON: 1685 return(1); 1686 case XML_REGEXP_CHARVAL: 1687 return(atom1->codepoint == atom2->codepoint); 1688 case XML_REGEXP_RANGES: 1689 TODO; 1690 return(0); 1691 default: 1692 break; 1693 } 1694 return(1); 1695} 1696 1697/** 1698 * xmlFARecurseDeterminism: 1699 * @ctxt: a regexp parser context 1700 * 1701 * Check whether the associated regexp is determinist, 1702 * should be called after xmlFAEliminateEpsilonTransitions() 1703 * 1704 */ 1705static int 1706xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1707 int to, xmlRegAtomPtr atom) { 1708 int ret = 1; 1709 int transnr; 1710 xmlRegTransPtr t1; 1711 1712 if (state == NULL) 1713 return(ret); 1714 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1715 t1 = &(state->trans[transnr]); 1716 /* 1717 * check transitions conflicting with the one looked at 1718 */ 1719 if (t1->atom == NULL) { 1720 if (t1->to == -1) 1721 continue; 1722 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1723 to, atom); 1724 if (ret == 0) 1725 return(0); 1726 continue; 1727 } 1728 if (t1->to != to) 1729 continue; 1730 if (xmlFACompareAtoms(t1->atom, atom)) 1731 return(0); 1732 } 1733 return(ret); 1734} 1735 1736/** 1737 * xmlFAComputesDeterminism: 1738 * @ctxt: a regexp parser context 1739 * 1740 * Check whether the associated regexp is determinist, 1741 * should be called after xmlFAEliminateEpsilonTransitions() 1742 * 1743 */ 1744static int 1745xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 1746 int statenr, transnr; 1747 xmlRegStatePtr state; 1748 xmlRegTransPtr t1, t2; 1749 int i; 1750 int ret = 1; 1751 1752#ifdef DEBUG_REGEXP_GRAPH 1753 printf("xmlFAComputesDeterminism\n"); 1754 xmlRegPrintCtxt(stdout, ctxt); 1755#endif 1756 if (ctxt->determinist != -1) 1757 return(ctxt->determinist); 1758 1759 /* 1760 * Check for all states that there isn't 2 transitions 1761 * with the same atom and a different target. 1762 */ 1763 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1764 state = ctxt->states[statenr]; 1765 if (state == NULL) 1766 continue; 1767 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1768 t1 = &(state->trans[transnr]); 1769 /* 1770 * Determinism checks in case of counted or all transitions 1771 * will have to be handled separately 1772 */ 1773 if (t1->atom == NULL) 1774 continue; 1775 if (t1->to == -1) /* eliminated */ 1776 continue; 1777 for (i = 0;i < transnr;i++) { 1778 t2 = &(state->trans[i]); 1779 if (t2->to == -1) /* eliminated */ 1780 continue; 1781 if (t2->atom != NULL) { 1782 if (t1->to == t2->to) { 1783 if (xmlFACompareAtoms(t1->atom, t2->atom)) 1784 t2->to = -1; /* eliminate */ 1785 } else { 1786 /* not determinist ! */ 1787 if (xmlFACompareAtoms(t1->atom, t2->atom)) 1788 ret = 0; 1789 } 1790 } else if (t1->to != -1) { 1791 /* 1792 * do the closure in case of remaining specific 1793 * epsilon transitions like choices or all 1794 */ 1795 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1796 t2->to, t2->atom); 1797 if (ret == 0) 1798 return(0); 1799 } 1800 } 1801 if (ret == 0) 1802 break; 1803 } 1804 if (ret == 0) 1805 break; 1806 } 1807 ctxt->determinist = ret; 1808 return(ret); 1809} 1810 1811/************************************************************************ 1812 * * 1813 * Routines to check input against transition atoms * 1814 * * 1815 ************************************************************************/ 1816 1817static int 1818xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 1819 int start, int end, const xmlChar *blockName) { 1820 int ret = 0; 1821 1822 switch (type) { 1823 case XML_REGEXP_STRING: 1824 case XML_REGEXP_SUBREG: 1825 case XML_REGEXP_RANGES: 1826 case XML_REGEXP_EPSILON: 1827 return(-1); 1828 case XML_REGEXP_ANYCHAR: 1829 ret = ((codepoint != '\n') && (codepoint != '\r')); 1830 break; 1831 case XML_REGEXP_CHARVAL: 1832 ret = ((codepoint >= start) && (codepoint <= end)); 1833 break; 1834 case XML_REGEXP_NOTSPACE: 1835 neg = !neg; 1836 case XML_REGEXP_ANYSPACE: 1837 ret = ((codepoint == '\n') || (codepoint == '\r') || 1838 (codepoint == '\t') || (codepoint == ' ')); 1839 break; 1840 case XML_REGEXP_NOTINITNAME: 1841 neg = !neg; 1842 case XML_REGEXP_INITNAME: 1843 ret = (IS_LETTER(codepoint) || 1844 (codepoint == '_') || (codepoint == ':')); 1845 break; 1846 case XML_REGEXP_NOTNAMECHAR: 1847 neg = !neg; 1848 case XML_REGEXP_NAMECHAR: 1849 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 1850 (codepoint == '.') || (codepoint == '-') || 1851 (codepoint == '_') || (codepoint == ':') || 1852 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 1853 break; 1854 case XML_REGEXP_NOTDECIMAL: 1855 neg = !neg; 1856 case XML_REGEXP_DECIMAL: 1857 ret = xmlUCSIsCatNd(codepoint); 1858 break; 1859 case XML_REGEXP_REALCHAR: 1860 neg = !neg; 1861 case XML_REGEXP_NOTREALCHAR: 1862 ret = xmlUCSIsCatP(codepoint); 1863 if (ret == 0) 1864 ret = xmlUCSIsCatZ(codepoint); 1865 if (ret == 0) 1866 ret = xmlUCSIsCatC(codepoint); 1867 break; 1868 case XML_REGEXP_LETTER: 1869 ret = xmlUCSIsCatL(codepoint); 1870 break; 1871 case XML_REGEXP_LETTER_UPPERCASE: 1872 ret = xmlUCSIsCatLu(codepoint); 1873 break; 1874 case XML_REGEXP_LETTER_LOWERCASE: 1875 ret = xmlUCSIsCatLl(codepoint); 1876 break; 1877 case XML_REGEXP_LETTER_TITLECASE: 1878 ret = xmlUCSIsCatLt(codepoint); 1879 break; 1880 case XML_REGEXP_LETTER_MODIFIER: 1881 ret = xmlUCSIsCatLm(codepoint); 1882 break; 1883 case XML_REGEXP_LETTER_OTHERS: 1884 ret = xmlUCSIsCatLo(codepoint); 1885 break; 1886 case XML_REGEXP_MARK: 1887 ret = xmlUCSIsCatM(codepoint); 1888 break; 1889 case XML_REGEXP_MARK_NONSPACING: 1890 ret = xmlUCSIsCatMn(codepoint); 1891 break; 1892 case XML_REGEXP_MARK_SPACECOMBINING: 1893 ret = xmlUCSIsCatMc(codepoint); 1894 break; 1895 case XML_REGEXP_MARK_ENCLOSING: 1896 ret = xmlUCSIsCatMe(codepoint); 1897 break; 1898 case XML_REGEXP_NUMBER: 1899 ret = xmlUCSIsCatN(codepoint); 1900 break; 1901 case XML_REGEXP_NUMBER_DECIMAL: 1902 ret = xmlUCSIsCatNd(codepoint); 1903 break; 1904 case XML_REGEXP_NUMBER_LETTER: 1905 ret = xmlUCSIsCatNl(codepoint); 1906 break; 1907 case XML_REGEXP_NUMBER_OTHERS: 1908 ret = xmlUCSIsCatNo(codepoint); 1909 break; 1910 case XML_REGEXP_PUNCT: 1911 ret = xmlUCSIsCatP(codepoint); 1912 break; 1913 case XML_REGEXP_PUNCT_CONNECTOR: 1914 ret = xmlUCSIsCatPc(codepoint); 1915 break; 1916 case XML_REGEXP_PUNCT_DASH: 1917 ret = xmlUCSIsCatPd(codepoint); 1918 break; 1919 case XML_REGEXP_PUNCT_OPEN: 1920 ret = xmlUCSIsCatPs(codepoint); 1921 break; 1922 case XML_REGEXP_PUNCT_CLOSE: 1923 ret = xmlUCSIsCatPe(codepoint); 1924 break; 1925 case XML_REGEXP_PUNCT_INITQUOTE: 1926 ret = xmlUCSIsCatPi(codepoint); 1927 break; 1928 case XML_REGEXP_PUNCT_FINQUOTE: 1929 ret = xmlUCSIsCatPf(codepoint); 1930 break; 1931 case XML_REGEXP_PUNCT_OTHERS: 1932 ret = xmlUCSIsCatPo(codepoint); 1933 break; 1934 case XML_REGEXP_SEPAR: 1935 ret = xmlUCSIsCatZ(codepoint); 1936 break; 1937 case XML_REGEXP_SEPAR_SPACE: 1938 ret = xmlUCSIsCatZs(codepoint); 1939 break; 1940 case XML_REGEXP_SEPAR_LINE: 1941 ret = xmlUCSIsCatZl(codepoint); 1942 break; 1943 case XML_REGEXP_SEPAR_PARA: 1944 ret = xmlUCSIsCatZp(codepoint); 1945 break; 1946 case XML_REGEXP_SYMBOL: 1947 ret = xmlUCSIsCatS(codepoint); 1948 break; 1949 case XML_REGEXP_SYMBOL_MATH: 1950 ret = xmlUCSIsCatSm(codepoint); 1951 break; 1952 case XML_REGEXP_SYMBOL_CURRENCY: 1953 ret = xmlUCSIsCatSc(codepoint); 1954 break; 1955 case XML_REGEXP_SYMBOL_MODIFIER: 1956 ret = xmlUCSIsCatSk(codepoint); 1957 break; 1958 case XML_REGEXP_SYMBOL_OTHERS: 1959 ret = xmlUCSIsCatSo(codepoint); 1960 break; 1961 case XML_REGEXP_OTHER: 1962 ret = xmlUCSIsCatC(codepoint); 1963 break; 1964 case XML_REGEXP_OTHER_CONTROL: 1965 ret = xmlUCSIsCatCc(codepoint); 1966 break; 1967 case XML_REGEXP_OTHER_FORMAT: 1968 ret = xmlUCSIsCatCf(codepoint); 1969 break; 1970 case XML_REGEXP_OTHER_PRIVATE: 1971 ret = xmlUCSIsCatCo(codepoint); 1972 break; 1973 case XML_REGEXP_OTHER_NA: 1974 /* ret = xmlUCSIsCatCn(codepoint); */ 1975 /* Seems it doesn't exist anymore in recent Unicode releases */ 1976 ret = 0; 1977 break; 1978 case XML_REGEXP_BLOCK_NAME: 1979 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 1980 break; 1981 } 1982 if (neg) 1983 return(!ret); 1984 return(ret); 1985} 1986 1987static int 1988xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 1989 int i, ret = 0; 1990 xmlRegRangePtr range; 1991 1992 if ((atom == NULL) || (!IS_CHAR(codepoint))) 1993 return(-1); 1994 1995 switch (atom->type) { 1996 case XML_REGEXP_SUBREG: 1997 case XML_REGEXP_EPSILON: 1998 return(-1); 1999 case XML_REGEXP_CHARVAL: 2000 return(codepoint == atom->codepoint); 2001 case XML_REGEXP_RANGES: { 2002 int accept = 0; 2003 2004 for (i = 0;i < atom->nbRanges;i++) { 2005 range = atom->ranges[i]; 2006 if (range->neg == 2) { 2007 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2008 0, range->start, range->end, 2009 range->blockName); 2010 if (ret != 0) 2011 return(0); /* excluded char */ 2012 } else if (range->neg) { 2013 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2014 0, range->start, range->end, 2015 range->blockName); 2016 if (ret == 0) 2017 accept = 1; 2018 else 2019 return(0); 2020 } else { 2021 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2022 0, range->start, range->end, 2023 range->blockName); 2024 if (ret != 0) 2025 accept = 1; /* might still be excluded */ 2026 } 2027 } 2028 return(accept); 2029 } 2030 case XML_REGEXP_STRING: 2031 printf("TODO: XML_REGEXP_STRING\n"); 2032 return(-1); 2033 case XML_REGEXP_ANYCHAR: 2034 case XML_REGEXP_ANYSPACE: 2035 case XML_REGEXP_NOTSPACE: 2036 case XML_REGEXP_INITNAME: 2037 case XML_REGEXP_NOTINITNAME: 2038 case XML_REGEXP_NAMECHAR: 2039 case XML_REGEXP_NOTNAMECHAR: 2040 case XML_REGEXP_DECIMAL: 2041 case XML_REGEXP_NOTDECIMAL: 2042 case XML_REGEXP_REALCHAR: 2043 case XML_REGEXP_NOTREALCHAR: 2044 case XML_REGEXP_LETTER: 2045 case XML_REGEXP_LETTER_UPPERCASE: 2046 case XML_REGEXP_LETTER_LOWERCASE: 2047 case XML_REGEXP_LETTER_TITLECASE: 2048 case XML_REGEXP_LETTER_MODIFIER: 2049 case XML_REGEXP_LETTER_OTHERS: 2050 case XML_REGEXP_MARK: 2051 case XML_REGEXP_MARK_NONSPACING: 2052 case XML_REGEXP_MARK_SPACECOMBINING: 2053 case XML_REGEXP_MARK_ENCLOSING: 2054 case XML_REGEXP_NUMBER: 2055 case XML_REGEXP_NUMBER_DECIMAL: 2056 case XML_REGEXP_NUMBER_LETTER: 2057 case XML_REGEXP_NUMBER_OTHERS: 2058 case XML_REGEXP_PUNCT: 2059 case XML_REGEXP_PUNCT_CONNECTOR: 2060 case XML_REGEXP_PUNCT_DASH: 2061 case XML_REGEXP_PUNCT_OPEN: 2062 case XML_REGEXP_PUNCT_CLOSE: 2063 case XML_REGEXP_PUNCT_INITQUOTE: 2064 case XML_REGEXP_PUNCT_FINQUOTE: 2065 case XML_REGEXP_PUNCT_OTHERS: 2066 case XML_REGEXP_SEPAR: 2067 case XML_REGEXP_SEPAR_SPACE: 2068 case XML_REGEXP_SEPAR_LINE: 2069 case XML_REGEXP_SEPAR_PARA: 2070 case XML_REGEXP_SYMBOL: 2071 case XML_REGEXP_SYMBOL_MATH: 2072 case XML_REGEXP_SYMBOL_CURRENCY: 2073 case XML_REGEXP_SYMBOL_MODIFIER: 2074 case XML_REGEXP_SYMBOL_OTHERS: 2075 case XML_REGEXP_OTHER: 2076 case XML_REGEXP_OTHER_CONTROL: 2077 case XML_REGEXP_OTHER_FORMAT: 2078 case XML_REGEXP_OTHER_PRIVATE: 2079 case XML_REGEXP_OTHER_NA: 2080 case XML_REGEXP_BLOCK_NAME: 2081 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 2082 (const xmlChar *)atom->valuep); 2083 if (atom->neg) 2084 ret = !ret; 2085 break; 2086 } 2087 return(ret); 2088} 2089 2090/************************************************************************ 2091 * * 2092 * Saving an restoring state of an execution context * 2093 * * 2094 ************************************************************************/ 2095 2096#ifdef DEBUG_REGEXP_EXEC 2097static void 2098xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 2099 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 2100 if (exec->inputStack != NULL) { 2101 int i; 2102 printf(": "); 2103 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 2104 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 2105 } else { 2106 printf(": %s", &(exec->inputString[exec->index])); 2107 } 2108 printf("\n"); 2109} 2110#endif 2111 2112static void 2113xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 2114#ifdef DEBUG_REGEXP_EXEC 2115 printf("saving "); 2116 exec->transno++; 2117 xmlFARegDebugExec(exec); 2118 exec->transno--; 2119#endif 2120 2121 if (exec->maxRollbacks == 0) { 2122 exec->maxRollbacks = 4; 2123 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 2124 sizeof(xmlRegExecRollback)); 2125 if (exec->rollbacks == NULL) { 2126 xmlRegexpErrMemory(NULL, "saving regexp"); 2127 exec->maxRollbacks = 0; 2128 return; 2129 } 2130 memset(exec->rollbacks, 0, 2131 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2132 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 2133 xmlRegExecRollback *tmp; 2134 int len = exec->maxRollbacks; 2135 2136 exec->maxRollbacks *= 2; 2137 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 2138 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2139 if (tmp == NULL) { 2140 xmlRegexpErrMemory(NULL, "saving regexp"); 2141 exec->maxRollbacks /= 2; 2142 return; 2143 } 2144 exec->rollbacks = tmp; 2145 tmp = &exec->rollbacks[len]; 2146 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2147 } 2148 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2149 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2150 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2151 if (exec->comp->nbCounters > 0) { 2152 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2153 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2154 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2155 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2156 xmlRegexpErrMemory(NULL, "saving regexp"); 2157 exec->status = -5; 2158 return; 2159 } 2160 } 2161 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2162 exec->comp->nbCounters * sizeof(int)); 2163 } 2164 exec->nbRollbacks++; 2165} 2166 2167static void 2168xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2169 if (exec->nbRollbacks <= 0) { 2170 exec->status = -1; 2171#ifdef DEBUG_REGEXP_EXEC 2172 printf("rollback failed on empty stack\n"); 2173#endif 2174 return; 2175 } 2176 exec->nbRollbacks--; 2177 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2178 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2179 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2180 if (exec->comp->nbCounters > 0) { 2181 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2182 fprintf(stderr, "exec save: allocation failed"); 2183 exec->status = -6; 2184 return; 2185 } 2186 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2187 exec->comp->nbCounters * sizeof(int)); 2188 } 2189 2190#ifdef DEBUG_REGEXP_EXEC 2191 printf("restored "); 2192 xmlFARegDebugExec(exec); 2193#endif 2194} 2195 2196/************************************************************************ 2197 * * 2198 * Verifyer, running an input against a compiled regexp * 2199 * * 2200 ************************************************************************/ 2201 2202static int 2203xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2204 xmlRegExecCtxt execval; 2205 xmlRegExecCtxtPtr exec = &execval; 2206 int ret, codepoint, len; 2207 2208 exec->inputString = content; 2209 exec->index = 0; 2210 exec->determinist = 1; 2211 exec->maxRollbacks = 0; 2212 exec->nbRollbacks = 0; 2213 exec->rollbacks = NULL; 2214 exec->status = 0; 2215 exec->comp = comp; 2216 exec->state = comp->states[0]; 2217 exec->transno = 0; 2218 exec->transcount = 0; 2219 exec->inputStack = NULL; 2220 exec->inputStackMax = 0; 2221 if (comp->nbCounters > 0) { 2222 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2223 if (exec->counts == NULL) { 2224 xmlRegexpErrMemory(NULL, "running regexp"); 2225 return(-1); 2226 } 2227 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2228 } else 2229 exec->counts = NULL; 2230 while ((exec->status == 0) && 2231 ((exec->inputString[exec->index] != 0) || 2232 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2233 xmlRegTransPtr trans; 2234 xmlRegAtomPtr atom; 2235 2236 /* 2237 * End of input on non-terminal state, rollback, however we may 2238 * still have epsilon like transition for counted transitions 2239 * on counters, in that case don't break too early. 2240 */ 2241 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2242 goto rollback; 2243 2244 exec->transcount = 0; 2245 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2246 trans = &exec->state->trans[exec->transno]; 2247 if (trans->to < 0) 2248 continue; 2249 atom = trans->atom; 2250 ret = 0; 2251 if (trans->count >= 0) { 2252 int count; 2253 xmlRegCounterPtr counter; 2254 2255 /* 2256 * A counted transition. 2257 */ 2258 2259 count = exec->counts[trans->count]; 2260 counter = &exec->comp->counters[trans->count]; 2261#ifdef DEBUG_REGEXP_EXEC 2262 printf("testing count %d: val %d, min %d, max %d\n", 2263 trans->count, count, counter->min, counter->max); 2264#endif 2265 ret = ((count >= counter->min) && (count <= counter->max)); 2266 } else if (atom == NULL) { 2267 fprintf(stderr, "epsilon transition left at runtime\n"); 2268 exec->status = -2; 2269 break; 2270 } else if (exec->inputString[exec->index] != 0) { 2271 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2272 ret = xmlRegCheckCharacter(atom, codepoint); 2273 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2274 xmlRegStatePtr to = comp->states[trans->to]; 2275 2276 /* 2277 * this is a multiple input sequence 2278 */ 2279 if (exec->state->nbTrans > exec->transno + 1) { 2280 xmlFARegExecSave(exec); 2281 } 2282 exec->transcount = 1; 2283 do { 2284 /* 2285 * Try to progress as much as possible on the input 2286 */ 2287 if (exec->transcount == atom->max) { 2288 break; 2289 } 2290 exec->index += len; 2291 /* 2292 * End of input: stop here 2293 */ 2294 if (exec->inputString[exec->index] == 0) { 2295 exec->index -= len; 2296 break; 2297 } 2298 if (exec->transcount >= atom->min) { 2299 int transno = exec->transno; 2300 xmlRegStatePtr state = exec->state; 2301 2302 /* 2303 * The transition is acceptable save it 2304 */ 2305 exec->transno = -1; /* trick */ 2306 exec->state = to; 2307 xmlFARegExecSave(exec); 2308 exec->transno = transno; 2309 exec->state = state; 2310 } 2311 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2312 len); 2313 ret = xmlRegCheckCharacter(atom, codepoint); 2314 exec->transcount++; 2315 } while (ret == 1); 2316 if (exec->transcount < atom->min) 2317 ret = 0; 2318 2319 /* 2320 * If the last check failed but one transition was found 2321 * possible, rollback 2322 */ 2323 if (ret < 0) 2324 ret = 0; 2325 if (ret == 0) { 2326 goto rollback; 2327 } 2328 } 2329 } 2330 if (ret == 1) { 2331 if (exec->state->nbTrans > exec->transno + 1) { 2332 xmlFARegExecSave(exec); 2333 } 2334 if (trans->counter >= 0) { 2335#ifdef DEBUG_REGEXP_EXEC 2336 printf("Increasing count %d\n", trans->counter); 2337#endif 2338 exec->counts[trans->counter]++; 2339 } 2340#ifdef DEBUG_REGEXP_EXEC 2341 printf("entering state %d\n", trans->to); 2342#endif 2343 exec->state = comp->states[trans->to]; 2344 exec->transno = 0; 2345 if (trans->atom != NULL) { 2346 exec->index += len; 2347 } 2348 goto progress; 2349 } else if (ret < 0) { 2350 exec->status = -4; 2351 break; 2352 } 2353 } 2354 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2355rollback: 2356 /* 2357 * Failed to find a way out 2358 */ 2359 exec->determinist = 0; 2360 xmlFARegExecRollBack(exec); 2361 } 2362progress: 2363 continue; 2364 } 2365 if (exec->rollbacks != NULL) { 2366 if (exec->counts != NULL) { 2367 int i; 2368 2369 for (i = 0;i < exec->maxRollbacks;i++) 2370 if (exec->rollbacks[i].counts != NULL) 2371 xmlFree(exec->rollbacks[i].counts); 2372 } 2373 xmlFree(exec->rollbacks); 2374 } 2375 if (exec->counts != NULL) 2376 xmlFree(exec->counts); 2377 if (exec->status == 0) 2378 return(1); 2379 if (exec->status == -1) 2380 return(0); 2381 return(exec->status); 2382} 2383 2384/************************************************************************ 2385 * * 2386 * Progressive interface to the verifyer one atom at a time * 2387 * * 2388 ************************************************************************/ 2389 2390/** 2391 * xmlRegNewExecCtxt: 2392 * @comp: a precompiled regular expression 2393 * @callback: a callback function used for handling progresses in the 2394 * automata matching phase 2395 * @data: the context data associated to the callback in this context 2396 * 2397 * Build a context used for progressive evaluation of a regexp. 2398 * 2399 * Returns the new context 2400 */ 2401xmlRegExecCtxtPtr 2402xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 2403 xmlRegExecCtxtPtr exec; 2404 2405 if (comp == NULL) 2406 return(NULL); 2407 if ((comp->compact == NULL) && (comp->states == NULL)) 2408 return(NULL); 2409 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 2410 if (exec == NULL) { 2411 xmlRegexpErrMemory(NULL, "creating execution context"); 2412 return(NULL); 2413 } 2414 memset(exec, 0, sizeof(xmlRegExecCtxt)); 2415 exec->inputString = NULL; 2416 exec->index = 0; 2417 exec->determinist = 1; 2418 exec->maxRollbacks = 0; 2419 exec->nbRollbacks = 0; 2420 exec->rollbacks = NULL; 2421 exec->status = 0; 2422 exec->comp = comp; 2423 if (comp->compact == NULL) 2424 exec->state = comp->states[0]; 2425 exec->transno = 0; 2426 exec->transcount = 0; 2427 exec->callback = callback; 2428 exec->data = data; 2429 if (comp->nbCounters > 0) { 2430 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2431 if (exec->counts == NULL) { 2432 xmlRegexpErrMemory(NULL, "creating execution context"); 2433 xmlFree(exec); 2434 return(NULL); 2435 } 2436 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2437 } else 2438 exec->counts = NULL; 2439 exec->inputStackMax = 0; 2440 exec->inputStackNr = 0; 2441 exec->inputStack = NULL; 2442 return(exec); 2443} 2444 2445/** 2446 * xmlRegFreeExecCtxt: 2447 * @exec: a regular expression evaulation context 2448 * 2449 * Free the structures associated to a regular expression evaulation context. 2450 */ 2451void 2452xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 2453 if (exec == NULL) 2454 return; 2455 2456 if (exec->rollbacks != NULL) { 2457 if (exec->counts != NULL) { 2458 int i; 2459 2460 for (i = 0;i < exec->maxRollbacks;i++) 2461 if (exec->rollbacks[i].counts != NULL) 2462 xmlFree(exec->rollbacks[i].counts); 2463 } 2464 xmlFree(exec->rollbacks); 2465 } 2466 if (exec->counts != NULL) 2467 xmlFree(exec->counts); 2468 if (exec->inputStack != NULL) { 2469 int i; 2470 2471 for (i = 0;i < exec->inputStackNr;i++) { 2472 if (exec->inputStack[i].value != NULL) 2473 xmlFree(exec->inputStack[i].value); 2474 } 2475 xmlFree(exec->inputStack); 2476 } 2477 xmlFree(exec); 2478} 2479 2480static void 2481xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2482 void *data) { 2483#ifdef DEBUG_PUSH 2484 printf("saving value: %d:%s\n", exec->inputStackNr, value); 2485#endif 2486 if (exec->inputStackMax == 0) { 2487 exec->inputStackMax = 4; 2488 exec->inputStack = (xmlRegInputTokenPtr) 2489 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 2490 if (exec->inputStack == NULL) { 2491 xmlRegexpErrMemory(NULL, "pushing input string"); 2492 exec->inputStackMax = 0; 2493 return; 2494 } 2495 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 2496 xmlRegInputTokenPtr tmp; 2497 2498 exec->inputStackMax *= 2; 2499 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 2500 exec->inputStackMax * sizeof(xmlRegInputToken)); 2501 if (tmp == NULL) { 2502 xmlRegexpErrMemory(NULL, "pushing input string"); 2503 exec->inputStackMax /= 2; 2504 return; 2505 } 2506 exec->inputStack = tmp; 2507 } 2508 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 2509 exec->inputStack[exec->inputStackNr].data = data; 2510 exec->inputStackNr++; 2511 exec->inputStack[exec->inputStackNr].value = NULL; 2512 exec->inputStack[exec->inputStackNr].data = NULL; 2513} 2514 2515 2516/** 2517 * xmlRegCompactPushString: 2518 * @exec: a regexp execution context 2519 * @comp: the precompiled exec with a compact table 2520 * @value: a string token input 2521 * @data: data associated to the token to reuse in callbacks 2522 * 2523 * Push one input token in the execution context 2524 * 2525 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2526 * a negative value in case of error. 2527 */ 2528static int 2529xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 2530 xmlRegexpPtr comp, 2531 const xmlChar *value, 2532 void *data) { 2533 int state = exec->index; 2534 int i, target; 2535 2536 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 2537 return(-1); 2538 2539 if (value == NULL) { 2540 /* 2541 * are we at a final state ? 2542 */ 2543 if (comp->compact[state * (comp->nbstrings + 1)] == 2544 XML_REGEXP_FINAL_STATE) 2545 return(1); 2546 return(0); 2547 } 2548 2549#ifdef DEBUG_PUSH 2550 printf("value pushed: %s\n", value); 2551#endif 2552 2553 /* 2554 * Examine all outside transition from current state 2555 */ 2556 for (i = 0;i < comp->nbstrings;i++) { 2557 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 2558 if ((target > 0) && (target <= comp->nbstates)) { 2559 target--; /* to avoid 0 */ 2560 if (xmlStrEqual(comp->stringMap[i], value)) { 2561 exec->index = target; 2562 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 2563 exec->callback(exec->data, value, 2564 comp->transdata[state * comp->nbstrings + i], data); 2565 } 2566#ifdef DEBUG_PUSH 2567 printf("entering state %d\n", target); 2568#endif 2569 if (comp->compact[target * (comp->nbstrings + 1)] == 2570 XML_REGEXP_FINAL_STATE) 2571 return(1); 2572 return(0); 2573 } 2574 } 2575 } 2576 /* 2577 * Failed to find an exit transition out from current state for the 2578 * current token 2579 */ 2580#ifdef DEBUG_PUSH 2581 printf("failed to find a transition for %s on state %d\n", value, state); 2582#endif 2583 exec->status = -1; 2584 return(-1); 2585} 2586 2587/** 2588 * xmlRegExecPushString: 2589 * @exec: a regexp execution context or NULL to indicate the end 2590 * @value: a string token input 2591 * @data: data associated to the token to reuse in callbacks 2592 * 2593 * Push one input token in the execution context 2594 * 2595 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2596 * a negative value in case of error. 2597 */ 2598int 2599xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2600 void *data) { 2601 xmlRegTransPtr trans; 2602 xmlRegAtomPtr atom; 2603 int ret; 2604 int final = 0; 2605 2606 if (exec == NULL) 2607 return(-1); 2608 if (exec->comp == NULL) 2609 return(-1); 2610 if (exec->status != 0) 2611 return(exec->status); 2612 2613 if (exec->comp->compact != NULL) 2614 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 2615 2616 if (value == NULL) { 2617 if (exec->state->type == XML_REGEXP_FINAL_STATE) 2618 return(1); 2619 final = 1; 2620 } 2621 2622#ifdef DEBUG_PUSH 2623 printf("value pushed: %s\n", value); 2624#endif 2625 /* 2626 * If we have an active rollback stack push the new value there 2627 * and get back to where we were left 2628 */ 2629 if ((value != NULL) && (exec->inputStackNr > 0)) { 2630 xmlFARegExecSaveInputString(exec, value, data); 2631 value = exec->inputStack[exec->index].value; 2632 data = exec->inputStack[exec->index].data; 2633#ifdef DEBUG_PUSH 2634 printf("value loaded: %s\n", value); 2635#endif 2636 } 2637 2638 while ((exec->status == 0) && 2639 ((value != NULL) || 2640 ((final == 1) && 2641 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 2642 2643 /* 2644 * End of input on non-terminal state, rollback, however we may 2645 * still have epsilon like transition for counted transitions 2646 * on counters, in that case don't break too early. 2647 */ 2648 if ((value == NULL) && (exec->counts == NULL)) 2649 goto rollback; 2650 2651 exec->transcount = 0; 2652 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2653 trans = &exec->state->trans[exec->transno]; 2654 if (trans->to < 0) 2655 continue; 2656 atom = trans->atom; 2657 ret = 0; 2658 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 2659 int i; 2660 int count; 2661 xmlRegTransPtr t; 2662 xmlRegCounterPtr counter; 2663 2664 ret = 0; 2665 2666#ifdef DEBUG_PUSH 2667 printf("testing all lax %d\n", trans->count); 2668#endif 2669 /* 2670 * Check all counted transitions from the current state 2671 */ 2672 if ((value == NULL) && (final)) { 2673 ret = 1; 2674 } else if (value != NULL) { 2675 for (i = 0;i < exec->state->nbTrans;i++) { 2676 t = &exec->state->trans[i]; 2677 if ((t->counter < 0) || (t == trans)) 2678 continue; 2679 counter = &exec->comp->counters[t->counter]; 2680 count = exec->counts[t->counter]; 2681 if ((count < counter->max) && 2682 (t->atom != NULL) && 2683 (xmlStrEqual(value, t->atom->valuep))) { 2684 ret = 0; 2685 break; 2686 } 2687 if ((count >= counter->min) && 2688 (count < counter->max) && 2689 (xmlStrEqual(value, t->atom->valuep))) { 2690 ret = 1; 2691 break; 2692 } 2693 } 2694 } 2695 } else if (trans->count == REGEXP_ALL_COUNTER) { 2696 int i; 2697 int count; 2698 xmlRegTransPtr t; 2699 xmlRegCounterPtr counter; 2700 2701 ret = 1; 2702 2703#ifdef DEBUG_PUSH 2704 printf("testing all %d\n", trans->count); 2705#endif 2706 /* 2707 * Check all counted transitions from the current state 2708 */ 2709 for (i = 0;i < exec->state->nbTrans;i++) { 2710 t = &exec->state->trans[i]; 2711 if ((t->counter < 0) || (t == trans)) 2712 continue; 2713 counter = &exec->comp->counters[t->counter]; 2714 count = exec->counts[t->counter]; 2715 if ((count < counter->min) || (count > counter->max)) { 2716 ret = 0; 2717 break; 2718 } 2719 } 2720 } else if (trans->count >= 0) { 2721 int count; 2722 xmlRegCounterPtr counter; 2723 2724 /* 2725 * A counted transition. 2726 */ 2727 2728 count = exec->counts[trans->count]; 2729 counter = &exec->comp->counters[trans->count]; 2730#ifdef DEBUG_PUSH 2731 printf("testing count %d: val %d, min %d, max %d\n", 2732 trans->count, count, counter->min, counter->max); 2733#endif 2734 ret = ((count >= counter->min) && (count <= counter->max)); 2735 } else if (atom == NULL) { 2736 fprintf(stderr, "epsilon transition left at runtime\n"); 2737 exec->status = -2; 2738 break; 2739 } else if (value != NULL) { 2740 ret = xmlStrEqual(value, atom->valuep); 2741 if ((ret == 1) && (trans->counter >= 0)) { 2742 xmlRegCounterPtr counter; 2743 int count; 2744 2745 count = exec->counts[trans->counter]; 2746 counter = &exec->comp->counters[trans->counter]; 2747 if (count >= counter->max) 2748 ret = 0; 2749 } 2750 2751 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2752 xmlRegStatePtr to = exec->comp->states[trans->to]; 2753 2754 /* 2755 * this is a multiple input sequence 2756 */ 2757 if (exec->state->nbTrans > exec->transno + 1) { 2758 if (exec->inputStackNr <= 0) { 2759 xmlFARegExecSaveInputString(exec, value, data); 2760 } 2761 xmlFARegExecSave(exec); 2762 } 2763 exec->transcount = 1; 2764 do { 2765 /* 2766 * Try to progress as much as possible on the input 2767 */ 2768 if (exec->transcount == atom->max) { 2769 break; 2770 } 2771 exec->index++; 2772 value = exec->inputStack[exec->index].value; 2773 data = exec->inputStack[exec->index].data; 2774#ifdef DEBUG_PUSH 2775 printf("value loaded: %s\n", value); 2776#endif 2777 2778 /* 2779 * End of input: stop here 2780 */ 2781 if (value == NULL) { 2782 exec->index --; 2783 break; 2784 } 2785 if (exec->transcount >= atom->min) { 2786 int transno = exec->transno; 2787 xmlRegStatePtr state = exec->state; 2788 2789 /* 2790 * The transition is acceptable save it 2791 */ 2792 exec->transno = -1; /* trick */ 2793 exec->state = to; 2794 if (exec->inputStackNr <= 0) { 2795 xmlFARegExecSaveInputString(exec, value, data); 2796 } 2797 xmlFARegExecSave(exec); 2798 exec->transno = transno; 2799 exec->state = state; 2800 } 2801 ret = xmlStrEqual(value, atom->valuep); 2802 exec->transcount++; 2803 } while (ret == 1); 2804 if (exec->transcount < atom->min) 2805 ret = 0; 2806 2807 /* 2808 * If the last check failed but one transition was found 2809 * possible, rollback 2810 */ 2811 if (ret < 0) 2812 ret = 0; 2813 if (ret == 0) { 2814 goto rollback; 2815 } 2816 } 2817 } 2818 if (ret == 1) { 2819 if ((exec->callback != NULL) && (atom != NULL) && 2820 (data != NULL)) { 2821 exec->callback(exec->data, atom->valuep, 2822 atom->data, data); 2823 } 2824 if (exec->state->nbTrans > exec->transno + 1) { 2825 if (exec->inputStackNr <= 0) { 2826 xmlFARegExecSaveInputString(exec, value, data); 2827 } 2828 xmlFARegExecSave(exec); 2829 } 2830 if (trans->counter >= 0) { 2831#ifdef DEBUG_PUSH 2832 printf("Increasing count %d\n", trans->counter); 2833#endif 2834 exec->counts[trans->counter]++; 2835 } 2836#ifdef DEBUG_PUSH 2837 printf("entering state %d\n", trans->to); 2838#endif 2839 exec->state = exec->comp->states[trans->to]; 2840 exec->transno = 0; 2841 if (trans->atom != NULL) { 2842 if (exec->inputStack != NULL) { 2843 exec->index++; 2844 if (exec->index < exec->inputStackNr) { 2845 value = exec->inputStack[exec->index].value; 2846 data = exec->inputStack[exec->index].data; 2847#ifdef DEBUG_PUSH 2848 printf("value loaded: %s\n", value); 2849#endif 2850 } else { 2851 value = NULL; 2852 data = NULL; 2853#ifdef DEBUG_PUSH 2854 printf("end of input\n"); 2855#endif 2856 } 2857 } else { 2858 value = NULL; 2859 data = NULL; 2860#ifdef DEBUG_PUSH 2861 printf("end of input\n"); 2862#endif 2863 } 2864 } 2865 goto progress; 2866 } else if (ret < 0) { 2867 exec->status = -4; 2868 break; 2869 } 2870 } 2871 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2872rollback: 2873 /* 2874 * Failed to find a way out 2875 */ 2876 exec->determinist = 0; 2877 xmlFARegExecRollBack(exec); 2878 if (exec->status == 0) { 2879 value = exec->inputStack[exec->index].value; 2880 data = exec->inputStack[exec->index].data; 2881#ifdef DEBUG_PUSH 2882 printf("value loaded: %s\n", value); 2883#endif 2884 } 2885 } 2886progress: 2887 continue; 2888 } 2889 if (exec->status == 0) { 2890 return(exec->state->type == XML_REGEXP_FINAL_STATE); 2891 } 2892 return(exec->status); 2893} 2894 2895/** 2896 * xmlRegExecPushString2: 2897 * @exec: a regexp execution context or NULL to indicate the end 2898 * @value: the first string token input 2899 * @value2: the second string token input 2900 * @data: data associated to the token to reuse in callbacks 2901 * 2902 * Push one input token in the execution context 2903 * 2904 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2905 * a negative value in case of error. 2906 */ 2907int 2908xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 2909 const xmlChar *value2, void *data) { 2910 xmlChar buf[150]; 2911 int lenn, lenp, ret; 2912 xmlChar *str; 2913 2914 if (exec == NULL) 2915 return(-1); 2916 if (exec->comp == NULL) 2917 return(-1); 2918 if (exec->status != 0) 2919 return(exec->status); 2920 2921 if (value2 == NULL) 2922 return(xmlRegExecPushString(exec, value, data)); 2923 2924 lenn = strlen((char *) value2); 2925 lenp = strlen((char *) value); 2926 2927 if (150 < lenn + lenp + 2) { 2928 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 2929 if (str == NULL) { 2930 exec->status = -1; 2931 return(-1); 2932 } 2933 } else { 2934 str = buf; 2935 } 2936 memcpy(&str[0], value, lenp); 2937 str[lenp] = '|'; 2938 memcpy(&str[lenp + 1], value2, lenn); 2939 str[lenn + lenp + 1] = 0; 2940 2941 if (exec->comp->compact != NULL) 2942 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 2943 else 2944 ret = xmlRegExecPushString(exec, str, data); 2945 2946 if (str != buf) 2947 xmlFree(buf); 2948 return(ret); 2949} 2950 2951#if 0 2952static int 2953xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 2954 xmlRegTransPtr trans; 2955 xmlRegAtomPtr atom; 2956 int ret; 2957 int codepoint, len; 2958 2959 if (exec == NULL) 2960 return(-1); 2961 if (exec->status != 0) 2962 return(exec->status); 2963 2964 while ((exec->status == 0) && 2965 ((exec->inputString[exec->index] != 0) || 2966 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2967 2968 /* 2969 * End of input on non-terminal state, rollback, however we may 2970 * still have epsilon like transition for counted transitions 2971 * on counters, in that case don't break too early. 2972 */ 2973 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2974 goto rollback; 2975 2976 exec->transcount = 0; 2977 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2978 trans = &exec->state->trans[exec->transno]; 2979 if (trans->to < 0) 2980 continue; 2981 atom = trans->atom; 2982 ret = 0; 2983 if (trans->count >= 0) { 2984 int count; 2985 xmlRegCounterPtr counter; 2986 2987 /* 2988 * A counted transition. 2989 */ 2990 2991 count = exec->counts[trans->count]; 2992 counter = &exec->comp->counters[trans->count]; 2993#ifdef DEBUG_REGEXP_EXEC 2994 printf("testing count %d: val %d, min %d, max %d\n", 2995 trans->count, count, counter->min, counter->max); 2996#endif 2997 ret = ((count >= counter->min) && (count <= counter->max)); 2998 } else if (atom == NULL) { 2999 fprintf(stderr, "epsilon transition left at runtime\n"); 3000 exec->status = -2; 3001 break; 3002 } else if (exec->inputString[exec->index] != 0) { 3003 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3004 ret = xmlRegCheckCharacter(atom, codepoint); 3005 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3006 xmlRegStatePtr to = exec->comp->states[trans->to]; 3007 3008 /* 3009 * this is a multiple input sequence 3010 */ 3011 if (exec->state->nbTrans > exec->transno + 1) { 3012 xmlFARegExecSave(exec); 3013 } 3014 exec->transcount = 1; 3015 do { 3016 /* 3017 * Try to progress as much as possible on the input 3018 */ 3019 if (exec->transcount == atom->max) { 3020 break; 3021 } 3022 exec->index += len; 3023 /* 3024 * End of input: stop here 3025 */ 3026 if (exec->inputString[exec->index] == 0) { 3027 exec->index -= len; 3028 break; 3029 } 3030 if (exec->transcount >= atom->min) { 3031 int transno = exec->transno; 3032 xmlRegStatePtr state = exec->state; 3033 3034 /* 3035 * The transition is acceptable save it 3036 */ 3037 exec->transno = -1; /* trick */ 3038 exec->state = to; 3039 xmlFARegExecSave(exec); 3040 exec->transno = transno; 3041 exec->state = state; 3042 } 3043 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3044 len); 3045 ret = xmlRegCheckCharacter(atom, codepoint); 3046 exec->transcount++; 3047 } while (ret == 1); 3048 if (exec->transcount < atom->min) 3049 ret = 0; 3050 3051 /* 3052 * If the last check failed but one transition was found 3053 * possible, rollback 3054 */ 3055 if (ret < 0) 3056 ret = 0; 3057 if (ret == 0) { 3058 goto rollback; 3059 } 3060 } 3061 } 3062 if (ret == 1) { 3063 if (exec->state->nbTrans > exec->transno + 1) { 3064 xmlFARegExecSave(exec); 3065 } 3066 if (trans->counter >= 0) { 3067#ifdef DEBUG_REGEXP_EXEC 3068 printf("Increasing count %d\n", trans->counter); 3069#endif 3070 exec->counts[trans->counter]++; 3071 } 3072#ifdef DEBUG_REGEXP_EXEC 3073 printf("entering state %d\n", trans->to); 3074#endif 3075 exec->state = exec->comp->states[trans->to]; 3076 exec->transno = 0; 3077 if (trans->atom != NULL) { 3078 exec->index += len; 3079 } 3080 goto progress; 3081 } else if (ret < 0) { 3082 exec->status = -4; 3083 break; 3084 } 3085 } 3086 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3087rollback: 3088 /* 3089 * Failed to find a way out 3090 */ 3091 exec->determinist = 0; 3092 xmlFARegExecRollBack(exec); 3093 } 3094progress: 3095 continue; 3096 } 3097} 3098#endif 3099/************************************************************************ 3100 * * 3101 * Parser for the Shemas Datatype Regular Expressions * 3102 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 3103 * * 3104 ************************************************************************/ 3105 3106/** 3107 * xmlFAIsChar: 3108 * @ctxt: a regexp parser context 3109 * 3110 * [10] Char ::= [^.\?*+()|#x5B#x5D] 3111 */ 3112static int 3113xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 3114 int cur; 3115 int len; 3116 3117 cur = CUR_SCHAR(ctxt->cur, len); 3118 if ((cur == '.') || (cur == '\\') || (cur == '?') || 3119 (cur == '*') || (cur == '+') || (cur == '(') || 3120 (cur == ')') || (cur == '|') || (cur == 0x5B) || 3121 (cur == 0x5D) || (cur == 0)) 3122 return(-1); 3123 return(cur); 3124} 3125 3126/** 3127 * xmlFAParseCharProp: 3128 * @ctxt: a regexp parser context 3129 * 3130 * [27] charProp ::= IsCategory | IsBlock 3131 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 3132 * Separators | Symbols | Others 3133 * [29] Letters ::= 'L' [ultmo]? 3134 * [30] Marks ::= 'M' [nce]? 3135 * [31] Numbers ::= 'N' [dlo]? 3136 * [32] Punctuation ::= 'P' [cdseifo]? 3137 * [33] Separators ::= 'Z' [slp]? 3138 * [34] Symbols ::= 'S' [mcko]? 3139 * [35] Others ::= 'C' [cfon]? 3140 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 3141 */ 3142static void 3143xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 3144 int cur; 3145 xmlRegAtomType type = (xmlRegAtomType) 0; 3146 xmlChar *blockName = NULL; 3147 3148 cur = CUR; 3149 if (cur == 'L') { 3150 NEXT; 3151 cur = CUR; 3152 if (cur == 'u') { 3153 NEXT; 3154 type = XML_REGEXP_LETTER_UPPERCASE; 3155 } else if (cur == 'l') { 3156 NEXT; 3157 type = XML_REGEXP_LETTER_LOWERCASE; 3158 } else if (cur == 't') { 3159 NEXT; 3160 type = XML_REGEXP_LETTER_TITLECASE; 3161 } else if (cur == 'm') { 3162 NEXT; 3163 type = XML_REGEXP_LETTER_MODIFIER; 3164 } else if (cur == 'o') { 3165 NEXT; 3166 type = XML_REGEXP_LETTER_OTHERS; 3167 } else { 3168 type = XML_REGEXP_LETTER; 3169 } 3170 } else if (cur == 'M') { 3171 NEXT; 3172 cur = CUR; 3173 if (cur == 'n') { 3174 NEXT; 3175 /* nonspacing */ 3176 type = XML_REGEXP_MARK_NONSPACING; 3177 } else if (cur == 'c') { 3178 NEXT; 3179 /* spacing combining */ 3180 type = XML_REGEXP_MARK_SPACECOMBINING; 3181 } else if (cur == 'e') { 3182 NEXT; 3183 /* enclosing */ 3184 type = XML_REGEXP_MARK_ENCLOSING; 3185 } else { 3186 /* all marks */ 3187 type = XML_REGEXP_MARK; 3188 } 3189 } else if (cur == 'N') { 3190 NEXT; 3191 cur = CUR; 3192 if (cur == 'd') { 3193 NEXT; 3194 /* digital */ 3195 type = XML_REGEXP_NUMBER_DECIMAL; 3196 } else if (cur == 'l') { 3197 NEXT; 3198 /* letter */ 3199 type = XML_REGEXP_NUMBER_LETTER; 3200 } else if (cur == 'o') { 3201 NEXT; 3202 /* other */ 3203 type = XML_REGEXP_NUMBER_OTHERS; 3204 } else { 3205 /* all numbers */ 3206 type = XML_REGEXP_NUMBER; 3207 } 3208 } else if (cur == 'P') { 3209 NEXT; 3210 cur = CUR; 3211 if (cur == 'c') { 3212 NEXT; 3213 /* connector */ 3214 type = XML_REGEXP_PUNCT_CONNECTOR; 3215 } else if (cur == 'd') { 3216 NEXT; 3217 /* dash */ 3218 type = XML_REGEXP_PUNCT_DASH; 3219 } else if (cur == 's') { 3220 NEXT; 3221 /* open */ 3222 type = XML_REGEXP_PUNCT_OPEN; 3223 } else if (cur == 'e') { 3224 NEXT; 3225 /* close */ 3226 type = XML_REGEXP_PUNCT_CLOSE; 3227 } else if (cur == 'i') { 3228 NEXT; 3229 /* initial quote */ 3230 type = XML_REGEXP_PUNCT_INITQUOTE; 3231 } else if (cur == 'f') { 3232 NEXT; 3233 /* final quote */ 3234 type = XML_REGEXP_PUNCT_FINQUOTE; 3235 } else if (cur == 'o') { 3236 NEXT; 3237 /* other */ 3238 type = XML_REGEXP_PUNCT_OTHERS; 3239 } else { 3240 /* all punctuation */ 3241 type = XML_REGEXP_PUNCT; 3242 } 3243 } else if (cur == 'Z') { 3244 NEXT; 3245 cur = CUR; 3246 if (cur == 's') { 3247 NEXT; 3248 /* space */ 3249 type = XML_REGEXP_SEPAR_SPACE; 3250 } else if (cur == 'l') { 3251 NEXT; 3252 /* line */ 3253 type = XML_REGEXP_SEPAR_LINE; 3254 } else if (cur == 'p') { 3255 NEXT; 3256 /* paragraph */ 3257 type = XML_REGEXP_SEPAR_PARA; 3258 } else { 3259 /* all separators */ 3260 type = XML_REGEXP_SEPAR; 3261 } 3262 } else if (cur == 'S') { 3263 NEXT; 3264 cur = CUR; 3265 if (cur == 'm') { 3266 NEXT; 3267 type = XML_REGEXP_SYMBOL_MATH; 3268 /* math */ 3269 } else if (cur == 'c') { 3270 NEXT; 3271 type = XML_REGEXP_SYMBOL_CURRENCY; 3272 /* currency */ 3273 } else if (cur == 'k') { 3274 NEXT; 3275 type = XML_REGEXP_SYMBOL_MODIFIER; 3276 /* modifiers */ 3277 } else if (cur == 'o') { 3278 NEXT; 3279 type = XML_REGEXP_SYMBOL_OTHERS; 3280 /* other */ 3281 } else { 3282 /* all symbols */ 3283 type = XML_REGEXP_SYMBOL; 3284 } 3285 } else if (cur == 'C') { 3286 NEXT; 3287 cur = CUR; 3288 if (cur == 'c') { 3289 NEXT; 3290 /* control */ 3291 type = XML_REGEXP_OTHER_CONTROL; 3292 } else if (cur == 'f') { 3293 NEXT; 3294 /* format */ 3295 type = XML_REGEXP_OTHER_FORMAT; 3296 } else if (cur == 'o') { 3297 NEXT; 3298 /* private use */ 3299 type = XML_REGEXP_OTHER_PRIVATE; 3300 } else if (cur == 'n') { 3301 NEXT; 3302 /* not assigned */ 3303 type = XML_REGEXP_OTHER_NA; 3304 } else { 3305 /* all others */ 3306 type = XML_REGEXP_OTHER; 3307 } 3308 } else if (cur == 'I') { 3309 const xmlChar *start; 3310 NEXT; 3311 cur = CUR; 3312 if (cur != 's') { 3313 ERROR("IsXXXX expected"); 3314 return; 3315 } 3316 NEXT; 3317 start = ctxt->cur; 3318 cur = CUR; 3319 if (((cur >= 'a') && (cur <= 'z')) || 3320 ((cur >= 'A') && (cur <= 'Z')) || 3321 ((cur >= '0') && (cur <= '9')) || 3322 (cur == 0x2D)) { 3323 NEXT; 3324 cur = CUR; 3325 while (((cur >= 'a') && (cur <= 'z')) || 3326 ((cur >= 'A') && (cur <= 'Z')) || 3327 ((cur >= '0') && (cur <= '9')) || 3328 (cur == 0x2D)) { 3329 NEXT; 3330 cur = CUR; 3331 } 3332 } 3333 type = XML_REGEXP_BLOCK_NAME; 3334 blockName = xmlStrndup(start, ctxt->cur - start); 3335 } else { 3336 ERROR("Unknown char property"); 3337 return; 3338 } 3339 if (ctxt->atom == NULL) { 3340 ctxt->atom = xmlRegNewAtom(ctxt, type); 3341 if (ctxt->atom != NULL) 3342 ctxt->atom->valuep = blockName; 3343 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3344 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3345 type, 0, 0, blockName); 3346 } 3347} 3348 3349/** 3350 * xmlFAParseCharClassEsc: 3351 * @ctxt: a regexp parser context 3352 * 3353 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 3354 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 3355 * [25] catEsc ::= '\p{' charProp '}' 3356 * [26] complEsc ::= '\P{' charProp '}' 3357 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 3358 */ 3359static void 3360xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 3361 int cur; 3362 3363 if (CUR == '.') { 3364 if (ctxt->atom == NULL) { 3365 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 3366 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3367 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3368 XML_REGEXP_ANYCHAR, 0, 0, NULL); 3369 } 3370 NEXT; 3371 return; 3372 } 3373 if (CUR != '\\') { 3374 ERROR("Escaped sequence: expecting \\"); 3375 return; 3376 } 3377 NEXT; 3378 cur = CUR; 3379 if (cur == 'p') { 3380 NEXT; 3381 if (CUR != '{') { 3382 ERROR("Expecting '{'"); 3383 return; 3384 } 3385 NEXT; 3386 xmlFAParseCharProp(ctxt); 3387 if (CUR != '}') { 3388 ERROR("Expecting '}'"); 3389 return; 3390 } 3391 NEXT; 3392 } else if (cur == 'P') { 3393 NEXT; 3394 if (CUR != '{') { 3395 ERROR("Expecting '{'"); 3396 return; 3397 } 3398 NEXT; 3399 xmlFAParseCharProp(ctxt); 3400 ctxt->atom->neg = 1; 3401 if (CUR != '}') { 3402 ERROR("Expecting '}'"); 3403 return; 3404 } 3405 NEXT; 3406 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 3407 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 3408 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 3409 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 3410 (cur == 0x5E)) { 3411 if (ctxt->atom == NULL) { 3412 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3413 if (ctxt->atom != NULL) 3414 ctxt->atom->codepoint = cur; 3415 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3416 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3417 XML_REGEXP_CHARVAL, cur, cur, NULL); 3418 } 3419 NEXT; 3420 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 3421 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 3422 (cur == 'w') || (cur == 'W')) { 3423 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 3424 3425 switch (cur) { 3426 case 's': 3427 type = XML_REGEXP_ANYSPACE; 3428 break; 3429 case 'S': 3430 type = XML_REGEXP_NOTSPACE; 3431 break; 3432 case 'i': 3433 type = XML_REGEXP_INITNAME; 3434 break; 3435 case 'I': 3436 type = XML_REGEXP_NOTINITNAME; 3437 break; 3438 case 'c': 3439 type = XML_REGEXP_NAMECHAR; 3440 break; 3441 case 'C': 3442 type = XML_REGEXP_NOTNAMECHAR; 3443 break; 3444 case 'd': 3445 type = XML_REGEXP_DECIMAL; 3446 break; 3447 case 'D': 3448 type = XML_REGEXP_NOTDECIMAL; 3449 break; 3450 case 'w': 3451 type = XML_REGEXP_REALCHAR; 3452 break; 3453 case 'W': 3454 type = XML_REGEXP_NOTREALCHAR; 3455 break; 3456 } 3457 NEXT; 3458 if (ctxt->atom == NULL) { 3459 ctxt->atom = xmlRegNewAtom(ctxt, type); 3460 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3461 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3462 type, 0, 0, NULL); 3463 } 3464 } 3465} 3466 3467/** 3468 * xmlFAParseCharRef: 3469 * @ctxt: a regexp parser context 3470 * 3471 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 3472 */ 3473static int 3474xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 3475 int ret = 0, cur; 3476 3477 if ((CUR != '&') || (NXT(1) != '#')) 3478 return(-1); 3479 NEXT; 3480 NEXT; 3481 cur = CUR; 3482 if (cur == 'x') { 3483 NEXT; 3484 cur = CUR; 3485 if (((cur >= '0') && (cur <= '9')) || 3486 ((cur >= 'a') && (cur <= 'f')) || 3487 ((cur >= 'A') && (cur <= 'F'))) { 3488 while (((cur >= '0') && (cur <= '9')) || 3489 ((cur >= 'A') && (cur <= 'F'))) { 3490 if ((cur >= '0') && (cur <= '9')) 3491 ret = ret * 16 + cur - '0'; 3492 else if ((cur >= 'a') && (cur <= 'f')) 3493 ret = ret * 16 + 10 + (cur - 'a'); 3494 else 3495 ret = ret * 16 + 10 + (cur - 'A'); 3496 NEXT; 3497 cur = CUR; 3498 } 3499 } else { 3500 ERROR("Char ref: expecting [0-9A-F]"); 3501 return(-1); 3502 } 3503 } else { 3504 if ((cur >= '0') && (cur <= '9')) { 3505 while ((cur >= '0') && (cur <= '9')) { 3506 ret = ret * 10 + cur - '0'; 3507 NEXT; 3508 cur = CUR; 3509 } 3510 } else { 3511 ERROR("Char ref: expecting [0-9]"); 3512 return(-1); 3513 } 3514 } 3515 if (cur != ';') { 3516 ERROR("Char ref: expecting ';'"); 3517 return(-1); 3518 } else { 3519 NEXT; 3520 } 3521 return(ret); 3522} 3523 3524/** 3525 * xmlFAParseCharRange: 3526 * @ctxt: a regexp parser context 3527 * 3528 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 3529 * [18] seRange ::= charOrEsc '-' charOrEsc 3530 * [20] charOrEsc ::= XmlChar | SingleCharEsc 3531 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 3532 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 3533 */ 3534static void 3535xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 3536 int cur, len; 3537 int start = -1; 3538 int end = -1; 3539 3540 if ((CUR == '&') && (NXT(1) == '#')) { 3541 end = start = xmlFAParseCharRef(ctxt); 3542 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3543 XML_REGEXP_CHARVAL, start, end, NULL); 3544 return; 3545 } 3546 cur = CUR; 3547 if (cur == '\\') { 3548 NEXT; 3549 cur = CUR; 3550 switch (cur) { 3551 case 'n': start = 0xA; break; 3552 case 'r': start = 0xD; break; 3553 case 't': start = 0x9; break; 3554 case '\\': case '|': case '.': case '-': case '^': case '?': 3555 case '*': case '+': case '{': case '}': case '(': case ')': 3556 case '[': case ']': 3557 start = cur; break; 3558 default: 3559 ERROR("Invalid escape value"); 3560 return; 3561 } 3562 end = start; 3563 len = 1; 3564 } else if ((cur != 0x5B) && (cur != 0x5D)) { 3565 end = start = CUR_SCHAR(ctxt->cur, len); 3566 } else { 3567 ERROR("Expecting a char range"); 3568 return; 3569 } 3570 NEXTL(len); 3571 if (start == '-') { 3572 return; 3573 } 3574 cur = CUR; 3575 if (cur != '-') { 3576 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3577 XML_REGEXP_CHARVAL, start, end, NULL); 3578 return; 3579 } 3580 NEXT; 3581 cur = CUR; 3582 if (cur == '\\') { 3583 NEXT; 3584 cur = CUR; 3585 switch (cur) { 3586 case 'n': end = 0xA; break; 3587 case 'r': end = 0xD; break; 3588 case 't': end = 0x9; break; 3589 case '\\': case '|': case '.': case '-': case '^': case '?': 3590 case '*': case '+': case '{': case '}': case '(': case ')': 3591 case '[': case ']': 3592 end = cur; break; 3593 default: 3594 ERROR("Invalid escape value"); 3595 return; 3596 } 3597 len = 1; 3598 } else if ((cur != 0x5B) && (cur != 0x5D)) { 3599 end = CUR_SCHAR(ctxt->cur, len); 3600 } else { 3601 ERROR("Expecting the end of a char range"); 3602 return; 3603 } 3604 NEXTL(len); 3605 /* TODO check that the values are acceptable character ranges for XML */ 3606 if (end < start) { 3607 ERROR("End of range is before start of range"); 3608 } else { 3609 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3610 XML_REGEXP_CHARVAL, start, end, NULL); 3611 } 3612 return; 3613} 3614 3615/** 3616 * xmlFAParsePosCharGroup: 3617 * @ctxt: a regexp parser context 3618 * 3619 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 3620 */ 3621static void 3622xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 3623 do { 3624 if ((CUR == '\\') || (CUR == '.')) { 3625 xmlFAParseCharClassEsc(ctxt); 3626 } else { 3627 xmlFAParseCharRange(ctxt); 3628 } 3629 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 3630 (ctxt->error == 0)); 3631} 3632 3633/** 3634 * xmlFAParseCharGroup: 3635 * @ctxt: a regexp parser context 3636 * 3637 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 3638 * [15] negCharGroup ::= '^' posCharGroup 3639 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 3640 * [12] charClassExpr ::= '[' charGroup ']' 3641 */ 3642static void 3643xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 3644 int n = ctxt->neg; 3645 while ((CUR != ']') && (ctxt->error == 0)) { 3646 if (CUR == '^') { 3647 int neg = ctxt->neg; 3648 3649 NEXT; 3650 ctxt->neg = !ctxt->neg; 3651 xmlFAParsePosCharGroup(ctxt); 3652 ctxt->neg = neg; 3653 } else if (CUR == '-') { 3654 int neg = ctxt->neg; 3655 NEXT; 3656 ctxt->neg = 2; 3657 if (CUR != '[') { 3658 ERROR("charClassExpr: '[' expected"); 3659 break; 3660 } 3661 NEXT; 3662 xmlFAParseCharGroup(ctxt); 3663 if (CUR == ']') { 3664 NEXT; 3665 } else { 3666 ERROR("charClassExpr: ']' expected"); 3667 break; 3668 } 3669 ctxt->neg = neg; 3670 break; 3671 } else if (CUR != ']') { 3672 xmlFAParsePosCharGroup(ctxt); 3673 } 3674 } 3675 ctxt->neg = n; 3676} 3677 3678/** 3679 * xmlFAParseCharClass: 3680 * @ctxt: a regexp parser context 3681 * 3682 * [11] charClass ::= charClassEsc | charClassExpr 3683 * [12] charClassExpr ::= '[' charGroup ']' 3684 */ 3685static void 3686xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 3687 if (CUR == '[') { 3688 NEXT; 3689 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 3690 if (ctxt->atom == NULL) 3691 return; 3692 xmlFAParseCharGroup(ctxt); 3693 if (CUR == ']') { 3694 NEXT; 3695 } else { 3696 ERROR("xmlFAParseCharClass: ']' expected"); 3697 } 3698 } else { 3699 xmlFAParseCharClassEsc(ctxt); 3700 } 3701} 3702 3703/** 3704 * xmlFAParseQuantExact: 3705 * @ctxt: a regexp parser context 3706 * 3707 * [8] QuantExact ::= [0-9]+ 3708 * 3709 * Returns 0 if success or -1 in case of error 3710 */ 3711static int 3712xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 3713 int ret = 0; 3714 int ok = 0; 3715 3716 while ((CUR >= '0') && (CUR <= '9')) { 3717 ret = ret * 10 + (CUR - '0'); 3718 ok = 1; 3719 NEXT; 3720 } 3721 if (ok != 1) { 3722 return(-1); 3723 } 3724 return(ret); 3725} 3726 3727/** 3728 * xmlFAParseQuantifier: 3729 * @ctxt: a regexp parser context 3730 * 3731 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 3732 * [5] quantity ::= quantRange | quantMin | QuantExact 3733 * [6] quantRange ::= QuantExact ',' QuantExact 3734 * [7] quantMin ::= QuantExact ',' 3735 * [8] QuantExact ::= [0-9]+ 3736 */ 3737static int 3738xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 3739 int cur; 3740 3741 cur = CUR; 3742 if ((cur == '?') || (cur == '*') || (cur == '+')) { 3743 if (ctxt->atom != NULL) { 3744 if (cur == '?') 3745 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 3746 else if (cur == '*') 3747 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 3748 else if (cur == '+') 3749 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 3750 } 3751 NEXT; 3752 return(1); 3753 } 3754 if (cur == '{') { 3755 int min = 0, max = 0; 3756 3757 NEXT; 3758 cur = xmlFAParseQuantExact(ctxt); 3759 if (cur >= 0) 3760 min = cur; 3761 if (CUR == ',') { 3762 NEXT; 3763 if (CUR == '}') 3764 max = INT_MAX; 3765 else { 3766 cur = xmlFAParseQuantExact(ctxt); 3767 if (cur >= 0) 3768 max = cur; 3769 else { 3770 ERROR("Improper quantifier"); 3771 } 3772 } 3773 } 3774 if (CUR == '}') { 3775 NEXT; 3776 } else { 3777 ERROR("Unterminated quantifier"); 3778 } 3779 if (max == 0) 3780 max = min; 3781 if (ctxt->atom != NULL) { 3782 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 3783 ctxt->atom->min = min; 3784 ctxt->atom->max = max; 3785 } 3786 return(1); 3787 } 3788 return(0); 3789} 3790 3791/** 3792 * xmlFAParseAtom: 3793 * @ctxt: a regexp parser context 3794 * 3795 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 3796 */ 3797static int 3798xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 3799 int codepoint, len; 3800 3801 codepoint = xmlFAIsChar(ctxt); 3802 if (codepoint > 0) { 3803 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3804 if (ctxt->atom == NULL) 3805 return(-1); 3806 codepoint = CUR_SCHAR(ctxt->cur, len); 3807 ctxt->atom->codepoint = codepoint; 3808 NEXTL(len); 3809 return(1); 3810 } else if (CUR == '|') { 3811 return(0); 3812 } else if (CUR == 0) { 3813 return(0); 3814 } else if (CUR == ')') { 3815 return(0); 3816 } else if (CUR == '(') { 3817 xmlRegStatePtr start, oldend; 3818 3819 NEXT; 3820 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 3821 start = ctxt->state; 3822 oldend = ctxt->end; 3823 ctxt->end = NULL; 3824 ctxt->atom = NULL; 3825 xmlFAParseRegExp(ctxt, 0); 3826 if (CUR == ')') { 3827 NEXT; 3828 } else { 3829 ERROR("xmlFAParseAtom: expecting ')'"); 3830 } 3831 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 3832 if (ctxt->atom == NULL) 3833 return(-1); 3834 ctxt->atom->start = start; 3835 ctxt->atom->stop = ctxt->state; 3836 ctxt->end = oldend; 3837 return(1); 3838 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 3839 xmlFAParseCharClass(ctxt); 3840 return(1); 3841 } 3842 return(0); 3843} 3844 3845/** 3846 * xmlFAParsePiece: 3847 * @ctxt: a regexp parser context 3848 * 3849 * [3] piece ::= atom quantifier? 3850 */ 3851static int 3852xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 3853 int ret; 3854 3855 ctxt->atom = NULL; 3856 ret = xmlFAParseAtom(ctxt); 3857 if (ret == 0) 3858 return(0); 3859 if (ctxt->atom == NULL) { 3860 ERROR("internal: no atom generated"); 3861 } 3862 xmlFAParseQuantifier(ctxt); 3863 return(1); 3864} 3865 3866/** 3867 * xmlFAParseBranch: 3868 * @ctxt: a regexp parser context 3869 * @first: is taht the first 3870 * 3871 * [2] branch ::= piece* 3872 8 3873 */ 3874static int 3875xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) { 3876 xmlRegStatePtr previous; 3877 xmlRegAtomPtr prevatom = NULL; 3878 int ret; 3879 3880 previous = ctxt->state; 3881 ret = xmlFAParsePiece(ctxt); 3882 if (ret != 0) { 3883 if (first) { 3884 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0) 3885 return(-1); 3886 previous = ctxt->state; 3887 } else { 3888 prevatom = ctxt->atom; 3889 } 3890 ctxt->atom = NULL; 3891 } 3892 while ((ret != 0) && (ctxt->error == 0)) { 3893 ret = xmlFAParsePiece(ctxt); 3894 if (ret != 0) { 3895 if (first) { 3896 if (xmlFAGenerateTransitions(ctxt, previous, NULL, 3897 ctxt->atom) < 0) 3898 return(-1); 3899 } else { 3900 if (xmlFAGenerateTransitions(ctxt, previous, NULL, 3901 prevatom) < 0) 3902 return(-1); 3903 prevatom = ctxt->atom; 3904 } 3905 previous = ctxt->state; 3906 ctxt->atom = NULL; 3907 } 3908 } 3909 if (!first) { 3910 if (xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom) < 0) 3911 return(-1); 3912 } 3913 return(0); 3914} 3915 3916/** 3917 * xmlFAParseRegExp: 3918 * @ctxt: a regexp parser context 3919 * @top: is that the top-level expressions ? 3920 * 3921 * [1] regExp ::= branch ( '|' branch )* 3922 */ 3923static void 3924xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 3925 xmlRegStatePtr start, end, oldend; 3926 3927 oldend = ctxt->end; 3928 3929 start = ctxt->state; 3930 xmlFAParseBranch(ctxt, (ctxt->end == NULL)); 3931 if (CUR != '|') { 3932 ctxt->end = ctxt->state; 3933 return; 3934 } 3935 end = ctxt->state; 3936 while ((CUR == '|') && (ctxt->error == 0)) { 3937 NEXT; 3938 ctxt->state = start; 3939 ctxt->end = end; 3940 xmlFAParseBranch(ctxt, 0); 3941 } 3942 if (!top) 3943 ctxt->end = oldend; 3944} 3945 3946/************************************************************************ 3947 * * 3948 * The basic API * 3949 * * 3950 ************************************************************************/ 3951 3952/** 3953 * xmlRegexpPrint: 3954 * @output: the file for the output debug 3955 * @regexp: the compiled regexp 3956 * 3957 * Print the content of the compiled regular expression 3958 */ 3959void 3960xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 3961 int i; 3962 3963 fprintf(output, " regexp: "); 3964 if (regexp == NULL) { 3965 fprintf(output, "NULL\n"); 3966 return; 3967 } 3968 fprintf(output, "'%s' ", regexp->string); 3969 fprintf(output, "\n"); 3970 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 3971 for (i = 0;i < regexp->nbAtoms; i++) { 3972 fprintf(output, " %02d ", i); 3973 xmlRegPrintAtom(output, regexp->atoms[i]); 3974 } 3975 fprintf(output, "%d states:", regexp->nbStates); 3976 fprintf(output, "\n"); 3977 for (i = 0;i < regexp->nbStates; i++) { 3978 xmlRegPrintState(output, regexp->states[i]); 3979 } 3980 fprintf(output, "%d counters:\n", regexp->nbCounters); 3981 for (i = 0;i < regexp->nbCounters; i++) { 3982 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 3983 regexp->counters[i].max); 3984 } 3985} 3986 3987/** 3988 * xmlRegexpCompile: 3989 * @regexp: a regular expression string 3990 * 3991 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 3992 * Appendix F and build an automata suitable for testing strings against 3993 * that regular expression 3994 * 3995 * Returns the compiled expression or NULL in case of error 3996 */ 3997xmlRegexpPtr 3998xmlRegexpCompile(const xmlChar *regexp) { 3999 xmlRegexpPtr ret; 4000 xmlRegParserCtxtPtr ctxt; 4001 4002 ctxt = xmlRegNewParserCtxt(regexp); 4003 if (ctxt == NULL) 4004 return(NULL); 4005 4006 /* initialize the parser */ 4007 ctxt->end = NULL; 4008 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 4009 xmlRegStatePush(ctxt, ctxt->start); 4010 4011 /* parse the expression building an automata */ 4012 xmlFAParseRegExp(ctxt, 1); 4013 if (CUR != 0) { 4014 ERROR("xmlFAParseRegExp: extra characters"); 4015 } 4016 ctxt->end = ctxt->state; 4017 ctxt->start->type = XML_REGEXP_START_STATE; 4018 ctxt->end->type = XML_REGEXP_FINAL_STATE; 4019 4020 /* remove the Epsilon except for counted transitions */ 4021 xmlFAEliminateEpsilonTransitions(ctxt); 4022 4023 4024 if (ctxt->error != 0) { 4025 xmlRegFreeParserCtxt(ctxt); 4026 return(NULL); 4027 } 4028 ret = xmlRegEpxFromParse(ctxt); 4029 xmlRegFreeParserCtxt(ctxt); 4030 return(ret); 4031} 4032 4033/** 4034 * xmlRegexpExec: 4035 * @comp: the compiled regular expression 4036 * @content: the value to check against the regular expression 4037 * 4038 * Check if the regular expression generate the value 4039 * 4040 * Returns 1 if it matches, 0 if not and a negativa value in case of error 4041 */ 4042int 4043xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 4044 if ((comp == NULL) || (content == NULL)) 4045 return(-1); 4046 return(xmlFARegExec(comp, content)); 4047} 4048 4049/** 4050 * xmlRegexpIsDeterminist: 4051 * @comp: the compiled regular expression 4052 * 4053 * Check if the regular expression is determinist 4054 * 4055 * Returns 1 if it yes, 0 if not and a negativa value in case of error 4056 */ 4057int 4058xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 4059 xmlAutomataPtr am; 4060 int ret; 4061 4062 if (comp == NULL) 4063 return(-1); 4064 if (comp->determinist != -1) 4065 return(comp->determinist); 4066 4067 am = xmlNewAutomata(); 4068 if (am->states != NULL) { 4069 int i; 4070 4071 for (i = 0;i < am->nbStates;i++) 4072 xmlRegFreeState(am->states[i]); 4073 xmlFree(am->states); 4074 } 4075 am->nbAtoms = comp->nbAtoms; 4076 am->atoms = comp->atoms; 4077 am->nbStates = comp->nbStates; 4078 am->states = comp->states; 4079 am->determinist = -1; 4080 ret = xmlFAComputesDeterminism(am); 4081 am->atoms = NULL; 4082 am->states = NULL; 4083 xmlFreeAutomata(am); 4084 return(ret); 4085} 4086 4087/** 4088 * xmlRegFreeRegexp: 4089 * @regexp: the regexp 4090 * 4091 * Free a regexp 4092 */ 4093void 4094xmlRegFreeRegexp(xmlRegexpPtr regexp) { 4095 int i; 4096 if (regexp == NULL) 4097 return; 4098 4099 if (regexp->string != NULL) 4100 xmlFree(regexp->string); 4101 if (regexp->states != NULL) { 4102 for (i = 0;i < regexp->nbStates;i++) 4103 xmlRegFreeState(regexp->states[i]); 4104 xmlFree(regexp->states); 4105 } 4106 if (regexp->atoms != NULL) { 4107 for (i = 0;i < regexp->nbAtoms;i++) 4108 xmlRegFreeAtom(regexp->atoms[i]); 4109 xmlFree(regexp->atoms); 4110 } 4111 if (regexp->counters != NULL) 4112 xmlFree(regexp->counters); 4113 if (regexp->compact != NULL) 4114 xmlFree(regexp->compact); 4115 if (regexp->transdata != NULL) 4116 xmlFree(regexp->transdata); 4117 if (regexp->stringMap != NULL) { 4118 for (i = 0; i < regexp->nbstrings;i++) 4119 xmlFree(regexp->stringMap[i]); 4120 xmlFree(regexp->stringMap); 4121 } 4122 4123 xmlFree(regexp); 4124} 4125 4126#ifdef LIBXML_AUTOMATA_ENABLED 4127/************************************************************************ 4128 * * 4129 * The Automata interface * 4130 * * 4131 ************************************************************************/ 4132 4133/** 4134 * xmlNewAutomata: 4135 * 4136 * Create a new automata 4137 * 4138 * Returns the new object or NULL in case of failure 4139 */ 4140xmlAutomataPtr 4141xmlNewAutomata(void) { 4142 xmlAutomataPtr ctxt; 4143 4144 ctxt = xmlRegNewParserCtxt(NULL); 4145 if (ctxt == NULL) 4146 return(NULL); 4147 4148 /* initialize the parser */ 4149 ctxt->end = NULL; 4150 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 4151 if (ctxt->start == NULL) { 4152 xmlFreeAutomata(ctxt); 4153 return(NULL); 4154 } 4155 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 4156 xmlRegFreeState(ctxt->start); 4157 xmlFreeAutomata(ctxt); 4158 return(NULL); 4159 } 4160 4161 return(ctxt); 4162} 4163 4164/** 4165 * xmlFreeAutomata: 4166 * @am: an automata 4167 * 4168 * Free an automata 4169 */ 4170void 4171xmlFreeAutomata(xmlAutomataPtr am) { 4172 if (am == NULL) 4173 return; 4174 xmlRegFreeParserCtxt(am); 4175} 4176 4177/** 4178 * xmlAutomataGetInitState: 4179 * @am: an automata 4180 * 4181 * Initial state lookup 4182 * 4183 * Returns the initial state of the automata 4184 */ 4185xmlAutomataStatePtr 4186xmlAutomataGetInitState(xmlAutomataPtr am) { 4187 if (am == NULL) 4188 return(NULL); 4189 return(am->start); 4190} 4191 4192/** 4193 * xmlAutomataSetFinalState: 4194 * @am: an automata 4195 * @state: a state in this automata 4196 * 4197 * Makes that state a final state 4198 * 4199 * Returns 0 or -1 in case of error 4200 */ 4201int 4202xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 4203 if ((am == NULL) || (state == NULL)) 4204 return(-1); 4205 state->type = XML_REGEXP_FINAL_STATE; 4206 return(0); 4207} 4208 4209/** 4210 * xmlAutomataNewTransition: 4211 * @am: an automata 4212 * @from: the starting point of the transition 4213 * @to: the target point of the transition or NULL 4214 * @token: the input string associated to that transition 4215 * @data: data passed to the callback function if the transition is activated 4216 * 4217 * If @to is NULL, this create first a new target state in the automata 4218 * and then adds a transition from the @from state to the target state 4219 * activated by the value of @token 4220 * 4221 * Returns the target state or NULL in case of error 4222 */ 4223xmlAutomataStatePtr 4224xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 4225 xmlAutomataStatePtr to, const xmlChar *token, 4226 void *data) { 4227 xmlRegAtomPtr atom; 4228 4229 if ((am == NULL) || (from == NULL) || (token == NULL)) 4230 return(NULL); 4231 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4232 if (atom == NULL) 4233 return(NULL); 4234 atom->data = data; 4235 if (atom == NULL) 4236 return(NULL); 4237 atom->valuep = xmlStrdup(token); 4238 4239 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 4240 xmlRegFreeAtom(atom); 4241 return(NULL); 4242 } 4243 if (to == NULL) 4244 return(am->state); 4245 return(to); 4246} 4247 4248/** 4249 * xmlAutomataNewTransition2: 4250 * @am: an automata 4251 * @from: the starting point of the transition 4252 * @to: the target point of the transition or NULL 4253 * @token: the first input string associated to that transition 4254 * @token2: the second input string associated to that transition 4255 * @data: data passed to the callback function if the transition is activated 4256 * 4257 * If @to is NULL, this create first a new target state in the automata 4258 * and then adds a transition from the @from state to the target state 4259 * activated by the value of @token 4260 * 4261 * Returns the target state or NULL in case of error 4262 */ 4263xmlAutomataStatePtr 4264xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 4265 xmlAutomataStatePtr to, const xmlChar *token, 4266 const xmlChar *token2, void *data) { 4267 xmlRegAtomPtr atom; 4268 4269 if ((am == NULL) || (from == NULL) || (token == NULL)) 4270 return(NULL); 4271 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4272 atom->data = data; 4273 if (atom == NULL) 4274 return(NULL); 4275 if ((token2 == NULL) || (*token2 == 0)) { 4276 atom->valuep = xmlStrdup(token); 4277 } else { 4278 int lenn, lenp; 4279 xmlChar *str; 4280 4281 lenn = strlen((char *) token2); 4282 lenp = strlen((char *) token); 4283 4284 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4285 if (str == NULL) { 4286 xmlRegFreeAtom(atom); 4287 return(NULL); 4288 } 4289 memcpy(&str[0], token, lenp); 4290 str[lenp] = '|'; 4291 memcpy(&str[lenp + 1], token2, lenn); 4292 str[lenn + lenp + 1] = 0; 4293 4294 atom->valuep = str; 4295 } 4296 4297 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 4298 xmlRegFreeAtom(atom); 4299 return(NULL); 4300 } 4301 if (to == NULL) 4302 return(am->state); 4303 return(to); 4304} 4305 4306/** 4307 * xmlAutomataNewCountTrans: 4308 * @am: an automata 4309 * @from: the starting point of the transition 4310 * @to: the target point of the transition or NULL 4311 * @token: the input string associated to that transition 4312 * @min: the minimum successive occurences of token 4313 * @max: the maximum successive occurences of token 4314 * @data: data associated to the transition 4315 * 4316 * If @to is NULL, this create first a new target state in the automata 4317 * and then adds a transition from the @from state to the target state 4318 * activated by a succession of input of value @token and whose number 4319 * is between @min and @max 4320 * 4321 * Returns the target state or NULL in case of error 4322 */ 4323xmlAutomataStatePtr 4324xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4325 xmlAutomataStatePtr to, const xmlChar *token, 4326 int min, int max, void *data) { 4327 xmlRegAtomPtr atom; 4328 int counter; 4329 4330 if ((am == NULL) || (from == NULL) || (token == NULL)) 4331 return(NULL); 4332 if (min < 0) 4333 return(NULL); 4334 if ((max < min) || (max < 1)) 4335 return(NULL); 4336 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4337 if (atom == NULL) 4338 return(NULL); 4339 atom->valuep = xmlStrdup(token); 4340 atom->data = data; 4341 if (min == 0) 4342 atom->min = 1; 4343 else 4344 atom->min = min; 4345 atom->max = max; 4346 4347 /* 4348 * associate a counter to the transition. 4349 */ 4350 counter = xmlRegGetCounter(am); 4351 am->counters[counter].min = min; 4352 am->counters[counter].max = max; 4353 4354 /* xmlFAGenerateTransitions(am, from, to, atom); */ 4355 if (to == NULL) { 4356 to = xmlRegNewState(am); 4357 xmlRegStatePush(am, to); 4358 } 4359 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 4360 xmlRegAtomPush(am, atom); 4361 am->state = to; 4362 4363 if (to == NULL) 4364 to = am->state; 4365 if (to == NULL) 4366 return(NULL); 4367 if (min == 0) 4368 xmlFAGenerateEpsilonTransition(am, from, to); 4369 return(to); 4370} 4371 4372/** 4373 * xmlAutomataNewOnceTrans: 4374 * @am: an automata 4375 * @from: the starting point of the transition 4376 * @to: the target point of the transition or NULL 4377 * @token: the input string associated to that transition 4378 * @min: the minimum successive occurences of token 4379 * @max: the maximum successive occurences of token 4380 * @data: data associated to the transition 4381 * 4382 * If @to is NULL, this create first a new target state in the automata 4383 * and then adds a transition from the @from state to the target state 4384 * activated by a succession of input of value @token and whose number 4385 * is between @min and @max, moreover that transistion can only be crossed 4386 * once. 4387 * 4388 * Returns the target state or NULL in case of error 4389 */ 4390xmlAutomataStatePtr 4391xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4392 xmlAutomataStatePtr to, const xmlChar *token, 4393 int min, int max, void *data) { 4394 xmlRegAtomPtr atom; 4395 int counter; 4396 4397 if ((am == NULL) || (from == NULL) || (token == NULL)) 4398 return(NULL); 4399 if (min < 1) 4400 return(NULL); 4401 if ((max < min) || (max < 1)) 4402 return(NULL); 4403 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4404 if (atom == NULL) 4405 return(NULL); 4406 atom->valuep = xmlStrdup(token); 4407 atom->data = data; 4408 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 4409 if (min == 0) 4410 atom->min = 1; 4411 else 4412 atom->min = min; 4413 atom->max = max; 4414 /* 4415 * associate a counter to the transition. 4416 */ 4417 counter = xmlRegGetCounter(am); 4418 am->counters[counter].min = 1; 4419 am->counters[counter].max = 1; 4420 4421 /* xmlFAGenerateTransitions(am, from, to, atom); */ 4422 if (to == NULL) { 4423 to = xmlRegNewState(am); 4424 xmlRegStatePush(am, to); 4425 } 4426 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 4427 xmlRegAtomPush(am, atom); 4428 am->state = to; 4429 if (to == NULL) 4430 to = am->state; 4431 if (to == NULL) 4432 return(NULL); 4433 return(to); 4434} 4435 4436/** 4437 * xmlAutomataNewState: 4438 * @am: an automata 4439 * 4440 * Create a new disconnected state in the automata 4441 * 4442 * Returns the new state or NULL in case of error 4443 */ 4444xmlAutomataStatePtr 4445xmlAutomataNewState(xmlAutomataPtr am) { 4446 xmlAutomataStatePtr to; 4447 4448 if (am == NULL) 4449 return(NULL); 4450 to = xmlRegNewState(am); 4451 xmlRegStatePush(am, to); 4452 return(to); 4453} 4454 4455/** 4456 * xmlAutomataNewEpsilon: 4457 * @am: an automata 4458 * @from: the starting point of the transition 4459 * @to: the target point of the transition or NULL 4460 * 4461 * If @to is NULL, this create first a new target state in the automata 4462 * and then adds a an epsilon transition from the @from state to the 4463 * target state 4464 * 4465 * Returns the target state or NULL in case of error 4466 */ 4467xmlAutomataStatePtr 4468xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 4469 xmlAutomataStatePtr to) { 4470 if ((am == NULL) || (from == NULL)) 4471 return(NULL); 4472 xmlFAGenerateEpsilonTransition(am, from, to); 4473 if (to == NULL) 4474 return(am->state); 4475 return(to); 4476} 4477 4478/** 4479 * xmlAutomataNewAllTrans: 4480 * @am: an automata 4481 * @from: the starting point of the transition 4482 * @to: the target point of the transition or NULL 4483 * @lax: allow to transition if not all all transitions have been activated 4484 * 4485 * If @to is NULL, this create first a new target state in the automata 4486 * and then adds a an ALL transition from the @from state to the 4487 * target state. That transition is an epsilon transition allowed only when 4488 * all transitions from the @from node have been activated. 4489 * 4490 * Returns the target state or NULL in case of error 4491 */ 4492xmlAutomataStatePtr 4493xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4494 xmlAutomataStatePtr to, int lax) { 4495 if ((am == NULL) || (from == NULL)) 4496 return(NULL); 4497 xmlFAGenerateAllTransition(am, from, to, lax); 4498 if (to == NULL) 4499 return(am->state); 4500 return(to); 4501} 4502 4503/** 4504 * xmlAutomataNewCounter: 4505 * @am: an automata 4506 * @min: the minimal value on the counter 4507 * @max: the maximal value on the counter 4508 * 4509 * Create a new counter 4510 * 4511 * Returns the counter number or -1 in case of error 4512 */ 4513int 4514xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 4515 int ret; 4516 4517 if (am == NULL) 4518 return(-1); 4519 4520 ret = xmlRegGetCounter(am); 4521 if (ret < 0) 4522 return(-1); 4523 am->counters[ret].min = min; 4524 am->counters[ret].max = max; 4525 return(ret); 4526} 4527 4528/** 4529 * xmlAutomataNewCountedTrans: 4530 * @am: an automata 4531 * @from: the starting point of the transition 4532 * @to: the target point of the transition or NULL 4533 * @counter: the counter associated to that transition 4534 * 4535 * If @to is NULL, this create first a new target state in the automata 4536 * and then adds an epsilon transition from the @from state to the target state 4537 * which will increment the counter provided 4538 * 4539 * Returns the target state or NULL in case of error 4540 */ 4541xmlAutomataStatePtr 4542xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4543 xmlAutomataStatePtr to, int counter) { 4544 if ((am == NULL) || (from == NULL) || (counter < 0)) 4545 return(NULL); 4546 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 4547 if (to == NULL) 4548 return(am->state); 4549 return(to); 4550} 4551 4552/** 4553 * xmlAutomataNewCounterTrans: 4554 * @am: an automata 4555 * @from: the starting point of the transition 4556 * @to: the target point of the transition or NULL 4557 * @counter: the counter associated to that transition 4558 * 4559 * If @to is NULL, this create first a new target state in the automata 4560 * and then adds an epsilon transition from the @from state to the target state 4561 * which will be allowed only if the counter is within the right range. 4562 * 4563 * Returns the target state or NULL in case of error 4564 */ 4565xmlAutomataStatePtr 4566xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4567 xmlAutomataStatePtr to, int counter) { 4568 if ((am == NULL) || (from == NULL) || (counter < 0)) 4569 return(NULL); 4570 xmlFAGenerateCountedTransition(am, from, to, counter); 4571 if (to == NULL) 4572 return(am->state); 4573 return(to); 4574} 4575 4576/** 4577 * xmlAutomataCompile: 4578 * @am: an automata 4579 * 4580 * Compile the automata into a Reg Exp ready for being executed. 4581 * The automata should be free after this point. 4582 * 4583 * Returns the compiled regexp or NULL in case of error 4584 */ 4585xmlRegexpPtr 4586xmlAutomataCompile(xmlAutomataPtr am) { 4587 xmlRegexpPtr ret; 4588 4589 if ((am == NULL) || (am->error != 0)) return(NULL); 4590 xmlFAEliminateEpsilonTransitions(am); 4591 /* xmlFAComputesDeterminism(am); */ 4592 ret = xmlRegEpxFromParse(am); 4593 4594 return(ret); 4595} 4596 4597/** 4598 * xmlAutomataIsDeterminist: 4599 * @am: an automata 4600 * 4601 * Checks if an automata is determinist. 4602 * 4603 * Returns 1 if true, 0 if not, and -1 in case of error 4604 */ 4605int 4606xmlAutomataIsDeterminist(xmlAutomataPtr am) { 4607 int ret; 4608 4609 if (am == NULL) 4610 return(-1); 4611 4612 ret = xmlFAComputesDeterminism(am); 4613 return(ret); 4614} 4615#endif /* LIBXML_AUTOMATA_ENABLED */ 4616#endif /* LIBXML_REGEXP_ENABLED */ 4617