xmlregexp.c revision 52b48c7a7bfb338f434d39f9fc3e54768e301575
1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications thise includes: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22#include <stdio.h> 23#include <string.h> 24#include <libxml/tree.h> 25#include <libxml/parserInternals.h> 26#include <libxml/xmlregexp.h> 27#include <libxml/xmlautomata.h> 28#include <libxml/xmlunicode.h> 29 30/* #define DEBUG_REGEXP_GRAPH */ 31/* #define DEBUG_REGEXP_EXEC */ 32/* #define DEBUG_PUSH */ 33/* #define DEBUG_COMPACTION */ 34 35#define ERROR(str) ctxt->error = 1; \ 36 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur) 37#define NEXT ctxt->cur++ 38#define CUR (*(ctxt->cur)) 39#define NXT(index) (ctxt->cur[index]) 40 41#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 42#define NEXTL(l) ctxt->cur += l; 43 44/** 45 * TODO: 46 * 47 * macro to flag unimplemented blocks 48 */ 49#define TODO \ 50 xmlGenericError(xmlGenericErrorContext, \ 51 "Unimplemented block at %s:%d\n", \ 52 __FILE__, __LINE__); 53 54 55/************************************************************************ 56 * * 57 * Datatypes and structures * 58 * * 59 ************************************************************************/ 60 61typedef enum { 62 XML_REGEXP_EPSILON = 1, 63 XML_REGEXP_CHARVAL, 64 XML_REGEXP_RANGES, 65 XML_REGEXP_SUBREG, 66 XML_REGEXP_STRING, 67 XML_REGEXP_ANYCHAR, /* . */ 68 XML_REGEXP_ANYSPACE, /* \s */ 69 XML_REGEXP_NOTSPACE, /* \S */ 70 XML_REGEXP_INITNAME, /* \l */ 71 XML_REGEXP_NOTINITNAME, /* \l */ 72 XML_REGEXP_NAMECHAR, /* \c */ 73 XML_REGEXP_NOTNAMECHAR, /* \C */ 74 XML_REGEXP_DECIMAL, /* \d */ 75 XML_REGEXP_NOTDECIMAL, /* \d */ 76 XML_REGEXP_REALCHAR, /* \w */ 77 XML_REGEXP_NOTREALCHAR, /* \w */ 78 XML_REGEXP_LETTER, 79 XML_REGEXP_LETTER_UPPERCASE, 80 XML_REGEXP_LETTER_LOWERCASE, 81 XML_REGEXP_LETTER_TITLECASE, 82 XML_REGEXP_LETTER_MODIFIER, 83 XML_REGEXP_LETTER_OTHERS, 84 XML_REGEXP_MARK, 85 XML_REGEXP_MARK_NONSPACING, 86 XML_REGEXP_MARK_SPACECOMBINING, 87 XML_REGEXP_MARK_ENCLOSING, 88 XML_REGEXP_NUMBER, 89 XML_REGEXP_NUMBER_DECIMAL, 90 XML_REGEXP_NUMBER_LETTER, 91 XML_REGEXP_NUMBER_OTHERS, 92 XML_REGEXP_PUNCT, 93 XML_REGEXP_PUNCT_CONNECTOR, 94 XML_REGEXP_PUNCT_DASH, 95 XML_REGEXP_PUNCT_OPEN, 96 XML_REGEXP_PUNCT_CLOSE, 97 XML_REGEXP_PUNCT_INITQUOTE, 98 XML_REGEXP_PUNCT_FINQUOTE, 99 XML_REGEXP_PUNCT_OTHERS, 100 XML_REGEXP_SEPAR, 101 XML_REGEXP_SEPAR_SPACE, 102 XML_REGEXP_SEPAR_LINE, 103 XML_REGEXP_SEPAR_PARA, 104 XML_REGEXP_SYMBOL, 105 XML_REGEXP_SYMBOL_MATH, 106 XML_REGEXP_SYMBOL_CURRENCY, 107 XML_REGEXP_SYMBOL_MODIFIER, 108 XML_REGEXP_SYMBOL_OTHERS, 109 XML_REGEXP_OTHER, 110 XML_REGEXP_OTHER_CONTROL, 111 XML_REGEXP_OTHER_FORMAT, 112 XML_REGEXP_OTHER_PRIVATE, 113 XML_REGEXP_OTHER_NA, 114 XML_REGEXP_BLOCK_NAME 115} xmlRegAtomType; 116 117typedef enum { 118 XML_REGEXP_QUANT_EPSILON = 1, 119 XML_REGEXP_QUANT_ONCE, 120 XML_REGEXP_QUANT_OPT, 121 XML_REGEXP_QUANT_MULT, 122 XML_REGEXP_QUANT_PLUS, 123 XML_REGEXP_QUANT_ONCEONLY, 124 XML_REGEXP_QUANT_ALL, 125 XML_REGEXP_QUANT_RANGE 126} xmlRegQuantType; 127 128typedef enum { 129 XML_REGEXP_START_STATE = 1, 130 XML_REGEXP_FINAL_STATE, 131 XML_REGEXP_TRANS_STATE 132} xmlRegStateType; 133 134typedef enum { 135 XML_REGEXP_MARK_NORMAL = 0, 136 XML_REGEXP_MARK_START, 137 XML_REGEXP_MARK_VISITED 138} xmlRegMarkedType; 139 140typedef struct _xmlRegRange xmlRegRange; 141typedef xmlRegRange *xmlRegRangePtr; 142 143struct _xmlRegRange { 144 int neg; 145 xmlRegAtomType type; 146 int start; 147 int end; 148 xmlChar *blockName; 149}; 150 151typedef struct _xmlRegAtom xmlRegAtom; 152typedef xmlRegAtom *xmlRegAtomPtr; 153 154typedef struct _xmlAutomataState xmlRegState; 155typedef xmlRegState *xmlRegStatePtr; 156 157struct _xmlRegAtom { 158 int no; 159 xmlRegAtomType type; 160 xmlRegQuantType quant; 161 int min; 162 int max; 163 164 void *valuep; 165 void *valuep2; 166 int neg; 167 int codepoint; 168 xmlRegStatePtr start; 169 xmlRegStatePtr stop; 170 int maxRanges; 171 int nbRanges; 172 xmlRegRangePtr *ranges; 173 void *data; 174}; 175 176typedef struct _xmlRegCounter xmlRegCounter; 177typedef xmlRegCounter *xmlRegCounterPtr; 178 179struct _xmlRegCounter { 180 int min; 181 int max; 182}; 183 184typedef struct _xmlRegTrans xmlRegTrans; 185typedef xmlRegTrans *xmlRegTransPtr; 186 187struct _xmlRegTrans { 188 xmlRegAtomPtr atom; 189 int to; 190 int counter; 191 int count; 192}; 193 194struct _xmlAutomataState { 195 xmlRegStateType type; 196 xmlRegMarkedType mark; 197 xmlRegMarkedType reached; 198 int no; 199 200 int maxTrans; 201 int nbTrans; 202 xmlRegTrans *trans; 203}; 204 205typedef struct _xmlAutomata xmlRegParserCtxt; 206typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 207 208struct _xmlAutomata { 209 xmlChar *string; 210 xmlChar *cur; 211 212 int error; 213 int neg; 214 215 xmlRegStatePtr start; 216 xmlRegStatePtr end; 217 xmlRegStatePtr state; 218 219 xmlRegAtomPtr atom; 220 221 int maxAtoms; 222 int nbAtoms; 223 xmlRegAtomPtr *atoms; 224 225 int maxStates; 226 int nbStates; 227 xmlRegStatePtr *states; 228 229 int maxCounters; 230 int nbCounters; 231 xmlRegCounter *counters; 232 233 int determinist; 234}; 235 236struct _xmlRegexp { 237 xmlChar *string; 238 int nbStates; 239 xmlRegStatePtr *states; 240 int nbAtoms; 241 xmlRegAtomPtr *atoms; 242 int nbCounters; 243 xmlRegCounter *counters; 244 int determinist; 245 /* 246 * That's the compact form for determinists automatas 247 */ 248 int nbstates; 249 int *compact; 250 void **transdata; 251 int nbstrings; 252 xmlChar **stringMap; 253}; 254 255typedef struct _xmlRegExecRollback xmlRegExecRollback; 256typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 257 258struct _xmlRegExecRollback { 259 xmlRegStatePtr state;/* the current state */ 260 int index; /* the index in the input stack */ 261 int nextbranch; /* the next transition to explore in that state */ 262 int *counts; /* save the automate state if it has some */ 263}; 264 265typedef struct _xmlRegInputToken xmlRegInputToken; 266typedef xmlRegInputToken *xmlRegInputTokenPtr; 267 268struct _xmlRegInputToken { 269 xmlChar *value; 270 void *data; 271}; 272 273struct _xmlRegExecCtxt { 274 int status; /* execution status != 0 indicate an error */ 275 int determinist; /* did we found an inderterministic behaviour */ 276 xmlRegexpPtr comp; /* the compiled regexp */ 277 xmlRegExecCallbacks callback; 278 void *data; 279 280 xmlRegStatePtr state;/* the current state */ 281 int transno; /* the current transition on that state */ 282 int transcount; /* the number of char in char counted transitions */ 283 284 /* 285 * A stack of rollback states 286 */ 287 int maxRollbacks; 288 int nbRollbacks; 289 xmlRegExecRollback *rollbacks; 290 291 /* 292 * The state of the automata if any 293 */ 294 int *counts; 295 296 /* 297 * The input stack 298 */ 299 int inputStackMax; 300 int inputStackNr; 301 int index; 302 int *charStack; 303 const xmlChar *inputString; /* when operating on characters */ 304 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 305 306}; 307 308#define REGEXP_ALL_COUNTER 0x123456 309#define REGEXP_ALL_LAX_COUNTER 0x123457 310 311static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 312static void xmlRegFreeState(xmlRegStatePtr state); 313static void xmlRegFreeAtom(xmlRegAtomPtr atom); 314 315/************************************************************************ 316 * * 317 * Allocation/Deallocation * 318 * * 319 ************************************************************************/ 320 321static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 322/** 323 * xmlRegEpxFromParse: 324 * @ctxt: the parser context used to build it 325 * 326 * Allocate a new regexp and fill it with the reult from the parser 327 * 328 * Returns the new regexp or NULL in case of error 329 */ 330static xmlRegexpPtr 331xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 332 xmlRegexpPtr ret; 333 334 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 335 if (ret == NULL) 336 return(NULL); 337 memset(ret, 0, sizeof(xmlRegexp)); 338 ret->string = ctxt->string; 339 ctxt->string = NULL; 340 ret->nbStates = ctxt->nbStates; 341 ctxt->nbStates = 0; 342 ret->states = ctxt->states; 343 ctxt->states = NULL; 344 ret->nbAtoms = ctxt->nbAtoms; 345 ctxt->nbAtoms = 0; 346 ret->atoms = ctxt->atoms; 347 ctxt->atoms = NULL; 348 ret->nbCounters = ctxt->nbCounters; 349 ctxt->nbCounters = 0; 350 ret->counters = ctxt->counters; 351 ctxt->counters = NULL; 352 ret->determinist = ctxt->determinist; 353 354 if ((ret->determinist != 0) && 355 (ret->nbCounters == 0) && 356 (ret->atoms != NULL) && 357 (ret->atoms[0] != NULL) && 358 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 359 int i, j, nbstates = 0, nbatoms = 0; 360 int *stateRemap; 361 int *stringRemap; 362 int *transitions; 363 void **transdata; 364 xmlChar **stringMap; 365 xmlChar *value; 366 367 /* 368 * Switch to a compact representation 369 * 1/ counting the effective number of states left 370 * 2/ conting the unique number of atoms, and check that 371 * they are all of the string type 372 * 3/ build a table state x atom for the transitions 373 */ 374 375 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 376 for (i = 0;i < ret->nbStates;i++) { 377 if (ret->states[i] != NULL) { 378 stateRemap[i] = nbstates; 379 nbstates++; 380 } else { 381 stateRemap[i] = -1; 382 } 383 } 384#ifdef DEBUG_COMPACTION 385 printf("Final: %d states\n", nbstates); 386#endif 387 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 388 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 389 for (i = 0;i < ret->nbAtoms;i++) { 390 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 391 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 392 value = ret->atoms[i]->valuep; 393 for (j = 0;j < nbatoms;j++) { 394 if (xmlStrEqual(stringMap[j], value)) { 395 stringRemap[i] = j; 396 break; 397 } 398 } 399 if (j >= nbatoms) { 400 stringRemap[i] = nbatoms; 401 stringMap[nbatoms] = xmlStrdup(value); 402 nbatoms++; 403 } 404 } else { 405 xmlFree(stateRemap); 406 xmlFree(stringRemap); 407 for (i = 0;i < nbatoms;i++) 408 xmlFree(stringMap[i]); 409 xmlFree(stringMap); 410 goto fail_compact; 411 } 412 } 413#ifdef DEBUG_COMPACTION 414 printf("Final: %d atoms\n", nbatoms); 415#endif 416 417 /* 418 * Allocate the transition table. The first entry for each 419 * state correspond to the state type. 420 */ 421 transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int)); 422 transdata = NULL; 423 memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int)); 424 425 for (i = 0;i < ret->nbStates;i++) { 426 int stateno, atomno, targetno, prev; 427 xmlRegStatePtr state; 428 xmlRegTransPtr trans; 429 430 stateno = stateRemap[i]; 431 if (stateno == -1) 432 continue; 433 state = ret->states[i]; 434 435 transitions[stateno * (nbatoms + 1)] = state->type; 436 437 for (j = 0;j < state->nbTrans;j++) { 438 trans = &(state->trans[j]); 439 if ((trans->to == -1) || (trans->atom == NULL)) 440 continue; 441 atomno = stringRemap[trans->atom->no]; 442 if ((trans->atom->data != NULL) && (transdata == NULL)) { 443 transdata = (void **) xmlMalloc(nbstates * nbatoms * 444 sizeof(void *)); 445 if (transdata != NULL) 446 memset(transdata, 0, 447 nbstates * nbatoms * sizeof(void *)); 448 } 449 targetno = stateRemap[trans->to]; 450 /* 451 * if the same atome can generate transition to 2 different 452 * states then it means the automata is not determinist and 453 * the compact form can't be used ! 454 */ 455 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 456 if (prev != 0) { 457 if (prev != targetno + 1) { 458 ret->determinist = 0; 459#ifdef DEBUG_COMPACTION 460 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 461 i, j, trans->atom->no, trans->to, atomno, targetno); 462 printf(" previous to is %d\n", prev); 463#endif 464 ret->determinist = 0; 465 if (transdata != NULL) 466 xmlFree(transdata); 467 xmlFree(transitions); 468 xmlFree(stateRemap); 469 xmlFree(stringRemap); 470 for (i = 0;i < nbatoms;i++) 471 xmlFree(stringMap[i]); 472 xmlFree(stringMap); 473 goto fail_compact; 474 } 475 } else { 476#if 0 477 printf("State %d trans %d: atom %d to %d : %d to %d\n", 478 i, j, trans->atom->no, trans->to, atomno, targetno); 479#endif 480 transitions[stateno * (nbatoms + 1) + atomno + 1] = 481 targetno + 1; /* to avoid 0 */ 482 if (transdata != NULL) 483 transdata[stateno * nbatoms + atomno] = 484 trans->atom->data; 485 } 486 } 487 } 488 ret->determinist = 1; 489#ifdef DEBUG_COMPACTION 490 /* 491 * Debug 492 */ 493 for (i = 0;i < nbstates;i++) { 494 for (j = 0;j < nbatoms + 1;j++) { 495 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 496 } 497 printf("\n"); 498 } 499 printf("\n"); 500#endif 501 /* 502 * Cleanup of the old data 503 */ 504 if (ret->states != NULL) { 505 for (i = 0;i < ret->nbStates;i++) 506 xmlRegFreeState(ret->states[i]); 507 xmlFree(ret->states); 508 } 509 ret->states = NULL; 510 ret->nbStates = 0; 511 if (ret->atoms != NULL) { 512 for (i = 0;i < ret->nbAtoms;i++) 513 xmlRegFreeAtom(ret->atoms[i]); 514 xmlFree(ret->atoms); 515 } 516 ret->atoms = NULL; 517 ret->nbAtoms = 0; 518 519 ret->compact = transitions; 520 ret->transdata = transdata; 521 ret->stringMap = stringMap; 522 ret->nbstrings = nbatoms; 523 ret->nbstates = nbstates; 524 xmlFree(stateRemap); 525 xmlFree(stringRemap); 526 } 527fail_compact: 528 return(ret); 529} 530 531/** 532 * xmlRegNewParserCtxt: 533 * @string: the string to parse 534 * 535 * Allocate a new regexp parser context 536 * 537 * Returns the new context or NULL in case of error 538 */ 539static xmlRegParserCtxtPtr 540xmlRegNewParserCtxt(const xmlChar *string) { 541 xmlRegParserCtxtPtr ret; 542 543 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 544 if (ret == NULL) 545 return(NULL); 546 memset(ret, 0, sizeof(xmlRegParserCtxt)); 547 if (string != NULL) 548 ret->string = xmlStrdup(string); 549 ret->cur = ret->string; 550 ret->neg = 0; 551 ret->error = 0; 552 ret->determinist = -1; 553 return(ret); 554} 555 556/** 557 * xmlRegNewRange: 558 * @ctxt: the regexp parser context 559 * @neg: is that negative 560 * @type: the type of range 561 * @start: the start codepoint 562 * @end: the end codepoint 563 * 564 * Allocate a new regexp range 565 * 566 * Returns the new range or NULL in case of error 567 */ 568static xmlRegRangePtr 569xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 570 int neg, xmlRegAtomType type, int start, int end) { 571 xmlRegRangePtr ret; 572 573 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 574 if (ret == NULL) { 575 ERROR("failed to allocate regexp range"); 576 return(NULL); 577 } 578 ret->neg = neg; 579 ret->type = type; 580 ret->start = start; 581 ret->end = end; 582 return(ret); 583} 584 585/** 586 * xmlRegFreeRange: 587 * @range: the regexp range 588 * 589 * Free a regexp range 590 */ 591static void 592xmlRegFreeRange(xmlRegRangePtr range) { 593 if (range == NULL) 594 return; 595 596 if (range->blockName != NULL) 597 xmlFree(range->blockName); 598 xmlFree(range); 599} 600 601/** 602 * xmlRegNewAtom: 603 * @ctxt: the regexp parser context 604 * @type: the type of atom 605 * 606 * Allocate a new regexp range 607 * 608 * Returns the new atom or NULL in case of error 609 */ 610static xmlRegAtomPtr 611xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 612 xmlRegAtomPtr ret; 613 614 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 615 if (ret == NULL) { 616 ERROR("failed to allocate regexp atom"); 617 return(NULL); 618 } 619 memset(ret, 0, sizeof(xmlRegAtom)); 620 ret->type = type; 621 ret->quant = XML_REGEXP_QUANT_ONCE; 622 ret->min = 0; 623 ret->max = 0; 624 return(ret); 625} 626 627/** 628 * xmlRegFreeAtom: 629 * @atom: the regexp atom 630 * 631 * Free a regexp atom 632 */ 633static void 634xmlRegFreeAtom(xmlRegAtomPtr atom) { 635 int i; 636 637 if (atom == NULL) 638 return; 639 640 for (i = 0;i < atom->nbRanges;i++) 641 xmlRegFreeRange(atom->ranges[i]); 642 if (atom->ranges != NULL) 643 xmlFree(atom->ranges); 644 if (atom->type == XML_REGEXP_STRING) 645 xmlFree(atom->valuep); 646 xmlFree(atom); 647} 648 649static xmlRegStatePtr 650xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 651 xmlRegStatePtr ret; 652 653 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 654 if (ret == NULL) { 655 ERROR("failed to allocate regexp state"); 656 return(NULL); 657 } 658 memset(ret, 0, sizeof(xmlRegState)); 659 ret->type = XML_REGEXP_TRANS_STATE; 660 ret->mark = XML_REGEXP_MARK_NORMAL; 661 return(ret); 662} 663 664/** 665 * xmlRegFreeState: 666 * @state: the regexp state 667 * 668 * Free a regexp state 669 */ 670static void 671xmlRegFreeState(xmlRegStatePtr state) { 672 if (state == NULL) 673 return; 674 675 if (state->trans != NULL) 676 xmlFree(state->trans); 677 xmlFree(state); 678} 679 680/** 681 * xmlRegFreeParserCtxt: 682 * @ctxt: the regexp parser context 683 * 684 * Free a regexp parser context 685 */ 686static void 687xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 688 int i; 689 if (ctxt == NULL) 690 return; 691 692 if (ctxt->string != NULL) 693 xmlFree(ctxt->string); 694 if (ctxt->states != NULL) { 695 for (i = 0;i < ctxt->nbStates;i++) 696 xmlRegFreeState(ctxt->states[i]); 697 xmlFree(ctxt->states); 698 } 699 if (ctxt->atoms != NULL) { 700 for (i = 0;i < ctxt->nbAtoms;i++) 701 xmlRegFreeAtom(ctxt->atoms[i]); 702 xmlFree(ctxt->atoms); 703 } 704 if (ctxt->counters != NULL) 705 xmlFree(ctxt->counters); 706 xmlFree(ctxt); 707} 708 709/************************************************************************ 710 * * 711 * Display of Data structures * 712 * * 713 ************************************************************************/ 714 715static void 716xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 717 switch (type) { 718 case XML_REGEXP_EPSILON: 719 fprintf(output, "epsilon "); break; 720 case XML_REGEXP_CHARVAL: 721 fprintf(output, "charval "); break; 722 case XML_REGEXP_RANGES: 723 fprintf(output, "ranges "); break; 724 case XML_REGEXP_SUBREG: 725 fprintf(output, "subexpr "); break; 726 case XML_REGEXP_STRING: 727 fprintf(output, "string "); break; 728 case XML_REGEXP_ANYCHAR: 729 fprintf(output, "anychar "); break; 730 case XML_REGEXP_ANYSPACE: 731 fprintf(output, "anyspace "); break; 732 case XML_REGEXP_NOTSPACE: 733 fprintf(output, "notspace "); break; 734 case XML_REGEXP_INITNAME: 735 fprintf(output, "initname "); break; 736 case XML_REGEXP_NOTINITNAME: 737 fprintf(output, "notinitname "); break; 738 case XML_REGEXP_NAMECHAR: 739 fprintf(output, "namechar "); break; 740 case XML_REGEXP_NOTNAMECHAR: 741 fprintf(output, "notnamechar "); break; 742 case XML_REGEXP_DECIMAL: 743 fprintf(output, "decimal "); break; 744 case XML_REGEXP_NOTDECIMAL: 745 fprintf(output, "notdecimal "); break; 746 case XML_REGEXP_REALCHAR: 747 fprintf(output, "realchar "); break; 748 case XML_REGEXP_NOTREALCHAR: 749 fprintf(output, "notrealchar "); break; 750 case XML_REGEXP_LETTER: 751 fprintf(output, "LETTER "); break; 752 case XML_REGEXP_LETTER_UPPERCASE: 753 fprintf(output, "LETTER_UPPERCASE "); break; 754 case XML_REGEXP_LETTER_LOWERCASE: 755 fprintf(output, "LETTER_LOWERCASE "); break; 756 case XML_REGEXP_LETTER_TITLECASE: 757 fprintf(output, "LETTER_TITLECASE "); break; 758 case XML_REGEXP_LETTER_MODIFIER: 759 fprintf(output, "LETTER_MODIFIER "); break; 760 case XML_REGEXP_LETTER_OTHERS: 761 fprintf(output, "LETTER_OTHERS "); break; 762 case XML_REGEXP_MARK: 763 fprintf(output, "MARK "); break; 764 case XML_REGEXP_MARK_NONSPACING: 765 fprintf(output, "MARK_NONSPACING "); break; 766 case XML_REGEXP_MARK_SPACECOMBINING: 767 fprintf(output, "MARK_SPACECOMBINING "); break; 768 case XML_REGEXP_MARK_ENCLOSING: 769 fprintf(output, "MARK_ENCLOSING "); break; 770 case XML_REGEXP_NUMBER: 771 fprintf(output, "NUMBER "); break; 772 case XML_REGEXP_NUMBER_DECIMAL: 773 fprintf(output, "NUMBER_DECIMAL "); break; 774 case XML_REGEXP_NUMBER_LETTER: 775 fprintf(output, "NUMBER_LETTER "); break; 776 case XML_REGEXP_NUMBER_OTHERS: 777 fprintf(output, "NUMBER_OTHERS "); break; 778 case XML_REGEXP_PUNCT: 779 fprintf(output, "PUNCT "); break; 780 case XML_REGEXP_PUNCT_CONNECTOR: 781 fprintf(output, "PUNCT_CONNECTOR "); break; 782 case XML_REGEXP_PUNCT_DASH: 783 fprintf(output, "PUNCT_DASH "); break; 784 case XML_REGEXP_PUNCT_OPEN: 785 fprintf(output, "PUNCT_OPEN "); break; 786 case XML_REGEXP_PUNCT_CLOSE: 787 fprintf(output, "PUNCT_CLOSE "); break; 788 case XML_REGEXP_PUNCT_INITQUOTE: 789 fprintf(output, "PUNCT_INITQUOTE "); break; 790 case XML_REGEXP_PUNCT_FINQUOTE: 791 fprintf(output, "PUNCT_FINQUOTE "); break; 792 case XML_REGEXP_PUNCT_OTHERS: 793 fprintf(output, "PUNCT_OTHERS "); break; 794 case XML_REGEXP_SEPAR: 795 fprintf(output, "SEPAR "); break; 796 case XML_REGEXP_SEPAR_SPACE: 797 fprintf(output, "SEPAR_SPACE "); break; 798 case XML_REGEXP_SEPAR_LINE: 799 fprintf(output, "SEPAR_LINE "); break; 800 case XML_REGEXP_SEPAR_PARA: 801 fprintf(output, "SEPAR_PARA "); break; 802 case XML_REGEXP_SYMBOL: 803 fprintf(output, "SYMBOL "); break; 804 case XML_REGEXP_SYMBOL_MATH: 805 fprintf(output, "SYMBOL_MATH "); break; 806 case XML_REGEXP_SYMBOL_CURRENCY: 807 fprintf(output, "SYMBOL_CURRENCY "); break; 808 case XML_REGEXP_SYMBOL_MODIFIER: 809 fprintf(output, "SYMBOL_MODIFIER "); break; 810 case XML_REGEXP_SYMBOL_OTHERS: 811 fprintf(output, "SYMBOL_OTHERS "); break; 812 case XML_REGEXP_OTHER: 813 fprintf(output, "OTHER "); break; 814 case XML_REGEXP_OTHER_CONTROL: 815 fprintf(output, "OTHER_CONTROL "); break; 816 case XML_REGEXP_OTHER_FORMAT: 817 fprintf(output, "OTHER_FORMAT "); break; 818 case XML_REGEXP_OTHER_PRIVATE: 819 fprintf(output, "OTHER_PRIVATE "); break; 820 case XML_REGEXP_OTHER_NA: 821 fprintf(output, "OTHER_NA "); break; 822 case XML_REGEXP_BLOCK_NAME: 823 fprintf(output, "BLOCK "); break; 824 } 825} 826 827static void 828xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 829 switch (type) { 830 case XML_REGEXP_QUANT_EPSILON: 831 fprintf(output, "epsilon "); break; 832 case XML_REGEXP_QUANT_ONCE: 833 fprintf(output, "once "); break; 834 case XML_REGEXP_QUANT_OPT: 835 fprintf(output, "? "); break; 836 case XML_REGEXP_QUANT_MULT: 837 fprintf(output, "* "); break; 838 case XML_REGEXP_QUANT_PLUS: 839 fprintf(output, "+ "); break; 840 case XML_REGEXP_QUANT_RANGE: 841 fprintf(output, "range "); break; 842 case XML_REGEXP_QUANT_ONCEONLY: 843 fprintf(output, "onceonly "); break; 844 case XML_REGEXP_QUANT_ALL: 845 fprintf(output, "all "); break; 846 } 847} 848static void 849xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 850 fprintf(output, " range: "); 851 if (range->neg) 852 fprintf(output, "negative "); 853 xmlRegPrintAtomType(output, range->type); 854 fprintf(output, "%c - %c\n", range->start, range->end); 855} 856 857static void 858xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 859 fprintf(output, " atom: "); 860 if (atom == NULL) { 861 fprintf(output, "NULL\n"); 862 return; 863 } 864 xmlRegPrintAtomType(output, atom->type); 865 xmlRegPrintQuantType(output, atom->quant); 866 if (atom->quant == XML_REGEXP_QUANT_RANGE) 867 fprintf(output, "%d-%d ", atom->min, atom->max); 868 if (atom->type == XML_REGEXP_STRING) 869 fprintf(output, "'%s' ", (char *) atom->valuep); 870 if (atom->type == XML_REGEXP_CHARVAL) 871 fprintf(output, "char %c\n", atom->codepoint); 872 else if (atom->type == XML_REGEXP_RANGES) { 873 int i; 874 fprintf(output, "%d entries\n", atom->nbRanges); 875 for (i = 0; i < atom->nbRanges;i++) 876 xmlRegPrintRange(output, atom->ranges[i]); 877 } else if (atom->type == XML_REGEXP_SUBREG) { 878 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 879 } else { 880 fprintf(output, "\n"); 881 } 882} 883 884static void 885xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 886 fprintf(output, " trans: "); 887 if (trans == NULL) { 888 fprintf(output, "NULL\n"); 889 return; 890 } 891 if (trans->to < 0) { 892 fprintf(output, "removed\n"); 893 return; 894 } 895 if (trans->counter >= 0) { 896 fprintf(output, "counted %d, ", trans->counter); 897 } 898 if (trans->count == REGEXP_ALL_COUNTER) { 899 fprintf(output, "all transition, "); 900 } else if (trans->count >= 0) { 901 fprintf(output, "count based %d, ", trans->count); 902 } 903 if (trans->atom == NULL) { 904 fprintf(output, "epsilon to %d\n", trans->to); 905 return; 906 } 907 if (trans->atom->type == XML_REGEXP_CHARVAL) 908 fprintf(output, "char %c ", trans->atom->codepoint); 909 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 910} 911 912static void 913xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 914 int i; 915 916 fprintf(output, " state: "); 917 if (state == NULL) { 918 fprintf(output, "NULL\n"); 919 return; 920 } 921 if (state->type == XML_REGEXP_START_STATE) 922 fprintf(output, "START "); 923 if (state->type == XML_REGEXP_FINAL_STATE) 924 fprintf(output, "FINAL "); 925 926 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 927 for (i = 0;i < state->nbTrans; i++) { 928 xmlRegPrintTrans(output, &(state->trans[i])); 929 } 930} 931 932#ifdef DEBUG_REGEXP_GRAPH 933static void 934xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 935 int i; 936 937 fprintf(output, " ctxt: "); 938 if (ctxt == NULL) { 939 fprintf(output, "NULL\n"); 940 return; 941 } 942 fprintf(output, "'%s' ", ctxt->string); 943 if (ctxt->error) 944 fprintf(output, "error "); 945 if (ctxt->neg) 946 fprintf(output, "neg "); 947 fprintf(output, "\n"); 948 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 949 for (i = 0;i < ctxt->nbAtoms; i++) { 950 fprintf(output, " %02d ", i); 951 xmlRegPrintAtom(output, ctxt->atoms[i]); 952 } 953 if (ctxt->atom != NULL) { 954 fprintf(output, "current atom:\n"); 955 xmlRegPrintAtom(output, ctxt->atom); 956 } 957 fprintf(output, "%d states:", ctxt->nbStates); 958 if (ctxt->start != NULL) 959 fprintf(output, " start: %d", ctxt->start->no); 960 if (ctxt->end != NULL) 961 fprintf(output, " end: %d", ctxt->end->no); 962 fprintf(output, "\n"); 963 for (i = 0;i < ctxt->nbStates; i++) { 964 xmlRegPrintState(output, ctxt->states[i]); 965 } 966 fprintf(output, "%d counters:\n", ctxt->nbCounters); 967 for (i = 0;i < ctxt->nbCounters; i++) { 968 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 969 ctxt->counters[i].max); 970 } 971} 972#endif 973 974/************************************************************************ 975 * * 976 * Finite Automata structures manipulations * 977 * * 978 ************************************************************************/ 979 980static void 981xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 982 int neg, xmlRegAtomType type, int start, int end, 983 xmlChar *blockName) { 984 xmlRegRangePtr range; 985 986 if (atom == NULL) { 987 ERROR("add range: atom is NULL"); 988 return; 989 } 990 if (atom->type != XML_REGEXP_RANGES) { 991 ERROR("add range: atom is not ranges"); 992 return; 993 } 994 if (atom->maxRanges == 0) { 995 atom->maxRanges = 4; 996 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 997 sizeof(xmlRegRangePtr)); 998 if (atom->ranges == NULL) { 999 ERROR("add range: allocation failed"); 1000 atom->maxRanges = 0; 1001 return; 1002 } 1003 } else if (atom->nbRanges >= atom->maxRanges) { 1004 xmlRegRangePtr *tmp; 1005 atom->maxRanges *= 2; 1006 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1007 sizeof(xmlRegRangePtr)); 1008 if (tmp == NULL) { 1009 ERROR("add range: allocation failed"); 1010 atom->maxRanges /= 2; 1011 return; 1012 } 1013 atom->ranges = tmp; 1014 } 1015 range = xmlRegNewRange(ctxt, neg, type, start, end); 1016 if (range == NULL) 1017 return; 1018 range->blockName = blockName; 1019 atom->ranges[atom->nbRanges++] = range; 1020 1021} 1022 1023static int 1024xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1025 if (ctxt->maxCounters == 0) { 1026 ctxt->maxCounters = 4; 1027 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1028 sizeof(xmlRegCounter)); 1029 if (ctxt->counters == NULL) { 1030 ERROR("reg counter: allocation failed"); 1031 ctxt->maxCounters = 0; 1032 return(-1); 1033 } 1034 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1035 xmlRegCounter *tmp; 1036 ctxt->maxCounters *= 2; 1037 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1038 sizeof(xmlRegCounter)); 1039 if (tmp == NULL) { 1040 ERROR("reg counter: allocation failed"); 1041 ctxt->maxCounters /= 2; 1042 return(-1); 1043 } 1044 ctxt->counters = tmp; 1045 } 1046 ctxt->counters[ctxt->nbCounters].min = -1; 1047 ctxt->counters[ctxt->nbCounters].max = -1; 1048 return(ctxt->nbCounters++); 1049} 1050 1051static void 1052xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1053 if (atom == NULL) { 1054 ERROR("atom push: atom is NULL"); 1055 return; 1056 } 1057 if (ctxt->maxAtoms == 0) { 1058 ctxt->maxAtoms = 4; 1059 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1060 sizeof(xmlRegAtomPtr)); 1061 if (ctxt->atoms == NULL) { 1062 ERROR("atom push: allocation failed"); 1063 ctxt->maxAtoms = 0; 1064 return; 1065 } 1066 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1067 xmlRegAtomPtr *tmp; 1068 ctxt->maxAtoms *= 2; 1069 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1070 sizeof(xmlRegAtomPtr)); 1071 if (tmp == NULL) { 1072 ERROR("atom push: allocation failed"); 1073 ctxt->maxAtoms /= 2; 1074 return; 1075 } 1076 ctxt->atoms = tmp; 1077 } 1078 atom->no = ctxt->nbAtoms; 1079 ctxt->atoms[ctxt->nbAtoms++] = atom; 1080} 1081 1082static void 1083xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1084 xmlRegAtomPtr atom, xmlRegStatePtr target, 1085 int counter, int count) { 1086 if (state == NULL) { 1087 ERROR("add state: state is NULL"); 1088 return; 1089 } 1090 if (target == NULL) { 1091 ERROR("add state: target is NULL"); 1092 return; 1093 } 1094 if (state->maxTrans == 0) { 1095 state->maxTrans = 4; 1096 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1097 sizeof(xmlRegTrans)); 1098 if (state->trans == NULL) { 1099 ERROR("add range: allocation failed"); 1100 state->maxTrans = 0; 1101 return; 1102 } 1103 } else if (state->nbTrans >= state->maxTrans) { 1104 xmlRegTrans *tmp; 1105 state->maxTrans *= 2; 1106 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1107 sizeof(xmlRegTrans)); 1108 if (tmp == NULL) { 1109 ERROR("add range: allocation failed"); 1110 state->maxTrans /= 2; 1111 return; 1112 } 1113 state->trans = tmp; 1114 } 1115#ifdef DEBUG_REGEXP_GRAPH 1116 printf("Add trans from %d to %d ", state->no, target->no); 1117 if (count == REGEXP_ALL_COUNTER) 1118 printf("all transition"); 1119 else if (count >= 0) 1120 printf("count based %d", count); 1121 else if (counter >= 0) 1122 printf("counted %d", counter); 1123 else if (atom == NULL) 1124 printf("epsilon transition"); 1125 printf("\n"); 1126#endif 1127 1128 state->trans[state->nbTrans].atom = atom; 1129 state->trans[state->nbTrans].to = target->no; 1130 state->trans[state->nbTrans].counter = counter; 1131 state->trans[state->nbTrans].count = count; 1132 state->nbTrans++; 1133} 1134 1135static void 1136xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1137 if (ctxt->maxStates == 0) { 1138 ctxt->maxStates = 4; 1139 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1140 sizeof(xmlRegStatePtr)); 1141 if (ctxt->states == NULL) { 1142 ERROR("add range: allocation failed"); 1143 ctxt->maxStates = 0; 1144 return; 1145 } 1146 } else if (ctxt->nbStates >= ctxt->maxStates) { 1147 xmlRegStatePtr *tmp; 1148 ctxt->maxStates *= 2; 1149 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1150 sizeof(xmlRegStatePtr)); 1151 if (tmp == NULL) { 1152 ERROR("add range: allocation failed"); 1153 ctxt->maxStates /= 2; 1154 return; 1155 } 1156 ctxt->states = tmp; 1157 } 1158 state->no = ctxt->nbStates; 1159 ctxt->states[ctxt->nbStates++] = state; 1160} 1161 1162/** 1163 * xmlFAGenerateAllTransition: 1164 * @ctxt: a regexp parser context 1165 * @from: the from state 1166 * @to: the target state or NULL for building a new one 1167 * @lax: 1168 * 1169 */ 1170static void 1171xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1172 xmlRegStatePtr from, xmlRegStatePtr to, 1173 int lax) { 1174 if (to == NULL) { 1175 to = xmlRegNewState(ctxt); 1176 xmlRegStatePush(ctxt, to); 1177 ctxt->state = to; 1178 } 1179 if (lax) 1180 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1181 else 1182 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1183} 1184 1185/** 1186 * xmlFAGenerateEpsilonTransition: 1187 * @ctxt: a regexp parser context 1188 * @from: the from state 1189 * @to: the target state or NULL for building a new one 1190 * 1191 */ 1192static void 1193xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1194 xmlRegStatePtr from, xmlRegStatePtr to) { 1195 if (to == NULL) { 1196 to = xmlRegNewState(ctxt); 1197 xmlRegStatePush(ctxt, to); 1198 ctxt->state = to; 1199 } 1200 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1201} 1202 1203/** 1204 * xmlFAGenerateCountedEpsilonTransition: 1205 * @ctxt: a regexp parser context 1206 * @from: the from state 1207 * @to: the target state or NULL for building a new one 1208 * counter: the counter for that transition 1209 * 1210 */ 1211static void 1212xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1213 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1214 if (to == NULL) { 1215 to = xmlRegNewState(ctxt); 1216 xmlRegStatePush(ctxt, to); 1217 ctxt->state = to; 1218 } 1219 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1220} 1221 1222/** 1223 * xmlFAGenerateCountedTransition: 1224 * @ctxt: a regexp parser context 1225 * @from: the from state 1226 * @to: the target state or NULL for building a new one 1227 * counter: the counter for that transition 1228 * 1229 */ 1230static void 1231xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1232 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1233 if (to == NULL) { 1234 to = xmlRegNewState(ctxt); 1235 xmlRegStatePush(ctxt, to); 1236 ctxt->state = to; 1237 } 1238 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1239} 1240 1241/** 1242 * xmlFAGenerateTransitions: 1243 * @ctxt: a regexp parser context 1244 * @from: the from state 1245 * @to: the target state or NULL for building a new one 1246 * @atom: the atom generating the transition 1247 * 1248 */ 1249static void 1250xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1251 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1252 if (atom == NULL) { 1253 ERROR("genrate transition: atom == NULL"); 1254 return; 1255 } 1256 if (atom->type == XML_REGEXP_SUBREG) { 1257 /* 1258 * this is a subexpression handling one should not need to 1259 * create a new node excep for XML_REGEXP_QUANT_RANGE. 1260 */ 1261 xmlRegAtomPush(ctxt, atom); 1262 if ((to != NULL) && (atom->stop != to) && 1263 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1264 /* 1265 * Generate an epsilon transition to link to the target 1266 */ 1267 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1268 } 1269 switch (atom->quant) { 1270 case XML_REGEXP_QUANT_OPT: 1271 atom->quant = XML_REGEXP_QUANT_ONCE; 1272 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1273 break; 1274 case XML_REGEXP_QUANT_MULT: 1275 atom->quant = XML_REGEXP_QUANT_ONCE; 1276 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1277 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1278 break; 1279 case XML_REGEXP_QUANT_PLUS: 1280 atom->quant = XML_REGEXP_QUANT_ONCE; 1281 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1282 break; 1283 case XML_REGEXP_QUANT_RANGE: { 1284 int counter; 1285 xmlRegStatePtr newstate; 1286 1287 /* 1288 * This one is nasty: 1289 * 1/ register a new counter 1290 * 2/ register an epsilon transition associated to 1291 * this counter going from atom->stop to atom->start 1292 * 3/ create a new state 1293 * 4/ generate a counted transition from atom->stop to 1294 * that state 1295 */ 1296 counter = xmlRegGetCounter(ctxt); 1297 ctxt->counters[counter].min = atom->min - 1; 1298 ctxt->counters[counter].max = atom->max - 1; 1299 atom->min = 0; 1300 atom->max = 0; 1301 atom->quant = XML_REGEXP_QUANT_ONCE; 1302 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1303 atom->start, counter); 1304 if (to != NULL) { 1305 newstate = to; 1306 } else { 1307 newstate = xmlRegNewState(ctxt); 1308 xmlRegStatePush(ctxt, newstate); 1309 ctxt->state = newstate; 1310 } 1311 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1312 newstate, counter); 1313 } 1314 default: 1315 break; 1316 } 1317 return; 1318 } else { 1319 if (to == NULL) { 1320 to = xmlRegNewState(ctxt); 1321 xmlRegStatePush(ctxt, to); 1322 } 1323 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1324 xmlRegAtomPush(ctxt, atom); 1325 ctxt->state = to; 1326 } 1327 switch (atom->quant) { 1328 case XML_REGEXP_QUANT_OPT: 1329 atom->quant = XML_REGEXP_QUANT_ONCE; 1330 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1331 break; 1332 case XML_REGEXP_QUANT_MULT: 1333 atom->quant = XML_REGEXP_QUANT_ONCE; 1334 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1335 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1336 break; 1337 case XML_REGEXP_QUANT_PLUS: 1338 atom->quant = XML_REGEXP_QUANT_ONCE; 1339 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1340 break; 1341 default: 1342 break; 1343 } 1344} 1345 1346/** 1347 * xmlFAReduceEpsilonTransitions: 1348 * @ctxt: a regexp parser context 1349 * @fromnr: the from state 1350 * @tonr: the to state 1351 * @cpunter: should that transition be associted to a counted 1352 * 1353 */ 1354static void 1355xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1356 int tonr, int counter) { 1357 int transnr; 1358 xmlRegStatePtr from; 1359 xmlRegStatePtr to; 1360 1361#ifdef DEBUG_REGEXP_GRAPH 1362 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1363#endif 1364 from = ctxt->states[fromnr]; 1365 if (from == NULL) 1366 return; 1367 to = ctxt->states[tonr]; 1368 if (to == NULL) 1369 return; 1370 if ((to->mark == XML_REGEXP_MARK_START) || 1371 (to->mark == XML_REGEXP_MARK_VISITED)) 1372 return; 1373 1374 to->mark = XML_REGEXP_MARK_VISITED; 1375 if (to->type == XML_REGEXP_FINAL_STATE) { 1376#ifdef DEBUG_REGEXP_GRAPH 1377 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1378#endif 1379 from->type = XML_REGEXP_FINAL_STATE; 1380 } 1381 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1382 if (to->trans[transnr].atom == NULL) { 1383 /* 1384 * Don't remove counted transitions 1385 * Don't loop either 1386 */ 1387 if (to->trans[transnr].to != fromnr) { 1388 if (to->trans[transnr].count >= 0) { 1389 int newto = to->trans[transnr].to; 1390 1391 xmlRegStateAddTrans(ctxt, from, NULL, 1392 ctxt->states[newto], 1393 -1, to->trans[transnr].count); 1394 } else { 1395#ifdef DEBUG_REGEXP_GRAPH 1396 printf("Found epsilon trans %d from %d to %d\n", 1397 transnr, tonr, to->trans[transnr].to); 1398#endif 1399 if (to->trans[transnr].counter >= 0) { 1400 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1401 to->trans[transnr].to, 1402 to->trans[transnr].counter); 1403 } else { 1404 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1405 to->trans[transnr].to, 1406 counter); 1407 } 1408 } 1409 } 1410 } else { 1411 int newto = to->trans[transnr].to; 1412 1413 if (to->trans[transnr].counter >= 0) { 1414 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1415 ctxt->states[newto], 1416 to->trans[transnr].counter, -1); 1417 } else { 1418 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1419 ctxt->states[newto], counter, -1); 1420 } 1421 } 1422 } 1423 to->mark = XML_REGEXP_MARK_NORMAL; 1424} 1425 1426/** 1427 * xmlFAEliminateEpsilonTransitions: 1428 * @ctxt: a regexp parser context 1429 * 1430 */ 1431static void 1432xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1433 int statenr, transnr; 1434 xmlRegStatePtr state; 1435 1436 /* 1437 * build the completed transitions bypassing the epsilons 1438 * Use a marking algorithm to avoid loops 1439 */ 1440 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1441 state = ctxt->states[statenr]; 1442 if (state == NULL) 1443 continue; 1444 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1445 if ((state->trans[transnr].atom == NULL) && 1446 (state->trans[transnr].to >= 0)) { 1447 if (state->trans[transnr].to == statenr) { 1448 state->trans[transnr].to = -1; 1449#ifdef DEBUG_REGEXP_GRAPH 1450 printf("Removed loopback epsilon trans %d on %d\n", 1451 transnr, statenr); 1452#endif 1453 } else if (state->trans[transnr].count < 0) { 1454 int newto = state->trans[transnr].to; 1455 1456#ifdef DEBUG_REGEXP_GRAPH 1457 printf("Found epsilon trans %d from %d to %d\n", 1458 transnr, statenr, newto); 1459#endif 1460 state->mark = XML_REGEXP_MARK_START; 1461 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1462 newto, state->trans[transnr].counter); 1463 state->mark = XML_REGEXP_MARK_NORMAL; 1464#ifdef DEBUG_REGEXP_GRAPH 1465 } else { 1466 printf("Found counted transition %d on %d\n", 1467 transnr, statenr); 1468#endif 1469 } 1470 } 1471 } 1472 } 1473 /* 1474 * Eliminate the epsilon transitions 1475 */ 1476 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1477 state = ctxt->states[statenr]; 1478 if (state == NULL) 1479 continue; 1480 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1481 if ((state->trans[transnr].atom == NULL) && 1482 (state->trans[transnr].count < 0) && 1483 (state->trans[transnr].to >= 0)) { 1484 state->trans[transnr].to = -1; 1485 } 1486 } 1487 } 1488 1489 /* 1490 * Use this pass to detect unreachable states too 1491 */ 1492 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1493 state = ctxt->states[statenr]; 1494 if (state != NULL) 1495 state->reached = 0; 1496 } 1497 state = ctxt->states[0]; 1498 if (state != NULL) 1499 state->reached = 1; 1500 while (state != NULL) { 1501 xmlRegStatePtr target = NULL; 1502 state->reached = 2; 1503 /* 1504 * Mark all state reachable from the current reachable state 1505 */ 1506 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1507 if ((state->trans[transnr].to >= 0) && 1508 ((state->trans[transnr].atom != NULL) || 1509 (state->trans[transnr].count >= 0))) { 1510 int newto = state->trans[transnr].to; 1511 1512 if (ctxt->states[newto] == NULL) 1513 continue; 1514 if (ctxt->states[newto]->reached == 0) { 1515 ctxt->states[newto]->reached = 1; 1516 target = ctxt->states[newto]; 1517 } 1518 } 1519 } 1520 /* 1521 * find the next accessible state not explored 1522 */ 1523 if (target == NULL) { 1524 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1525 state = ctxt->states[statenr]; 1526 if ((state != NULL) && (state->reached == 1)) { 1527 target = state; 1528 break; 1529 } 1530 } 1531 } 1532 state = target; 1533 } 1534 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1535 state = ctxt->states[statenr]; 1536 if ((state != NULL) && (state->reached == 0)) { 1537#ifdef DEBUG_REGEXP_GRAPH 1538 printf("Removed unreachable state %d\n", statenr); 1539#endif 1540 xmlRegFreeState(state); 1541 ctxt->states[statenr] = NULL; 1542 } 1543 } 1544 1545} 1546 1547/** 1548 * xmlFACompareAtoms: 1549 * @atom1: an atom 1550 * @atom2: an atom 1551 * 1552 * Compares two atoms to check whether they are equivatents 1553 * 1554 * Returns 1 if yes and 0 otherwise 1555 */ 1556static int 1557xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 1558 if (atom1 == atom2) 1559 return(1); 1560 if ((atom1 == NULL) || (atom2 == NULL)) 1561 return(0); 1562 1563 if (atom1->type != atom2->type) 1564 return(0); 1565 switch (atom1->type) { 1566 case XML_REGEXP_STRING: 1567 return(xmlStrEqual((xmlChar *)atom1->valuep, 1568 (xmlChar *)atom2->valuep)); 1569 case XML_REGEXP_EPSILON: 1570 return(1); 1571 case XML_REGEXP_CHARVAL: 1572 return(atom1->codepoint == atom2->codepoint); 1573 case XML_REGEXP_RANGES: 1574 TODO; 1575 return(0); 1576 default: 1577 break; 1578 } 1579 return(1); 1580} 1581 1582/** 1583 * xmlFARecurseDeterminism: 1584 * @ctxt: a regexp parser context 1585 * 1586 * Check whether the associated regexp is determinist, 1587 * should be called after xmlFAEliminateEpsilonTransitions() 1588 * 1589 */ 1590static int 1591xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1592 int to, xmlRegAtomPtr atom) { 1593 int ret = 1; 1594 int transnr; 1595 xmlRegTransPtr t1; 1596 1597 if (state == NULL) 1598 return(ret); 1599 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1600 t1 = &(state->trans[transnr]); 1601 /* 1602 * check transitions conflicting with the one looked at 1603 */ 1604 if (t1->atom == NULL) { 1605 if (t1->to == -1) 1606 continue; 1607 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1608 to, atom); 1609 if (ret == 0) 1610 return(0); 1611 continue; 1612 } 1613 if (t1->to != to) 1614 continue; 1615 if (xmlFACompareAtoms(t1->atom, atom)) 1616 return(0); 1617 } 1618 return(ret); 1619} 1620 1621/** 1622 * xmlFAComputesDeterminism: 1623 * @ctxt: a regexp parser context 1624 * 1625 * Check whether the associated regexp is determinist, 1626 * should be called after xmlFAEliminateEpsilonTransitions() 1627 * 1628 */ 1629static int 1630xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 1631 int statenr, transnr; 1632 xmlRegStatePtr state; 1633 xmlRegTransPtr t1, t2; 1634 int i; 1635 int ret = 1; 1636 1637#ifdef DEBUG_REGEXP_GRAPH 1638 printf("xmlFAComputesDeterminism\n"); 1639 xmlRegPrintCtxt(stdout, ctxt); 1640#endif 1641 if (ctxt->determinist != -1) 1642 return(ctxt->determinist); 1643 1644 /* 1645 * Check for all states that there isn't 2 transitions 1646 * with the same atom and a different target. 1647 */ 1648 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1649 state = ctxt->states[statenr]; 1650 if (state == NULL) 1651 continue; 1652 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1653 t1 = &(state->trans[transnr]); 1654 /* 1655 * Determinism checks in case of counted or all transitions 1656 * will have to be handled separately 1657 */ 1658 if (t1->atom == NULL) 1659 continue; 1660 if (t1->to == -1) /* eliminated */ 1661 continue; 1662 for (i = 0;i < transnr;i++) { 1663 t2 = &(state->trans[i]); 1664 if (t2->to == -1) /* eliminated */ 1665 continue; 1666 if (t2->atom != NULL) { 1667 if (t1->to == t2->to) { 1668 if (xmlFACompareAtoms(t1->atom, t2->atom)) 1669 t2->to = -1; /* eliminate */ 1670 } else { 1671 /* not determinist ! */ 1672 if (xmlFACompareAtoms(t1->atom, t2->atom)) 1673 ret = 0; 1674 } 1675 } else if (t1->to != -1) { 1676 /* 1677 * do the closure in case of remaining specific 1678 * epsilon transitions like choices or all 1679 */ 1680 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1681 t2->to, t2->atom); 1682 if (ret == 0) 1683 return(0); 1684 } 1685 } 1686 if (ret == 0) 1687 break; 1688 } 1689 if (ret == 0) 1690 break; 1691 } 1692 ctxt->determinist = ret; 1693 return(ret); 1694} 1695 1696/************************************************************************ 1697 * * 1698 * Routines to check input against transition atoms * 1699 * * 1700 ************************************************************************/ 1701 1702static int 1703xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 1704 int start, int end, const xmlChar *blockName) { 1705 int ret = 0; 1706 1707 switch (type) { 1708 case XML_REGEXP_STRING: 1709 case XML_REGEXP_SUBREG: 1710 case XML_REGEXP_RANGES: 1711 case XML_REGEXP_EPSILON: 1712 return(-1); 1713 case XML_REGEXP_ANYCHAR: 1714 ret = ((codepoint != '\n') && (codepoint != '\r')); 1715 break; 1716 case XML_REGEXP_CHARVAL: 1717 ret = ((codepoint >= start) && (codepoint <= end)); 1718 break; 1719 case XML_REGEXP_NOTSPACE: 1720 neg = !neg; 1721 case XML_REGEXP_ANYSPACE: 1722 ret = ((codepoint == '\n') || (codepoint == '\r') || 1723 (codepoint == '\t') || (codepoint == ' ')); 1724 break; 1725 case XML_REGEXP_NOTINITNAME: 1726 neg = !neg; 1727 case XML_REGEXP_INITNAME: 1728 ret = (xmlIsLetter(codepoint) || 1729 (codepoint == '_') || (codepoint == ':')); 1730 break; 1731 case XML_REGEXP_NOTNAMECHAR: 1732 neg = !neg; 1733 case XML_REGEXP_NAMECHAR: 1734 ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) || 1735 (codepoint == '.') || (codepoint == '-') || 1736 (codepoint == '_') || (codepoint == ':') || 1737 xmlIsCombining(codepoint) || xmlIsExtender(codepoint)); 1738 break; 1739 case XML_REGEXP_NOTDECIMAL: 1740 neg = !neg; 1741 case XML_REGEXP_DECIMAL: 1742 ret = xmlUCSIsCatNd(codepoint); 1743 break; 1744 case XML_REGEXP_REALCHAR: 1745 neg = !neg; 1746 case XML_REGEXP_NOTREALCHAR: 1747 ret = xmlUCSIsCatP(codepoint); 1748 if (ret == 0) 1749 ret = xmlUCSIsCatZ(codepoint); 1750 if (ret == 0) 1751 ret = xmlUCSIsCatC(codepoint); 1752 break; 1753 case XML_REGEXP_LETTER: 1754 ret = xmlUCSIsCatL(codepoint); 1755 break; 1756 case XML_REGEXP_LETTER_UPPERCASE: 1757 ret = xmlUCSIsCatLu(codepoint); 1758 break; 1759 case XML_REGEXP_LETTER_LOWERCASE: 1760 ret = xmlUCSIsCatLl(codepoint); 1761 break; 1762 case XML_REGEXP_LETTER_TITLECASE: 1763 ret = xmlUCSIsCatLt(codepoint); 1764 break; 1765 case XML_REGEXP_LETTER_MODIFIER: 1766 ret = xmlUCSIsCatLm(codepoint); 1767 break; 1768 case XML_REGEXP_LETTER_OTHERS: 1769 ret = xmlUCSIsCatLo(codepoint); 1770 break; 1771 case XML_REGEXP_MARK: 1772 ret = xmlUCSIsCatM(codepoint); 1773 break; 1774 case XML_REGEXP_MARK_NONSPACING: 1775 ret = xmlUCSIsCatMn(codepoint); 1776 break; 1777 case XML_REGEXP_MARK_SPACECOMBINING: 1778 ret = xmlUCSIsCatMc(codepoint); 1779 break; 1780 case XML_REGEXP_MARK_ENCLOSING: 1781 ret = xmlUCSIsCatMe(codepoint); 1782 break; 1783 case XML_REGEXP_NUMBER: 1784 ret = xmlUCSIsCatN(codepoint); 1785 break; 1786 case XML_REGEXP_NUMBER_DECIMAL: 1787 ret = xmlUCSIsCatNd(codepoint); 1788 break; 1789 case XML_REGEXP_NUMBER_LETTER: 1790 ret = xmlUCSIsCatNl(codepoint); 1791 break; 1792 case XML_REGEXP_NUMBER_OTHERS: 1793 ret = xmlUCSIsCatNo(codepoint); 1794 break; 1795 case XML_REGEXP_PUNCT: 1796 ret = xmlUCSIsCatP(codepoint); 1797 break; 1798 case XML_REGEXP_PUNCT_CONNECTOR: 1799 ret = xmlUCSIsCatPc(codepoint); 1800 break; 1801 case XML_REGEXP_PUNCT_DASH: 1802 ret = xmlUCSIsCatPd(codepoint); 1803 break; 1804 case XML_REGEXP_PUNCT_OPEN: 1805 ret = xmlUCSIsCatPs(codepoint); 1806 break; 1807 case XML_REGEXP_PUNCT_CLOSE: 1808 ret = xmlUCSIsCatPe(codepoint); 1809 break; 1810 case XML_REGEXP_PUNCT_INITQUOTE: 1811 ret = xmlUCSIsCatPi(codepoint); 1812 break; 1813 case XML_REGEXP_PUNCT_FINQUOTE: 1814 ret = xmlUCSIsCatPf(codepoint); 1815 break; 1816 case XML_REGEXP_PUNCT_OTHERS: 1817 ret = xmlUCSIsCatPo(codepoint); 1818 break; 1819 case XML_REGEXP_SEPAR: 1820 ret = xmlUCSIsCatZ(codepoint); 1821 break; 1822 case XML_REGEXP_SEPAR_SPACE: 1823 ret = xmlUCSIsCatZs(codepoint); 1824 break; 1825 case XML_REGEXP_SEPAR_LINE: 1826 ret = xmlUCSIsCatZl(codepoint); 1827 break; 1828 case XML_REGEXP_SEPAR_PARA: 1829 ret = xmlUCSIsCatZp(codepoint); 1830 break; 1831 case XML_REGEXP_SYMBOL: 1832 ret = xmlUCSIsCatS(codepoint); 1833 break; 1834 case XML_REGEXP_SYMBOL_MATH: 1835 ret = xmlUCSIsCatSm(codepoint); 1836 break; 1837 case XML_REGEXP_SYMBOL_CURRENCY: 1838 ret = xmlUCSIsCatSc(codepoint); 1839 break; 1840 case XML_REGEXP_SYMBOL_MODIFIER: 1841 ret = xmlUCSIsCatSk(codepoint); 1842 break; 1843 case XML_REGEXP_SYMBOL_OTHERS: 1844 ret = xmlUCSIsCatSo(codepoint); 1845 break; 1846 case XML_REGEXP_OTHER: 1847 ret = xmlUCSIsCatC(codepoint); 1848 break; 1849 case XML_REGEXP_OTHER_CONTROL: 1850 ret = xmlUCSIsCatCc(codepoint); 1851 break; 1852 case XML_REGEXP_OTHER_FORMAT: 1853 ret = xmlUCSIsCatCf(codepoint); 1854 break; 1855 case XML_REGEXP_OTHER_PRIVATE: 1856 ret = xmlUCSIsCatCo(codepoint); 1857 break; 1858 case XML_REGEXP_OTHER_NA: 1859 /* ret = xmlUCSIsCatCn(codepoint); */ 1860 /* Seems it doesn't exist anymore in recent Unicode releases */ 1861 ret = 0; 1862 break; 1863 case XML_REGEXP_BLOCK_NAME: 1864 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 1865 break; 1866 } 1867 if (neg) 1868 return(!ret); 1869 return(ret); 1870} 1871 1872static int 1873xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 1874 int i, ret = 0; 1875 xmlRegRangePtr range; 1876 1877 if ((atom == NULL) || (!xmlIsChar(codepoint))) 1878 return(-1); 1879 1880 switch (atom->type) { 1881 case XML_REGEXP_SUBREG: 1882 case XML_REGEXP_EPSILON: 1883 return(-1); 1884 case XML_REGEXP_CHARVAL: 1885 return(codepoint == atom->codepoint); 1886 case XML_REGEXP_RANGES: { 1887 int accept = 0; 1888 for (i = 0;i < atom->nbRanges;i++) { 1889 range = atom->ranges[i]; 1890 if (range->neg) { 1891 ret = xmlRegCheckCharacterRange(range->type, codepoint, 1892 0, range->start, range->end, 1893 range->blockName); 1894 if (ret != 0) 1895 return(0); /* excluded char */ 1896 } else { 1897 ret = xmlRegCheckCharacterRange(range->type, codepoint, 1898 0, range->start, range->end, 1899 range->blockName); 1900 if (ret != 0) 1901 accept = 1; /* might still be excluded */ 1902 } 1903 } 1904 return(accept); 1905 } 1906 case XML_REGEXP_STRING: 1907 printf("TODO: XML_REGEXP_STRING\n"); 1908 return(-1); 1909 case XML_REGEXP_ANYCHAR: 1910 case XML_REGEXP_ANYSPACE: 1911 case XML_REGEXP_NOTSPACE: 1912 case XML_REGEXP_INITNAME: 1913 case XML_REGEXP_NOTINITNAME: 1914 case XML_REGEXP_NAMECHAR: 1915 case XML_REGEXP_NOTNAMECHAR: 1916 case XML_REGEXP_DECIMAL: 1917 case XML_REGEXP_NOTDECIMAL: 1918 case XML_REGEXP_REALCHAR: 1919 case XML_REGEXP_NOTREALCHAR: 1920 case XML_REGEXP_LETTER: 1921 case XML_REGEXP_LETTER_UPPERCASE: 1922 case XML_REGEXP_LETTER_LOWERCASE: 1923 case XML_REGEXP_LETTER_TITLECASE: 1924 case XML_REGEXP_LETTER_MODIFIER: 1925 case XML_REGEXP_LETTER_OTHERS: 1926 case XML_REGEXP_MARK: 1927 case XML_REGEXP_MARK_NONSPACING: 1928 case XML_REGEXP_MARK_SPACECOMBINING: 1929 case XML_REGEXP_MARK_ENCLOSING: 1930 case XML_REGEXP_NUMBER: 1931 case XML_REGEXP_NUMBER_DECIMAL: 1932 case XML_REGEXP_NUMBER_LETTER: 1933 case XML_REGEXP_NUMBER_OTHERS: 1934 case XML_REGEXP_PUNCT: 1935 case XML_REGEXP_PUNCT_CONNECTOR: 1936 case XML_REGEXP_PUNCT_DASH: 1937 case XML_REGEXP_PUNCT_OPEN: 1938 case XML_REGEXP_PUNCT_CLOSE: 1939 case XML_REGEXP_PUNCT_INITQUOTE: 1940 case XML_REGEXP_PUNCT_FINQUOTE: 1941 case XML_REGEXP_PUNCT_OTHERS: 1942 case XML_REGEXP_SEPAR: 1943 case XML_REGEXP_SEPAR_SPACE: 1944 case XML_REGEXP_SEPAR_LINE: 1945 case XML_REGEXP_SEPAR_PARA: 1946 case XML_REGEXP_SYMBOL: 1947 case XML_REGEXP_SYMBOL_MATH: 1948 case XML_REGEXP_SYMBOL_CURRENCY: 1949 case XML_REGEXP_SYMBOL_MODIFIER: 1950 case XML_REGEXP_SYMBOL_OTHERS: 1951 case XML_REGEXP_OTHER: 1952 case XML_REGEXP_OTHER_CONTROL: 1953 case XML_REGEXP_OTHER_FORMAT: 1954 case XML_REGEXP_OTHER_PRIVATE: 1955 case XML_REGEXP_OTHER_NA: 1956 case XML_REGEXP_BLOCK_NAME: 1957 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 1958 (const xmlChar *)atom->valuep); 1959 if (atom->neg) 1960 ret = !ret; 1961 break; 1962 } 1963 return(ret); 1964} 1965 1966/************************************************************************ 1967 * * 1968 * Saving an restoring state of an execution context * 1969 * * 1970 ************************************************************************/ 1971 1972#ifdef DEBUG_REGEXP_EXEC 1973static void 1974xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 1975 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 1976 if (exec->inputStack != NULL) { 1977 int i; 1978 printf(": "); 1979 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 1980 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 1981 } else { 1982 printf(": %s", &(exec->inputString[exec->index])); 1983 } 1984 printf("\n"); 1985} 1986#endif 1987 1988static void 1989xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 1990#ifdef DEBUG_REGEXP_EXEC 1991 printf("saving "); 1992 exec->transno++; 1993 xmlFARegDebugExec(exec); 1994 exec->transno--; 1995#endif 1996 1997 if (exec->maxRollbacks == 0) { 1998 exec->maxRollbacks = 4; 1999 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 2000 sizeof(xmlRegExecRollback)); 2001 if (exec->rollbacks == NULL) { 2002 fprintf(stderr, "exec save: allocation failed"); 2003 exec->maxRollbacks = 0; 2004 return; 2005 } 2006 memset(exec->rollbacks, 0, 2007 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2008 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 2009 xmlRegExecRollback *tmp; 2010 int len = exec->maxRollbacks; 2011 2012 exec->maxRollbacks *= 2; 2013 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 2014 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2015 if (tmp == NULL) { 2016 fprintf(stderr, "exec save: allocation failed"); 2017 exec->maxRollbacks /= 2; 2018 return; 2019 } 2020 exec->rollbacks = tmp; 2021 tmp = &exec->rollbacks[len]; 2022 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2023 } 2024 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2025 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2026 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2027 if (exec->comp->nbCounters > 0) { 2028 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2029 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2030 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2031 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2032 fprintf(stderr, "exec save: allocation failed"); 2033 exec->status = -5; 2034 return; 2035 } 2036 } 2037 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2038 exec->comp->nbCounters * sizeof(int)); 2039 } 2040 exec->nbRollbacks++; 2041} 2042 2043static void 2044xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2045 if (exec->nbRollbacks <= 0) { 2046 exec->status = -1; 2047#ifdef DEBUG_REGEXP_EXEC 2048 printf("rollback failed on empty stack\n"); 2049#endif 2050 return; 2051 } 2052 exec->nbRollbacks--; 2053 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2054 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2055 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2056 if (exec->comp->nbCounters > 0) { 2057 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2058 fprintf(stderr, "exec save: allocation failed"); 2059 exec->status = -6; 2060 return; 2061 } 2062 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2063 exec->comp->nbCounters * sizeof(int)); 2064 } 2065 2066#ifdef DEBUG_REGEXP_EXEC 2067 printf("restored "); 2068 xmlFARegDebugExec(exec); 2069#endif 2070} 2071 2072/************************************************************************ 2073 * * 2074 * Verifyer, running an input against a compiled regexp * 2075 * * 2076 ************************************************************************/ 2077 2078static int 2079xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2080 xmlRegExecCtxt execval; 2081 xmlRegExecCtxtPtr exec = &execval; 2082 int ret, codepoint, len; 2083 2084 exec->inputString = content; 2085 exec->index = 0; 2086 exec->determinist = 1; 2087 exec->maxRollbacks = 0; 2088 exec->nbRollbacks = 0; 2089 exec->rollbacks = NULL; 2090 exec->status = 0; 2091 exec->comp = comp; 2092 exec->state = comp->states[0]; 2093 exec->transno = 0; 2094 exec->transcount = 0; 2095 if (comp->nbCounters > 0) { 2096 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2097 if (exec->counts == NULL) 2098 return(-1); 2099 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2100 } else 2101 exec->counts = NULL; 2102 while ((exec->status == 0) && 2103 ((exec->inputString[exec->index] != 0) || 2104 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2105 xmlRegTransPtr trans; 2106 xmlRegAtomPtr atom; 2107 2108 /* 2109 * End of input on non-terminal state, rollback, however we may 2110 * still have epsilon like transition for counted transitions 2111 * on counters, in that case don't break too early. 2112 */ 2113 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2114 goto rollback; 2115 2116 exec->transcount = 0; 2117 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2118 trans = &exec->state->trans[exec->transno]; 2119 if (trans->to < 0) 2120 continue; 2121 atom = trans->atom; 2122 ret = 0; 2123 if (trans->count >= 0) { 2124 int count; 2125 xmlRegCounterPtr counter; 2126 2127 /* 2128 * A counted transition. 2129 */ 2130 2131 count = exec->counts[trans->count]; 2132 counter = &exec->comp->counters[trans->count]; 2133#ifdef DEBUG_REGEXP_EXEC 2134 printf("testing count %d: val %d, min %d, max %d\n", 2135 trans->count, count, counter->min, counter->max); 2136#endif 2137 ret = ((count >= counter->min) && (count <= counter->max)); 2138 } else if (atom == NULL) { 2139 fprintf(stderr, "epsilon transition left at runtime\n"); 2140 exec->status = -2; 2141 break; 2142 } else if (exec->inputString[exec->index] != 0) { 2143 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2144 ret = xmlRegCheckCharacter(atom, codepoint); 2145 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2146 xmlRegStatePtr to = comp->states[trans->to]; 2147 2148 /* 2149 * this is a multiple input sequence 2150 */ 2151 if (exec->state->nbTrans > exec->transno + 1) { 2152 xmlFARegExecSave(exec); 2153 } 2154 exec->transcount = 1; 2155 do { 2156 /* 2157 * Try to progress as much as possible on the input 2158 */ 2159 if (exec->transcount == atom->max) { 2160 break; 2161 } 2162 exec->index += len; 2163 /* 2164 * End of input: stop here 2165 */ 2166 if (exec->inputString[exec->index] == 0) { 2167 exec->index -= len; 2168 break; 2169 } 2170 if (exec->transcount >= atom->min) { 2171 int transno = exec->transno; 2172 xmlRegStatePtr state = exec->state; 2173 2174 /* 2175 * The transition is acceptable save it 2176 */ 2177 exec->transno = -1; /* trick */ 2178 exec->state = to; 2179 xmlFARegExecSave(exec); 2180 exec->transno = transno; 2181 exec->state = state; 2182 } 2183 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2184 len); 2185 ret = xmlRegCheckCharacter(atom, codepoint); 2186 exec->transcount++; 2187 } while (ret == 1); 2188 if (exec->transcount < atom->min) 2189 ret = 0; 2190 2191 /* 2192 * If the last check failed but one transition was found 2193 * possible, rollback 2194 */ 2195 if (ret < 0) 2196 ret = 0; 2197 if (ret == 0) { 2198 goto rollback; 2199 } 2200 } 2201 } 2202 if (ret == 1) { 2203 if (exec->state->nbTrans > exec->transno + 1) { 2204 xmlFARegExecSave(exec); 2205 } 2206 if (trans->counter >= 0) { 2207#ifdef DEBUG_REGEXP_EXEC 2208 printf("Increasing count %d\n", trans->counter); 2209#endif 2210 exec->counts[trans->counter]++; 2211 } 2212#ifdef DEBUG_REGEXP_EXEC 2213 printf("entering state %d\n", trans->to); 2214#endif 2215 exec->state = comp->states[trans->to]; 2216 exec->transno = 0; 2217 if (trans->atom != NULL) { 2218 exec->index += len; 2219 } 2220 goto progress; 2221 } else if (ret < 0) { 2222 exec->status = -4; 2223 break; 2224 } 2225 } 2226 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2227rollback: 2228 /* 2229 * Failed to find a way out 2230 */ 2231 exec->determinist = 0; 2232 xmlFARegExecRollBack(exec); 2233 } 2234progress: 2235 continue; 2236 } 2237 if (exec->rollbacks != NULL) { 2238 if (exec->counts != NULL) { 2239 int i; 2240 2241 for (i = 0;i < exec->maxRollbacks;i++) 2242 if (exec->rollbacks[i].counts != NULL) 2243 xmlFree(exec->rollbacks[i].counts); 2244 } 2245 xmlFree(exec->rollbacks); 2246 } 2247 if (exec->counts != NULL) 2248 xmlFree(exec->counts); 2249 if (exec->status == 0) 2250 return(1); 2251 if (exec->status == -1) 2252 return(0); 2253 return(exec->status); 2254} 2255 2256/************************************************************************ 2257 * * 2258 * Progressive interface to the verifyer one atom at a time * 2259 * * 2260 ************************************************************************/ 2261 2262/** 2263 * xmlRegNewExecCtxt: 2264 * @comp: a precompiled regular expression 2265 * @callback: a callback function used for handling progresses in the 2266 * automata matching phase 2267 * @data: the context data associated to the callback in this context 2268 * 2269 * Build a context used for progressive evaluation of a regexp. 2270 * 2271 * Returns the new context 2272 */ 2273xmlRegExecCtxtPtr 2274xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 2275 xmlRegExecCtxtPtr exec; 2276 2277 if (comp == NULL) 2278 return(NULL); 2279 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 2280 if (exec == NULL) { 2281 return(NULL); 2282 } 2283 memset(exec, 0, sizeof(xmlRegExecCtxt)); 2284 exec->inputString = NULL; 2285 exec->index = 0; 2286 exec->determinist = 1; 2287 exec->maxRollbacks = 0; 2288 exec->nbRollbacks = 0; 2289 exec->rollbacks = NULL; 2290 exec->status = 0; 2291 exec->comp = comp; 2292 if (comp->compact == NULL) 2293 exec->state = comp->states[0]; 2294 exec->transno = 0; 2295 exec->transcount = 0; 2296 exec->callback = callback; 2297 exec->data = data; 2298 if (comp->nbCounters > 0) { 2299 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2300 if (exec->counts == NULL) { 2301 xmlFree(exec); 2302 return(NULL); 2303 } 2304 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2305 } else 2306 exec->counts = NULL; 2307 exec->inputStackMax = 0; 2308 exec->inputStackNr = 0; 2309 exec->inputStack = NULL; 2310 return(exec); 2311} 2312 2313/** 2314 * xmlRegFreeExecCtxt: 2315 * @exec: a regular expression evaulation context 2316 * 2317 * Free the structures associated to a regular expression evaulation context. 2318 */ 2319void 2320xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 2321 if (exec == NULL) 2322 return; 2323 2324 if (exec->rollbacks != NULL) { 2325 if (exec->counts != NULL) { 2326 int i; 2327 2328 for (i = 0;i < exec->maxRollbacks;i++) 2329 if (exec->rollbacks[i].counts != NULL) 2330 xmlFree(exec->rollbacks[i].counts); 2331 } 2332 xmlFree(exec->rollbacks); 2333 } 2334 if (exec->counts != NULL) 2335 xmlFree(exec->counts); 2336 if (exec->inputStack != NULL) { 2337 int i; 2338 2339 for (i = 0;i < exec->inputStackNr;i++) { 2340 if (exec->inputStack[i].value != NULL) 2341 xmlFree(exec->inputStack[i].value); 2342 } 2343 xmlFree(exec->inputStack); 2344 } 2345 xmlFree(exec); 2346} 2347 2348static void 2349xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2350 void *data) { 2351#ifdef DEBUG_PUSH 2352 printf("saving value: %d:%s\n", exec->inputStackNr, value); 2353#endif 2354 if (exec->inputStackMax == 0) { 2355 exec->inputStackMax = 4; 2356 exec->inputStack = (xmlRegInputTokenPtr) 2357 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 2358 if (exec->inputStack == NULL) { 2359 fprintf(stderr, "push input: allocation failed"); 2360 exec->inputStackMax = 0; 2361 return; 2362 } 2363 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 2364 xmlRegInputTokenPtr tmp; 2365 2366 exec->inputStackMax *= 2; 2367 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 2368 exec->inputStackMax * sizeof(xmlRegInputToken)); 2369 if (tmp == NULL) { 2370 fprintf(stderr, "push input: allocation failed"); 2371 exec->inputStackMax /= 2; 2372 return; 2373 } 2374 exec->inputStack = tmp; 2375 } 2376 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 2377 exec->inputStack[exec->inputStackNr].data = data; 2378 exec->inputStackNr++; 2379 exec->inputStack[exec->inputStackNr].value = NULL; 2380 exec->inputStack[exec->inputStackNr].data = NULL; 2381} 2382 2383 2384/** 2385 * xmlRegCompactPushString: 2386 * @exec: a regexp execution context 2387 * @comp: the precompiled exec with a compact table 2388 * @value: a string token input 2389 * @data: data associated to the token to reuse in callbacks 2390 * 2391 * Push one input token in the execution context 2392 * 2393 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2394 * a negative value in case of error. 2395 */ 2396static int 2397xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 2398 xmlRegexpPtr comp, 2399 const xmlChar *value, 2400 void *data) { 2401 int state = exec->index; 2402 int i, target; 2403 2404 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 2405 return(-1); 2406 2407 if (value == NULL) { 2408 /* 2409 * are we at a final state ? 2410 */ 2411 if (comp->compact[state * (comp->nbstrings + 1)] == 2412 XML_REGEXP_FINAL_STATE) 2413 return(1); 2414 return(0); 2415 } 2416 2417#ifdef DEBUG_PUSH 2418 printf("value pushed: %s\n", value); 2419#endif 2420 2421 /* 2422 * Examine all outside transition from current state 2423 */ 2424 for (i = 0;i < comp->nbstrings;i++) { 2425 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 2426 if ((target > 0) && (target <= comp->nbstates)) { 2427 target--; /* to avoid 0 */ 2428 if (xmlStrEqual(comp->stringMap[i], value)) { 2429 exec->index = target; 2430 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 2431 exec->callback(exec->data, value, 2432 comp->transdata[state * comp->nbstrings + i], data); 2433 } 2434#ifdef DEBUG_PUSH 2435 printf("entering state %d\n", target); 2436#endif 2437 if (comp->compact[target * (comp->nbstrings + 1)] == 2438 XML_REGEXP_FINAL_STATE) 2439 return(1); 2440 return(0); 2441 } 2442 } 2443 } 2444 /* 2445 * Failed to find an exit transition out from current state for the 2446 * current token 2447 */ 2448#ifdef DEBUG_PUSH 2449 printf("failed to find a transition for %s on state %d\n", value, state); 2450#endif 2451 exec->status = -1; 2452 return(-1); 2453} 2454 2455/** 2456 * xmlRegExecPushString: 2457 * @exec: a regexp execution context or NULL to indicate the end 2458 * @value: a string token input 2459 * @data: data associated to the token to reuse in callbacks 2460 * 2461 * Push one input token in the execution context 2462 * 2463 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2464 * a negative value in case of error. 2465 */ 2466int 2467xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2468 void *data) { 2469 xmlRegTransPtr trans; 2470 xmlRegAtomPtr atom; 2471 int ret; 2472 int final = 0; 2473 2474 if (exec == NULL) 2475 return(-1); 2476 if (exec->comp == NULL) 2477 return(-1); 2478 if (exec->status != 0) 2479 return(exec->status); 2480 2481 if (exec->comp->compact != NULL) 2482 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 2483 2484 if (value == NULL) { 2485 if (exec->state->type == XML_REGEXP_FINAL_STATE) 2486 return(1); 2487 final = 1; 2488 } 2489 2490#ifdef DEBUG_PUSH 2491 printf("value pushed: %s\n", value); 2492#endif 2493 /* 2494 * If we have an active rollback stack push the new value there 2495 * and get back to where we were left 2496 */ 2497 if ((value != NULL) && (exec->inputStackNr > 0)) { 2498 xmlFARegExecSaveInputString(exec, value, data); 2499 value = exec->inputStack[exec->index].value; 2500 data = exec->inputStack[exec->index].data; 2501#ifdef DEBUG_PUSH 2502 printf("value loaded: %s\n", value); 2503#endif 2504 } 2505 2506 while ((exec->status == 0) && 2507 ((value != NULL) || 2508 ((final == 1) && 2509 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 2510 2511 /* 2512 * End of input on non-terminal state, rollback, however we may 2513 * still have epsilon like transition for counted transitions 2514 * on counters, in that case don't break too early. 2515 */ 2516 if ((value == NULL) && (exec->counts == NULL)) 2517 goto rollback; 2518 2519 exec->transcount = 0; 2520 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2521 trans = &exec->state->trans[exec->transno]; 2522 if (trans->to < 0) 2523 continue; 2524 atom = trans->atom; 2525 ret = 0; 2526 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 2527 int i; 2528 int count; 2529 xmlRegTransPtr t; 2530 xmlRegCounterPtr counter; 2531 2532 ret = 0; 2533 2534#ifdef DEBUG_PUSH 2535 printf("testing all lax %d\n", trans->count); 2536#endif 2537 /* 2538 * Check all counted transitions from the current state 2539 */ 2540 if ((value == NULL) && (final)) { 2541 ret = 1; 2542 } else if (value != NULL) { 2543 for (i = 0;i < exec->state->nbTrans;i++) { 2544 t = &exec->state->trans[i]; 2545 if ((t->counter < 0) || (t == trans)) 2546 continue; 2547 counter = &exec->comp->counters[t->counter]; 2548 count = exec->counts[t->counter]; 2549 if ((count < counter->max) && 2550 (t->atom != NULL) && 2551 (xmlStrEqual(value, t->atom->valuep))) { 2552 ret = 0; 2553 break; 2554 } 2555 if ((count >= counter->min) && 2556 (count < counter->max) && 2557 (xmlStrEqual(value, t->atom->valuep))) { 2558 ret = 1; 2559 break; 2560 } 2561 } 2562 } 2563 } else if (trans->count == REGEXP_ALL_COUNTER) { 2564 int i; 2565 int count; 2566 xmlRegTransPtr t; 2567 xmlRegCounterPtr counter; 2568 2569 ret = 1; 2570 2571#ifdef DEBUG_PUSH 2572 printf("testing all %d\n", trans->count); 2573#endif 2574 /* 2575 * Check all counted transitions from the current state 2576 */ 2577 for (i = 0;i < exec->state->nbTrans;i++) { 2578 t = &exec->state->trans[i]; 2579 if ((t->counter < 0) || (t == trans)) 2580 continue; 2581 counter = &exec->comp->counters[t->counter]; 2582 count = exec->counts[t->counter]; 2583 if ((count < counter->min) || (count > counter->max)) { 2584 ret = 0; 2585 break; 2586 } 2587 } 2588 } else if (trans->count >= 0) { 2589 int count; 2590 xmlRegCounterPtr counter; 2591 2592 /* 2593 * A counted transition. 2594 */ 2595 2596 count = exec->counts[trans->count]; 2597 counter = &exec->comp->counters[trans->count]; 2598#ifdef DEBUG_PUSH 2599 printf("testing count %d: val %d, min %d, max %d\n", 2600 trans->count, count, counter->min, counter->max); 2601#endif 2602 ret = ((count >= counter->min) && (count <= counter->max)); 2603 } else if (atom == NULL) { 2604 fprintf(stderr, "epsilon transition left at runtime\n"); 2605 exec->status = -2; 2606 break; 2607 } else if (value != NULL) { 2608 ret = xmlStrEqual(value, atom->valuep); 2609 if ((ret == 1) && (trans->counter >= 0)) { 2610 xmlRegCounterPtr counter; 2611 int count; 2612 2613 count = exec->counts[trans->counter]; 2614 counter = &exec->comp->counters[trans->counter]; 2615 if (count >= counter->max) 2616 ret = 0; 2617 } 2618 2619 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2620 xmlRegStatePtr to = exec->comp->states[trans->to]; 2621 2622 /* 2623 * this is a multiple input sequence 2624 */ 2625 if (exec->state->nbTrans > exec->transno + 1) { 2626 if (exec->inputStackNr <= 0) { 2627 xmlFARegExecSaveInputString(exec, value, data); 2628 } 2629 xmlFARegExecSave(exec); 2630 } 2631 exec->transcount = 1; 2632 do { 2633 /* 2634 * Try to progress as much as possible on the input 2635 */ 2636 if (exec->transcount == atom->max) { 2637 break; 2638 } 2639 exec->index++; 2640 value = exec->inputStack[exec->index].value; 2641 data = exec->inputStack[exec->index].data; 2642#ifdef DEBUG_PUSH 2643 printf("value loaded: %s\n", value); 2644#endif 2645 2646 /* 2647 * End of input: stop here 2648 */ 2649 if (value == NULL) { 2650 exec->index --; 2651 break; 2652 } 2653 if (exec->transcount >= atom->min) { 2654 int transno = exec->transno; 2655 xmlRegStatePtr state = exec->state; 2656 2657 /* 2658 * The transition is acceptable save it 2659 */ 2660 exec->transno = -1; /* trick */ 2661 exec->state = to; 2662 if (exec->inputStackNr <= 0) { 2663 xmlFARegExecSaveInputString(exec, value, data); 2664 } 2665 xmlFARegExecSave(exec); 2666 exec->transno = transno; 2667 exec->state = state; 2668 } 2669 ret = xmlStrEqual(value, atom->valuep); 2670 exec->transcount++; 2671 } while (ret == 1); 2672 if (exec->transcount < atom->min) 2673 ret = 0; 2674 2675 /* 2676 * If the last check failed but one transition was found 2677 * possible, rollback 2678 */ 2679 if (ret < 0) 2680 ret = 0; 2681 if (ret == 0) { 2682 goto rollback; 2683 } 2684 } 2685 } 2686 if (ret == 1) { 2687 if ((exec->callback != NULL) && (atom != NULL)) { 2688 exec->callback(exec->data, atom->valuep, 2689 atom->data, data); 2690 } 2691 if (exec->state->nbTrans > exec->transno + 1) { 2692 if (exec->inputStackNr <= 0) { 2693 xmlFARegExecSaveInputString(exec, value, data); 2694 } 2695 xmlFARegExecSave(exec); 2696 } 2697 if (trans->counter >= 0) { 2698#ifdef DEBUG_PUSH 2699 printf("Increasing count %d\n", trans->counter); 2700#endif 2701 exec->counts[trans->counter]++; 2702 } 2703#ifdef DEBUG_PUSH 2704 printf("entering state %d\n", trans->to); 2705#endif 2706 exec->state = exec->comp->states[trans->to]; 2707 exec->transno = 0; 2708 if (trans->atom != NULL) { 2709 if (exec->inputStack != NULL) { 2710 exec->index++; 2711 if (exec->index < exec->inputStackNr) { 2712 value = exec->inputStack[exec->index].value; 2713 data = exec->inputStack[exec->index].data; 2714#ifdef DEBUG_PUSH 2715 printf("value loaded: %s\n", value); 2716#endif 2717 } else { 2718 value = NULL; 2719 data = NULL; 2720#ifdef DEBUG_PUSH 2721 printf("end of input\n"); 2722#endif 2723 } 2724 } else { 2725 value = NULL; 2726 data = NULL; 2727#ifdef DEBUG_PUSH 2728 printf("end of input\n"); 2729#endif 2730 } 2731 } 2732 goto progress; 2733 } else if (ret < 0) { 2734 exec->status = -4; 2735 break; 2736 } 2737 } 2738 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2739rollback: 2740 /* 2741 * Failed to find a way out 2742 */ 2743 exec->determinist = 0; 2744 xmlFARegExecRollBack(exec); 2745 if (exec->status == 0) { 2746 value = exec->inputStack[exec->index].value; 2747 data = exec->inputStack[exec->index].data; 2748#ifdef DEBUG_PUSH 2749 printf("value loaded: %s\n", value); 2750#endif 2751 } 2752 } 2753progress: 2754 continue; 2755 } 2756 if (exec->status == 0) { 2757 return(exec->state->type == XML_REGEXP_FINAL_STATE); 2758 } 2759 return(exec->status); 2760} 2761 2762/** 2763 * xmlRegExecPushString2: 2764 * @exec: a regexp execution context or NULL to indicate the end 2765 * @value: the first string token input 2766 * @value2: the second string token input 2767 * @data: data associated to the token to reuse in callbacks 2768 * 2769 * Push one input token in the execution context 2770 * 2771 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2772 * a negative value in case of error. 2773 */ 2774int 2775xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 2776 const xmlChar *value2, void *data) { 2777 xmlChar buf[150]; 2778 int lenn, lenp, ret; 2779 xmlChar *str; 2780 2781 if (exec == NULL) 2782 return(-1); 2783 if (exec->comp == NULL) 2784 return(-1); 2785 if (exec->status != 0) 2786 return(exec->status); 2787 2788 if (value2 == NULL) 2789 return(xmlRegExecPushString(exec, value, data)); 2790 2791 lenn = strlen((char *) value2); 2792 lenp = strlen((char *) value); 2793 2794 if (150 < lenn + lenp + 2) { 2795 str = (xmlChar *) xmlMalloc(lenn + lenp + 2); 2796 if (str == NULL) { 2797 exec->status = -1; 2798 return(-1); 2799 } 2800 } else { 2801 str = buf; 2802 } 2803 memcpy(&str[0], value, lenp); 2804 str[lenp] = '|'; 2805 memcpy(&str[lenp + 1], value2, lenn); 2806 str[lenn + lenp + 1] = 0; 2807 2808 if (exec->comp->compact != NULL) 2809 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 2810 else 2811 ret = xmlRegExecPushString(exec, str, data); 2812 2813 if (str != buf) 2814 xmlFree(buf); 2815 return(ret); 2816} 2817 2818#if 0 2819static int 2820xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 2821 xmlRegTransPtr trans; 2822 xmlRegAtomPtr atom; 2823 int ret; 2824 int codepoint, len; 2825 2826 if (exec == NULL) 2827 return(-1); 2828 if (exec->status != 0) 2829 return(exec->status); 2830 2831 while ((exec->status == 0) && 2832 ((exec->inputString[exec->index] != 0) || 2833 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2834 2835 /* 2836 * End of input on non-terminal state, rollback, however we may 2837 * still have epsilon like transition for counted transitions 2838 * on counters, in that case don't break too early. 2839 */ 2840 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2841 goto rollback; 2842 2843 exec->transcount = 0; 2844 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2845 trans = &exec->state->trans[exec->transno]; 2846 if (trans->to < 0) 2847 continue; 2848 atom = trans->atom; 2849 ret = 0; 2850 if (trans->count >= 0) { 2851 int count; 2852 xmlRegCounterPtr counter; 2853 2854 /* 2855 * A counted transition. 2856 */ 2857 2858 count = exec->counts[trans->count]; 2859 counter = &exec->comp->counters[trans->count]; 2860#ifdef DEBUG_REGEXP_EXEC 2861 printf("testing count %d: val %d, min %d, max %d\n", 2862 trans->count, count, counter->min, counter->max); 2863#endif 2864 ret = ((count >= counter->min) && (count <= counter->max)); 2865 } else if (atom == NULL) { 2866 fprintf(stderr, "epsilon transition left at runtime\n"); 2867 exec->status = -2; 2868 break; 2869 } else if (exec->inputString[exec->index] != 0) { 2870 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2871 ret = xmlRegCheckCharacter(atom, codepoint); 2872 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2873 xmlRegStatePtr to = exec->comp->states[trans->to]; 2874 2875 /* 2876 * this is a multiple input sequence 2877 */ 2878 if (exec->state->nbTrans > exec->transno + 1) { 2879 xmlFARegExecSave(exec); 2880 } 2881 exec->transcount = 1; 2882 do { 2883 /* 2884 * Try to progress as much as possible on the input 2885 */ 2886 if (exec->transcount == atom->max) { 2887 break; 2888 } 2889 exec->index += len; 2890 /* 2891 * End of input: stop here 2892 */ 2893 if (exec->inputString[exec->index] == 0) { 2894 exec->index -= len; 2895 break; 2896 } 2897 if (exec->transcount >= atom->min) { 2898 int transno = exec->transno; 2899 xmlRegStatePtr state = exec->state; 2900 2901 /* 2902 * The transition is acceptable save it 2903 */ 2904 exec->transno = -1; /* trick */ 2905 exec->state = to; 2906 xmlFARegExecSave(exec); 2907 exec->transno = transno; 2908 exec->state = state; 2909 } 2910 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2911 len); 2912 ret = xmlRegCheckCharacter(atom, codepoint); 2913 exec->transcount++; 2914 } while (ret == 1); 2915 if (exec->transcount < atom->min) 2916 ret = 0; 2917 2918 /* 2919 * If the last check failed but one transition was found 2920 * possible, rollback 2921 */ 2922 if (ret < 0) 2923 ret = 0; 2924 if (ret == 0) { 2925 goto rollback; 2926 } 2927 } 2928 } 2929 if (ret == 1) { 2930 if (exec->state->nbTrans > exec->transno + 1) { 2931 xmlFARegExecSave(exec); 2932 } 2933 if (trans->counter >= 0) { 2934#ifdef DEBUG_REGEXP_EXEC 2935 printf("Increasing count %d\n", trans->counter); 2936#endif 2937 exec->counts[trans->counter]++; 2938 } 2939#ifdef DEBUG_REGEXP_EXEC 2940 printf("entering state %d\n", trans->to); 2941#endif 2942 exec->state = exec->comp->states[trans->to]; 2943 exec->transno = 0; 2944 if (trans->atom != NULL) { 2945 exec->index += len; 2946 } 2947 goto progress; 2948 } else if (ret < 0) { 2949 exec->status = -4; 2950 break; 2951 } 2952 } 2953 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2954rollback: 2955 /* 2956 * Failed to find a way out 2957 */ 2958 exec->determinist = 0; 2959 xmlFARegExecRollBack(exec); 2960 } 2961progress: 2962 continue; 2963 } 2964} 2965#endif 2966/************************************************************************ 2967 * * 2968 * Parser for the Shemas Datatype Regular Expressions * 2969 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 2970 * * 2971 ************************************************************************/ 2972 2973/** 2974 * xmlFAIsChar: 2975 * @ctxt: a regexp parser context 2976 * 2977 * [10] Char ::= [^.\?*+()|#x5B#x5D] 2978 */ 2979static int 2980xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 2981 int cur; 2982 int len; 2983 2984 cur = CUR_SCHAR(ctxt->cur, len); 2985 if ((cur == '.') || (cur == '\\') || (cur == '?') || 2986 (cur == '*') || (cur == '+') || (cur == '(') || 2987 (cur == ')') || (cur == '|') || (cur == 0x5B) || 2988 (cur == 0x5D) || (cur == 0)) 2989 return(-1); 2990 return(cur); 2991} 2992 2993/** 2994 * xmlFAParseCharProp: 2995 * @ctxt: a regexp parser context 2996 * 2997 * [27] charProp ::= IsCategory | IsBlock 2998 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 2999 * Separators | Symbols | Others 3000 * [29] Letters ::= 'L' [ultmo]? 3001 * [30] Marks ::= 'M' [nce]? 3002 * [31] Numbers ::= 'N' [dlo]? 3003 * [32] Punctuation ::= 'P' [cdseifo]? 3004 * [33] Separators ::= 'Z' [slp]? 3005 * [34] Symbols ::= 'S' [mcko]? 3006 * [35] Others ::= 'C' [cfon]? 3007 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 3008 */ 3009static void 3010xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 3011 int cur; 3012 xmlRegAtomType type = 0; 3013 xmlChar *blockName = NULL; 3014 3015 cur = CUR; 3016 if (cur == 'L') { 3017 NEXT; 3018 cur = CUR; 3019 if (cur == 'u') { 3020 NEXT; 3021 type = XML_REGEXP_LETTER_UPPERCASE; 3022 } else if (cur == 'l') { 3023 NEXT; 3024 type = XML_REGEXP_LETTER_LOWERCASE; 3025 } else if (cur == 't') { 3026 NEXT; 3027 type = XML_REGEXP_LETTER_TITLECASE; 3028 } else if (cur == 'm') { 3029 NEXT; 3030 type = XML_REGEXP_LETTER_MODIFIER; 3031 } else if (cur == 'o') { 3032 NEXT; 3033 type = XML_REGEXP_LETTER_OTHERS; 3034 } else { 3035 type = XML_REGEXP_LETTER; 3036 } 3037 } else if (cur == 'M') { 3038 NEXT; 3039 cur = CUR; 3040 if (cur == 'n') { 3041 NEXT; 3042 /* nonspacing */ 3043 type = XML_REGEXP_MARK_NONSPACING; 3044 } else if (cur == 'c') { 3045 NEXT; 3046 /* spacing combining */ 3047 type = XML_REGEXP_MARK_SPACECOMBINING; 3048 } else if (cur == 'e') { 3049 NEXT; 3050 /* enclosing */ 3051 type = XML_REGEXP_MARK_ENCLOSING; 3052 } else { 3053 /* all marks */ 3054 type = XML_REGEXP_MARK; 3055 } 3056 } else if (cur == 'N') { 3057 NEXT; 3058 cur = CUR; 3059 if (cur == 'd') { 3060 NEXT; 3061 /* digital */ 3062 type = XML_REGEXP_NUMBER_DECIMAL; 3063 } else if (cur == 'l') { 3064 NEXT; 3065 /* letter */ 3066 type = XML_REGEXP_NUMBER_LETTER; 3067 } else if (cur == 'o') { 3068 NEXT; 3069 /* other */ 3070 type = XML_REGEXP_NUMBER_OTHERS; 3071 } else { 3072 /* all numbers */ 3073 type = XML_REGEXP_NUMBER; 3074 } 3075 } else if (cur == 'P') { 3076 NEXT; 3077 cur = CUR; 3078 if (cur == 'c') { 3079 NEXT; 3080 /* connector */ 3081 type = XML_REGEXP_PUNCT_CONNECTOR; 3082 } else if (cur == 'd') { 3083 NEXT; 3084 /* dash */ 3085 type = XML_REGEXP_PUNCT_DASH; 3086 } else if (cur == 's') { 3087 NEXT; 3088 /* open */ 3089 type = XML_REGEXP_PUNCT_OPEN; 3090 } else if (cur == 'e') { 3091 NEXT; 3092 /* close */ 3093 type = XML_REGEXP_PUNCT_CLOSE; 3094 } else if (cur == 'i') { 3095 NEXT; 3096 /* initial quote */ 3097 type = XML_REGEXP_PUNCT_INITQUOTE; 3098 } else if (cur == 'f') { 3099 NEXT; 3100 /* final quote */ 3101 type = XML_REGEXP_PUNCT_FINQUOTE; 3102 } else if (cur == 'o') { 3103 NEXT; 3104 /* other */ 3105 type = XML_REGEXP_PUNCT_OTHERS; 3106 } else { 3107 /* all punctuation */ 3108 type = XML_REGEXP_PUNCT; 3109 } 3110 } else if (cur == 'Z') { 3111 NEXT; 3112 cur = CUR; 3113 if (cur == 's') { 3114 NEXT; 3115 /* space */ 3116 type = XML_REGEXP_SEPAR_SPACE; 3117 } else if (cur == 'l') { 3118 NEXT; 3119 /* line */ 3120 type = XML_REGEXP_SEPAR_LINE; 3121 } else if (cur == 'p') { 3122 NEXT; 3123 /* paragraph */ 3124 type = XML_REGEXP_SEPAR_PARA; 3125 } else { 3126 /* all separators */ 3127 type = XML_REGEXP_SEPAR; 3128 } 3129 } else if (cur == 'S') { 3130 NEXT; 3131 cur = CUR; 3132 if (cur == 'm') { 3133 NEXT; 3134 type = XML_REGEXP_SYMBOL_MATH; 3135 /* math */ 3136 } else if (cur == 'c') { 3137 NEXT; 3138 type = XML_REGEXP_SYMBOL_CURRENCY; 3139 /* currency */ 3140 } else if (cur == 'k') { 3141 NEXT; 3142 type = XML_REGEXP_SYMBOL_MODIFIER; 3143 /* modifiers */ 3144 } else if (cur == 'o') { 3145 NEXT; 3146 type = XML_REGEXP_SYMBOL_OTHERS; 3147 /* other */ 3148 } else { 3149 /* all symbols */ 3150 type = XML_REGEXP_SYMBOL; 3151 } 3152 } else if (cur == 'C') { 3153 NEXT; 3154 cur = CUR; 3155 if (cur == 'c') { 3156 NEXT; 3157 /* control */ 3158 type = XML_REGEXP_OTHER_CONTROL; 3159 } else if (cur == 'f') { 3160 NEXT; 3161 /* format */ 3162 type = XML_REGEXP_OTHER_FORMAT; 3163 } else if (cur == 'o') { 3164 NEXT; 3165 /* private use */ 3166 type = XML_REGEXP_OTHER_PRIVATE; 3167 } else if (cur == 'n') { 3168 NEXT; 3169 /* not assigned */ 3170 type = XML_REGEXP_OTHER_NA; 3171 } else { 3172 /* all others */ 3173 type = XML_REGEXP_OTHER; 3174 } 3175 } else if (cur == 'I') { 3176 const xmlChar *start; 3177 NEXT; 3178 cur = CUR; 3179 if (cur != 's') { 3180 ERROR("IsXXXX expected"); 3181 return; 3182 } 3183 NEXT; 3184 start = ctxt->cur; 3185 cur = CUR; 3186 if (((cur >= 'a') && (cur <= 'z')) || 3187 ((cur >= 'A') && (cur <= 'Z')) || 3188 ((cur >= '0') && (cur <= '9')) || 3189 (cur == 0x2D)) { 3190 NEXT; 3191 cur = CUR; 3192 while (((cur >= 'a') && (cur <= 'z')) || 3193 ((cur >= 'A') && (cur <= 'Z')) || 3194 ((cur >= '0') && (cur <= '9')) || 3195 (cur == 0x2D)) { 3196 NEXT; 3197 cur = CUR; 3198 } 3199 } 3200 type = XML_REGEXP_BLOCK_NAME; 3201 blockName = xmlStrndup(start, ctxt->cur - start); 3202 } else { 3203 ERROR("Unknown char property"); 3204 return; 3205 } 3206 if (ctxt->atom == NULL) { 3207 ctxt->atom = xmlRegNewAtom(ctxt, type); 3208 if (ctxt->atom != NULL) 3209 ctxt->atom->valuep = blockName; 3210 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3211 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3212 type, 0, 0, blockName); 3213 } 3214} 3215 3216/** 3217 * xmlFAParseCharClassEsc: 3218 * @ctxt: a regexp parser context 3219 * 3220 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 3221 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 3222 * [25] catEsc ::= '\p{' charProp '}' 3223 * [26] complEsc ::= '\P{' charProp '}' 3224 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 3225 */ 3226static void 3227xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 3228 int cur; 3229 3230 if (CUR == '.') { 3231 if (ctxt->atom == NULL) { 3232 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 3233 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3234 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3235 XML_REGEXP_ANYCHAR, 0, 0, NULL); 3236 } 3237 NEXT; 3238 return; 3239 } 3240 if (CUR != '\\') { 3241 ERROR("Escaped sequence: expecting \\"); 3242 return; 3243 } 3244 NEXT; 3245 cur = CUR; 3246 if (cur == 'p') { 3247 NEXT; 3248 if (CUR != '{') { 3249 ERROR("Expecting '{'"); 3250 return; 3251 } 3252 NEXT; 3253 xmlFAParseCharProp(ctxt); 3254 if (CUR != '}') { 3255 ERROR("Expecting '}'"); 3256 return; 3257 } 3258 NEXT; 3259 } else if (cur == 'P') { 3260 NEXT; 3261 if (CUR != '{') { 3262 ERROR("Expecting '{'"); 3263 return; 3264 } 3265 NEXT; 3266 xmlFAParseCharProp(ctxt); 3267 ctxt->atom->neg = 1; 3268 if (CUR != '}') { 3269 ERROR("Expecting '}'"); 3270 return; 3271 } 3272 NEXT; 3273 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 3274 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 3275 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 3276 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 3277 (cur == 0x5E)) { 3278 if (ctxt->atom == NULL) { 3279 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3280 if (ctxt->atom != NULL) 3281 ctxt->atom->codepoint = cur; 3282 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3283 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3284 XML_REGEXP_CHARVAL, cur, cur, NULL); 3285 } 3286 NEXT; 3287 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 3288 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 3289 (cur == 'w') || (cur == 'W')) { 3290 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 3291 3292 switch (cur) { 3293 case 's': 3294 type = XML_REGEXP_ANYSPACE; 3295 break; 3296 case 'S': 3297 type = XML_REGEXP_NOTSPACE; 3298 break; 3299 case 'i': 3300 type = XML_REGEXP_INITNAME; 3301 break; 3302 case 'I': 3303 type = XML_REGEXP_NOTINITNAME; 3304 break; 3305 case 'c': 3306 type = XML_REGEXP_NAMECHAR; 3307 break; 3308 case 'C': 3309 type = XML_REGEXP_NOTNAMECHAR; 3310 break; 3311 case 'd': 3312 type = XML_REGEXP_DECIMAL; 3313 break; 3314 case 'D': 3315 type = XML_REGEXP_NOTDECIMAL; 3316 break; 3317 case 'w': 3318 type = XML_REGEXP_REALCHAR; 3319 break; 3320 case 'W': 3321 type = XML_REGEXP_NOTREALCHAR; 3322 break; 3323 } 3324 NEXT; 3325 if (ctxt->atom == NULL) { 3326 ctxt->atom = xmlRegNewAtom(ctxt, type); 3327 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3328 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3329 type, 0, 0, NULL); 3330 } 3331 } 3332} 3333 3334/** 3335 * xmlFAParseCharRef: 3336 * @ctxt: a regexp parser context 3337 * 3338 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 3339 */ 3340static int 3341xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 3342 int ret = 0, cur; 3343 3344 if ((CUR != '&') || (NXT(1) != '#')) 3345 return(-1); 3346 NEXT; 3347 NEXT; 3348 cur = CUR; 3349 if (cur == 'x') { 3350 NEXT; 3351 cur = CUR; 3352 if (((cur >= '0') && (cur <= '9')) || 3353 ((cur >= 'a') && (cur <= 'f')) || 3354 ((cur >= 'A') && (cur <= 'F'))) { 3355 while (((cur >= '0') && (cur <= '9')) || 3356 ((cur >= 'A') && (cur <= 'F'))) { 3357 if ((cur >= '0') && (cur <= '9')) 3358 ret = ret * 16 + cur - '0'; 3359 else if ((cur >= 'a') && (cur <= 'f')) 3360 ret = ret * 16 + 10 + (cur - 'a'); 3361 else 3362 ret = ret * 16 + 10 + (cur - 'A'); 3363 NEXT; 3364 cur = CUR; 3365 } 3366 } else { 3367 ERROR("Char ref: expecting [0-9A-F]"); 3368 return(-1); 3369 } 3370 } else { 3371 if ((cur >= '0') && (cur <= '9')) { 3372 while ((cur >= '0') && (cur <= '9')) { 3373 ret = ret * 10 + cur - '0'; 3374 NEXT; 3375 cur = CUR; 3376 } 3377 } else { 3378 ERROR("Char ref: expecting [0-9]"); 3379 return(-1); 3380 } 3381 } 3382 if (cur != ';') { 3383 ERROR("Char ref: expecting ';'"); 3384 return(-1); 3385 } else { 3386 NEXT; 3387 } 3388 return(ret); 3389} 3390 3391/** 3392 * xmlFAParseCharRange: 3393 * @ctxt: a regexp parser context 3394 * 3395 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 3396 * [18] seRange ::= charOrEsc '-' charOrEsc 3397 * [20] charOrEsc ::= XmlChar | SingleCharEsc 3398 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 3399 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 3400 */ 3401static void 3402xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 3403 int cur; 3404 int start = -1; 3405 int end = -1; 3406 3407 if ((CUR == '&') && (NXT(1) == '#')) { 3408 end = start = xmlFAParseCharRef(ctxt); 3409 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3410 XML_REGEXP_CHARVAL, start, end, NULL); 3411 return; 3412 } 3413 cur = CUR; 3414 if (cur == '\\') { 3415 NEXT; 3416 cur = CUR; 3417 switch (cur) { 3418 case 'n': start = 0xA; break; 3419 case 'r': start = 0xD; break; 3420 case 't': start = 0x9; break; 3421 case '\\': case '|': case '.': case '-': case '^': case '?': 3422 case '*': case '+': case '{': case '}': case '(': case ')': 3423 case '[': case ']': 3424 start = cur; break; 3425 default: 3426 ERROR("Invalid escape value"); 3427 return; 3428 } 3429 end = start; 3430 } else if ((cur != 0x5B) && (cur != 0x5D)) { 3431 end = start = cur; 3432 } else { 3433 ERROR("Expecting a char range"); 3434 return; 3435 } 3436 NEXT; 3437 if (start == '-') { 3438 return; 3439 } 3440 cur = CUR; 3441 if (cur != '-') { 3442 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3443 XML_REGEXP_CHARVAL, start, end, NULL); 3444 return; 3445 } 3446 NEXT; 3447 cur = CUR; 3448 if (cur == '\\') { 3449 NEXT; 3450 cur = CUR; 3451 switch (cur) { 3452 case 'n': end = 0xA; break; 3453 case 'r': end = 0xD; break; 3454 case 't': end = 0x9; break; 3455 case '\\': case '|': case '.': case '-': case '^': case '?': 3456 case '*': case '+': case '{': case '}': case '(': case ')': 3457 case '[': case ']': 3458 end = cur; break; 3459 default: 3460 ERROR("Invalid escape value"); 3461 return; 3462 } 3463 } else if ((cur != 0x5B) && (cur != 0x5D)) { 3464 end = cur; 3465 } else { 3466 ERROR("Expecting the end of a char range"); 3467 return; 3468 } 3469 NEXT; 3470 /* TODO check that the values are acceptable character ranges for XML */ 3471 if (end < start) { 3472 ERROR("End of range is before start of range"); 3473 } else { 3474 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3475 XML_REGEXP_CHARVAL, start, end, NULL); 3476 } 3477 return; 3478} 3479 3480/** 3481 * xmlFAParsePosCharGroup: 3482 * @ctxt: a regexp parser context 3483 * 3484 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 3485 */ 3486static void 3487xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 3488 do { 3489 if ((CUR == '\\') || (CUR == '.')) { 3490 xmlFAParseCharClassEsc(ctxt); 3491 } else { 3492 xmlFAParseCharRange(ctxt); 3493 } 3494 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 3495 (ctxt->error == 0)); 3496} 3497 3498/** 3499 * xmlFAParseCharGroup: 3500 * @ctxt: a regexp parser context 3501 * 3502 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 3503 * [15] negCharGroup ::= '^' posCharGroup 3504 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 3505 * [12] charClassExpr ::= '[' charGroup ']' 3506 */ 3507static void 3508xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 3509 int n = ctxt->neg; 3510 while ((CUR != ']') && (ctxt->error == 0)) { 3511 if (CUR == '^') { 3512 int neg = ctxt->neg; 3513 3514 NEXT; 3515 ctxt->neg = !ctxt->neg; 3516 xmlFAParsePosCharGroup(ctxt); 3517 ctxt->neg = neg; 3518 } else if (CUR == '-') { 3519 NEXT; 3520 ctxt->neg = !ctxt->neg; 3521 if (CUR != '[') { 3522 ERROR("charClassExpr: '[' expected"); 3523 break; 3524 } 3525 NEXT; 3526 xmlFAParseCharGroup(ctxt); 3527 if (CUR == ']') { 3528 NEXT; 3529 } else { 3530 ERROR("charClassExpr: ']' expected"); 3531 break; 3532 } 3533 break; 3534 } else if (CUR != ']') { 3535 xmlFAParsePosCharGroup(ctxt); 3536 } 3537 } 3538 ctxt->neg = n; 3539} 3540 3541/** 3542 * xmlFAParseCharClass: 3543 * @ctxt: a regexp parser context 3544 * 3545 * [11] charClass ::= charClassEsc | charClassExpr 3546 * [12] charClassExpr ::= '[' charGroup ']' 3547 */ 3548static void 3549xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 3550 if (CUR == '[') { 3551 NEXT; 3552 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 3553 if (ctxt->atom == NULL) 3554 return; 3555 xmlFAParseCharGroup(ctxt); 3556 if (CUR == ']') { 3557 NEXT; 3558 } else { 3559 ERROR("xmlFAParseCharClass: ']' expected"); 3560 } 3561 } else { 3562 xmlFAParseCharClassEsc(ctxt); 3563 } 3564} 3565 3566/** 3567 * xmlFAParseQuantExact: 3568 * @ctxt: a regexp parser context 3569 * 3570 * [8] QuantExact ::= [0-9]+ 3571 */ 3572static int 3573xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 3574 int ret = 0; 3575 int ok = 0; 3576 3577 while ((CUR >= '0') && (CUR <= '9')) { 3578 ret = ret * 10 + (CUR - '0'); 3579 ok = 1; 3580 NEXT; 3581 } 3582 if (ok != 1) { 3583 return(-1); 3584 } 3585 return(ret); 3586} 3587 3588/** 3589 * xmlFAParseQuantifier: 3590 * @ctxt: a regexp parser context 3591 * 3592 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 3593 * [5] quantity ::= quantRange | quantMin | QuantExact 3594 * [6] quantRange ::= QuantExact ',' QuantExact 3595 * [7] quantMin ::= QuantExact ',' 3596 * [8] QuantExact ::= [0-9]+ 3597 */ 3598static int 3599xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 3600 int cur; 3601 3602 cur = CUR; 3603 if ((cur == '?') || (cur == '*') || (cur == '+')) { 3604 if (ctxt->atom != NULL) { 3605 if (cur == '?') 3606 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 3607 else if (cur == '*') 3608 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 3609 else if (cur == '+') 3610 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 3611 } 3612 NEXT; 3613 return(1); 3614 } 3615 if (cur == '{') { 3616 int min = 0, max = 0; 3617 3618 NEXT; 3619 cur = xmlFAParseQuantExact(ctxt); 3620 if (cur >= 0) 3621 min = cur; 3622 if (CUR == ',') { 3623 NEXT; 3624 cur = xmlFAParseQuantExact(ctxt); 3625 if (cur >= 0) 3626 max = cur; 3627 } 3628 if (CUR == '}') { 3629 NEXT; 3630 } else { 3631 ERROR("Unterminated quantifier"); 3632 } 3633 if (max == 0) 3634 max = min; 3635 if (ctxt->atom != NULL) { 3636 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 3637 ctxt->atom->min = min; 3638 ctxt->atom->max = max; 3639 } 3640 return(1); 3641 } 3642 return(0); 3643} 3644 3645/** 3646 * xmlFAParseAtom: 3647 * @ctxt: a regexp parser context 3648 * 3649 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 3650 */ 3651static int 3652xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 3653 int codepoint, len; 3654 3655 codepoint = xmlFAIsChar(ctxt); 3656 if (codepoint > 0) { 3657 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3658 if (ctxt->atom == NULL) 3659 return(-1); 3660 codepoint = CUR_SCHAR(ctxt->cur, len); 3661 ctxt->atom->codepoint = codepoint; 3662 NEXTL(len); 3663 return(1); 3664 } else if (CUR == '|') { 3665 return(0); 3666 } else if (CUR == 0) { 3667 return(0); 3668 } else if (CUR == ')') { 3669 return(0); 3670 } else if (CUR == '(') { 3671 xmlRegStatePtr start, oldend; 3672 3673 NEXT; 3674 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 3675 start = ctxt->state; 3676 oldend = ctxt->end; 3677 ctxt->end = NULL; 3678 ctxt->atom = NULL; 3679 xmlFAParseRegExp(ctxt, 0); 3680 if (CUR == ')') { 3681 NEXT; 3682 } else { 3683 ERROR("xmlFAParseAtom: expecting ')'"); 3684 } 3685 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 3686 if (ctxt->atom == NULL) 3687 return(-1); 3688 ctxt->atom->start = start; 3689 ctxt->atom->stop = ctxt->state; 3690 ctxt->end = oldend; 3691 return(1); 3692 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 3693 xmlFAParseCharClass(ctxt); 3694 return(1); 3695 } 3696 return(0); 3697} 3698 3699/** 3700 * xmlFAParsePiece: 3701 * @ctxt: a regexp parser context 3702 * 3703 * [3] piece ::= atom quantifier? 3704 */ 3705static int 3706xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 3707 int ret; 3708 3709 ctxt->atom = NULL; 3710 ret = xmlFAParseAtom(ctxt); 3711 if (ret == 0) 3712 return(0); 3713 if (ctxt->atom == NULL) { 3714 ERROR("internal: no atom generated"); 3715 } 3716 xmlFAParseQuantifier(ctxt); 3717 return(1); 3718} 3719 3720/** 3721 * xmlFAParseBranch: 3722 * @ctxt: a regexp parser context 3723 * @first: is taht the first 3724 * 3725 * [2] branch ::= piece* 3726 */ 3727static void 3728xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) { 3729 xmlRegStatePtr previous; 3730 xmlRegAtomPtr prevatom = NULL; 3731 int ret; 3732 3733 previous = ctxt->state; 3734 ret = xmlFAParsePiece(ctxt); 3735 if (ret != 0) { 3736 if (first) { 3737 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); 3738 previous = ctxt->state; 3739 } else { 3740 prevatom = ctxt->atom; 3741 } 3742 ctxt->atom = NULL; 3743 } 3744 while ((ret != 0) && (ctxt->error == 0)) { 3745 ret = xmlFAParsePiece(ctxt); 3746 if (ret != 0) { 3747 if (first) { 3748 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); 3749 } else { 3750 xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom); 3751 prevatom = ctxt->atom; 3752 } 3753 previous = ctxt->state; 3754 ctxt->atom = NULL; 3755 } 3756 } 3757 if (!first) { 3758 xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom); 3759 } 3760} 3761 3762/** 3763 * xmlFAParseRegExp: 3764 * @ctxt: a regexp parser context 3765 * @top: is that the top-level expressions ? 3766 * 3767 * [1] regExp ::= branch ( '|' branch )* 3768 */ 3769static void 3770xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 3771 xmlRegStatePtr start, end, oldend; 3772 3773 oldend = ctxt->end; 3774 3775 start = ctxt->state; 3776 xmlFAParseBranch(ctxt, (ctxt->end == NULL)); 3777 if (CUR != '|') { 3778 ctxt->end = ctxt->state; 3779 return; 3780 } 3781 end = ctxt->state; 3782 while ((CUR == '|') && (ctxt->error == 0)) { 3783 NEXT; 3784 ctxt->state = start; 3785 ctxt->end = end; 3786 xmlFAParseBranch(ctxt, 0); 3787 } 3788 if (!top) 3789 ctxt->end = oldend; 3790} 3791 3792/************************************************************************ 3793 * * 3794 * The basic API * 3795 * * 3796 ************************************************************************/ 3797 3798/** 3799 * xmlRegexpPrint: 3800 * @output: the file for the output debug 3801 * @regexp: the compiled regexp 3802 * 3803 * Print the content of the compiled regular expression 3804 */ 3805void 3806xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 3807 int i; 3808 3809 fprintf(output, " regexp: "); 3810 if (regexp == NULL) { 3811 fprintf(output, "NULL\n"); 3812 return; 3813 } 3814 fprintf(output, "'%s' ", regexp->string); 3815 fprintf(output, "\n"); 3816 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 3817 for (i = 0;i < regexp->nbAtoms; i++) { 3818 fprintf(output, " %02d ", i); 3819 xmlRegPrintAtom(output, regexp->atoms[i]); 3820 } 3821 fprintf(output, "%d states:", regexp->nbStates); 3822 fprintf(output, "\n"); 3823 for (i = 0;i < regexp->nbStates; i++) { 3824 xmlRegPrintState(output, regexp->states[i]); 3825 } 3826 fprintf(output, "%d counters:\n", regexp->nbCounters); 3827 for (i = 0;i < regexp->nbCounters; i++) { 3828 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 3829 regexp->counters[i].max); 3830 } 3831} 3832 3833/** 3834 * xmlRegexpCompile: 3835 * @regexp: a regular expression string 3836 * 3837 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 3838 * Appendix F and build an automata suitable for testing strings against 3839 * that regular expression 3840 * 3841 * Returns the compiled expression or NULL in case of error 3842 */ 3843xmlRegexpPtr 3844xmlRegexpCompile(const xmlChar *regexp) { 3845 xmlRegexpPtr ret; 3846 xmlRegParserCtxtPtr ctxt; 3847 3848 ctxt = xmlRegNewParserCtxt(regexp); 3849 if (ctxt == NULL) 3850 return(NULL); 3851 3852 /* initialize the parser */ 3853 ctxt->end = NULL; 3854 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 3855 xmlRegStatePush(ctxt, ctxt->start); 3856 3857 /* parse the expression building an automata */ 3858 xmlFAParseRegExp(ctxt, 1); 3859 if (CUR != 0) { 3860 ERROR("xmlFAParseRegExp: extra characters"); 3861 } 3862 ctxt->end = ctxt->state; 3863 ctxt->start->type = XML_REGEXP_START_STATE; 3864 ctxt->end->type = XML_REGEXP_FINAL_STATE; 3865 3866 /* remove the Epsilon except for counted transitions */ 3867 xmlFAEliminateEpsilonTransitions(ctxt); 3868 3869 3870 if (ctxt->error != 0) { 3871 xmlRegFreeParserCtxt(ctxt); 3872 return(NULL); 3873 } 3874 ret = xmlRegEpxFromParse(ctxt); 3875 xmlRegFreeParserCtxt(ctxt); 3876 return(ret); 3877} 3878 3879/** 3880 * xmlRegexpExec: 3881 * @comp: the compiled regular expression 3882 * @content: the value to check against the regular expression 3883 * 3884 * Check if the regular expression generate the value 3885 * 3886 * Returns 1 if it matches, 0 if not and a negativa value in case of error 3887 */ 3888int 3889xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 3890 if ((comp == NULL) || (content == NULL)) 3891 return(-1); 3892 return(xmlFARegExec(comp, content)); 3893} 3894 3895/** 3896 * xmlRegexpIsDeterminist: 3897 * @comp: the compiled regular expression 3898 * 3899 * Check if the regular expression is determinist 3900 * 3901 * Returns 1 if it yes, 0 if not and a negativa value in case of error 3902 */ 3903int 3904xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 3905 xmlAutomataPtr am; 3906 int ret; 3907 3908 if (comp == NULL) 3909 return(-1); 3910 if (comp->determinist != -1) 3911 return(comp->determinist); 3912 3913 am = xmlNewAutomata(); 3914 if (am->states != NULL) { 3915 int i; 3916 3917 for (i = 0;i < am->nbStates;i++) 3918 xmlRegFreeState(am->states[i]); 3919 xmlFree(am->states); 3920 } 3921 am->nbAtoms = comp->nbAtoms; 3922 am->atoms = comp->atoms; 3923 am->nbStates = comp->nbStates; 3924 am->states = comp->states; 3925 am->determinist = -1; 3926 ret = xmlFAComputesDeterminism(am); 3927 am->atoms = NULL; 3928 am->states = NULL; 3929 xmlFreeAutomata(am); 3930 return(ret); 3931} 3932 3933/** 3934 * xmlRegFreeRegexp: 3935 * @regexp: the regexp 3936 * 3937 * Free a regexp 3938 */ 3939void 3940xmlRegFreeRegexp(xmlRegexpPtr regexp) { 3941 int i; 3942 if (regexp == NULL) 3943 return; 3944 3945 if (regexp->string != NULL) 3946 xmlFree(regexp->string); 3947 if (regexp->states != NULL) { 3948 for (i = 0;i < regexp->nbStates;i++) 3949 xmlRegFreeState(regexp->states[i]); 3950 xmlFree(regexp->states); 3951 } 3952 if (regexp->atoms != NULL) { 3953 for (i = 0;i < regexp->nbAtoms;i++) 3954 xmlRegFreeAtom(regexp->atoms[i]); 3955 xmlFree(regexp->atoms); 3956 } 3957 if (regexp->counters != NULL) 3958 xmlFree(regexp->counters); 3959 if (regexp->compact != NULL) 3960 xmlFree(regexp->compact); 3961 if (regexp->transdata != NULL) 3962 xmlFree(regexp->transdata); 3963 if (regexp->stringMap != NULL) { 3964 for (i = 0; i < regexp->nbstrings;i++) 3965 xmlFree(regexp->stringMap[i]); 3966 xmlFree(regexp->stringMap); 3967 } 3968 3969 xmlFree(regexp); 3970} 3971 3972#ifdef LIBXML_AUTOMATA_ENABLED 3973/************************************************************************ 3974 * * 3975 * The Automata interface * 3976 * * 3977 ************************************************************************/ 3978 3979/** 3980 * xmlNewAutomata: 3981 * 3982 * Create a new automata 3983 * 3984 * Returns the new object or NULL in case of failure 3985 */ 3986xmlAutomataPtr 3987xmlNewAutomata(void) { 3988 xmlAutomataPtr ctxt; 3989 3990 ctxt = xmlRegNewParserCtxt(NULL); 3991 if (ctxt == NULL) 3992 return(NULL); 3993 3994 /* initialize the parser */ 3995 ctxt->end = NULL; 3996 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 3997 xmlRegStatePush(ctxt, ctxt->start); 3998 3999 return(ctxt); 4000} 4001 4002/** 4003 * xmlFreeAutomata: 4004 * @am: an automata 4005 * 4006 * Free an automata 4007 */ 4008void 4009xmlFreeAutomata(xmlAutomataPtr am) { 4010 if (am == NULL) 4011 return; 4012 xmlRegFreeParserCtxt(am); 4013} 4014 4015/** 4016 * xmlAutomataGetInitState: 4017 * @am: an automata 4018 * 4019 * Initial state lookup 4020 * 4021 * Returns the initial state of the automata 4022 */ 4023xmlAutomataStatePtr 4024xmlAutomataGetInitState(xmlAutomataPtr am) { 4025 if (am == NULL) 4026 return(NULL); 4027 return(am->start); 4028} 4029 4030/** 4031 * xmlAutomataSetFinalState: 4032 * @am: an automata 4033 * @state: a state in this automata 4034 * 4035 * Makes that state a final state 4036 * 4037 * Returns 0 or -1 in case of error 4038 */ 4039int 4040xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 4041 if ((am == NULL) || (state == NULL)) 4042 return(-1); 4043 state->type = XML_REGEXP_FINAL_STATE; 4044 return(0); 4045} 4046 4047/** 4048 * xmlAutomataNewTransition: 4049 * @am: an automata 4050 * @from: the starting point of the transition 4051 * @to: the target point of the transition or NULL 4052 * @token: the input string associated to that transition 4053 * @data: data passed to the callback function if the transition is activated 4054 * 4055 * If @to is NULL, this create first a new target state in the automata 4056 * and then adds a transition from the @from state to the target state 4057 * activated by the value of @token 4058 * 4059 * Returns the target state or NULL in case of error 4060 */ 4061xmlAutomataStatePtr 4062xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 4063 xmlAutomataStatePtr to, const xmlChar *token, 4064 void *data) { 4065 xmlRegAtomPtr atom; 4066 4067 if ((am == NULL) || (from == NULL) || (token == NULL)) 4068 return(NULL); 4069 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4070 atom->data = data; 4071 if (atom == NULL) 4072 return(NULL); 4073 atom->valuep = xmlStrdup(token); 4074 4075 xmlFAGenerateTransitions(am, from, to, atom); 4076 if (to == NULL) 4077 return(am->state); 4078 return(to); 4079} 4080 4081/** 4082 * xmlAutomataNewTransition2: 4083 * @am: an automata 4084 * @from: the starting point of the transition 4085 * @to: the target point of the transition or NULL 4086 * @token: the first input string associated to that transition 4087 * @token2: the second input string associated to that transition 4088 * @data: data passed to the callback function if the transition is activated 4089 * 4090 * If @to is NULL, this create first a new target state in the automata 4091 * and then adds a transition from the @from state to the target state 4092 * activated by the value of @token 4093 * 4094 * Returns the target state or NULL in case of error 4095 */ 4096xmlAutomataStatePtr 4097xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 4098 xmlAutomataStatePtr to, const xmlChar *token, 4099 const xmlChar *token2, void *data) { 4100 xmlRegAtomPtr atom; 4101 4102 if ((am == NULL) || (from == NULL) || (token == NULL)) 4103 return(NULL); 4104 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4105 atom->data = data; 4106 if (atom == NULL) 4107 return(NULL); 4108 if ((token2 == NULL) || (*token2 == 0)) { 4109 atom->valuep = xmlStrdup(token); 4110 } else { 4111 int lenn, lenp; 4112 xmlChar *str; 4113 4114 lenn = strlen((char *) token2); 4115 lenp = strlen((char *) token); 4116 4117 str = (xmlChar *) xmlMalloc(lenn + lenp + 2); 4118 if (str == NULL) { 4119 xmlRegFreeAtom(atom); 4120 return(NULL); 4121 } 4122 memcpy(&str[0], token, lenp); 4123 str[lenp] = '|'; 4124 memcpy(&str[lenp + 1], token2, lenn); 4125 str[lenn + lenp + 1] = 0; 4126 4127 atom->valuep = str; 4128 } 4129 4130 xmlFAGenerateTransitions(am, from, to, atom); 4131 if (to == NULL) 4132 return(am->state); 4133 return(to); 4134} 4135 4136/** 4137 * xmlAutomataNewCountTrans: 4138 * @am: an automata 4139 * @from: the starting point of the transition 4140 * @to: the target point of the transition or NULL 4141 * @token: the input string associated to that transition 4142 * @min: the minimum successive occurences of token 4143 * @max: the maximum successive occurences of token 4144 * @data: data associated to the transition 4145 * 4146 * If @to is NULL, this create first a new target state in the automata 4147 * and then adds a transition from the @from state to the target state 4148 * activated by a succession of input of value @token and whose number 4149 * is between @min and @max 4150 * 4151 * Returns the target state or NULL in case of error 4152 */ 4153xmlAutomataStatePtr 4154xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4155 xmlAutomataStatePtr to, const xmlChar *token, 4156 int min, int max, void *data) { 4157 xmlRegAtomPtr atom; 4158 4159 if ((am == NULL) || (from == NULL) || (token == NULL)) 4160 return(NULL); 4161 if (min < 0) 4162 return(NULL); 4163 if ((max < min) || (max < 1)) 4164 return(NULL); 4165 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4166 if (atom == NULL) 4167 return(NULL); 4168 atom->valuep = xmlStrdup(token); 4169 atom->data = data; 4170 if (min == 0) 4171 atom->min = 1; 4172 else 4173 atom->min = min; 4174 atom->max = max; 4175 4176 xmlFAGenerateTransitions(am, from, to, atom); 4177 if (to == NULL) 4178 to = am->state; 4179 if (to == NULL) 4180 return(NULL); 4181 if (min == 0) 4182 xmlFAGenerateEpsilonTransition(am, from, to); 4183 return(to); 4184} 4185 4186/** 4187 * xmlAutomataNewOnceTrans: 4188 * @am: an automata 4189 * @from: the starting point of the transition 4190 * @to: the target point of the transition or NULL 4191 * @token: the input string associated to that transition 4192 * @min: the minimum successive occurences of token 4193 * @max: the maximum successive occurences of token 4194 * @data: data associated to the transition 4195 * 4196 * If @to is NULL, this create first a new target state in the automata 4197 * and then adds a transition from the @from state to the target state 4198 * activated by a succession of input of value @token and whose number 4199 * is between @min and @max, moreover that transistion can only be crossed 4200 * once. 4201 * 4202 * Returns the target state or NULL in case of error 4203 */ 4204xmlAutomataStatePtr 4205xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4206 xmlAutomataStatePtr to, const xmlChar *token, 4207 int min, int max, void *data) { 4208 xmlRegAtomPtr atom; 4209 int counter; 4210 4211 if ((am == NULL) || (from == NULL) || (token == NULL)) 4212 return(NULL); 4213 if (min < 1) 4214 return(NULL); 4215 if ((max < min) || (max < 1)) 4216 return(NULL); 4217 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4218 if (atom == NULL) 4219 return(NULL); 4220 atom->valuep = xmlStrdup(token); 4221 atom->data = data; 4222 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 4223 if (min == 0) 4224 atom->min = 1; 4225 else 4226 atom->min = min; 4227 atom->max = max; 4228 /* 4229 * associate a counter to the transition. 4230 */ 4231 counter = xmlRegGetCounter(am); 4232 am->counters[counter].min = 1; 4233 am->counters[counter].max = 1; 4234 4235 /* xmlFAGenerateTransitions(am, from, to, atom); */ 4236 if (to == NULL) { 4237 to = xmlRegNewState(am); 4238 xmlRegStatePush(am, to); 4239 } 4240 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 4241 xmlRegAtomPush(am, atom); 4242 am->state = to; 4243 if (to == NULL) 4244 to = am->state; 4245 if (to == NULL) 4246 return(NULL); 4247 return(to); 4248} 4249 4250/** 4251 * xmlAutomataNewState: 4252 * @am: an automata 4253 * 4254 * Create a new disconnected state in the automata 4255 * 4256 * Returns the new state or NULL in case of error 4257 */ 4258xmlAutomataStatePtr 4259xmlAutomataNewState(xmlAutomataPtr am) { 4260 xmlAutomataStatePtr to; 4261 4262 if (am == NULL) 4263 return(NULL); 4264 to = xmlRegNewState(am); 4265 xmlRegStatePush(am, to); 4266 return(to); 4267} 4268 4269/** 4270 * xmlAutomataNewEpsilon: 4271 * @am: an automata 4272 * @from: the starting point of the transition 4273 * @to: the target point of the transition or NULL 4274 * 4275 * If @to is NULL, this create first a new target state in the automata 4276 * and then adds a an epsilon transition from the @from state to the 4277 * target state 4278 * 4279 * Returns the target state or NULL in case of error 4280 */ 4281xmlAutomataStatePtr 4282xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 4283 xmlAutomataStatePtr to) { 4284 if ((am == NULL) || (from == NULL)) 4285 return(NULL); 4286 xmlFAGenerateEpsilonTransition(am, from, to); 4287 if (to == NULL) 4288 return(am->state); 4289 return(to); 4290} 4291 4292/** 4293 * xmlAutomataNewAllTrans: 4294 * @am: an automata 4295 * @from: the starting point of the transition 4296 * @to: the target point of the transition or NULL 4297 * @lax: allow to transition if not all all transitions have been activated 4298 * 4299 * If @to is NULL, this create first a new target state in the automata 4300 * and then adds a an ALL transition from the @from state to the 4301 * target state. That transition is an epsilon transition allowed only when 4302 * all transitions from the @from node have been activated. 4303 * 4304 * Returns the target state or NULL in case of error 4305 */ 4306xmlAutomataStatePtr 4307xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4308 xmlAutomataStatePtr to, int lax) { 4309 if ((am == NULL) || (from == NULL)) 4310 return(NULL); 4311 xmlFAGenerateAllTransition(am, from, to, lax); 4312 if (to == NULL) 4313 return(am->state); 4314 return(to); 4315} 4316 4317/** 4318 * xmlAutomataNewCounter: 4319 * @am: an automata 4320 * @min: the minimal value on the counter 4321 * @max: the maximal value on the counter 4322 * 4323 * Create a new counter 4324 * 4325 * Returns the counter number or -1 in case of error 4326 */ 4327int 4328xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 4329 int ret; 4330 4331 if (am == NULL) 4332 return(-1); 4333 4334 ret = xmlRegGetCounter(am); 4335 if (ret < 0) 4336 return(-1); 4337 am->counters[ret].min = min; 4338 am->counters[ret].max = max; 4339 return(ret); 4340} 4341 4342/** 4343 * xmlAutomataNewCountedTrans: 4344 * @am: an automata 4345 * @from: the starting point of the transition 4346 * @to: the target point of the transition or NULL 4347 * @counter: the counter associated to that transition 4348 * 4349 * If @to is NULL, this create first a new target state in the automata 4350 * and then adds an epsilon transition from the @from state to the target state 4351 * which will increment the counter provided 4352 * 4353 * Returns the target state or NULL in case of error 4354 */ 4355xmlAutomataStatePtr 4356xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4357 xmlAutomataStatePtr to, int counter) { 4358 if ((am == NULL) || (from == NULL) || (counter < 0)) 4359 return(NULL); 4360 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 4361 if (to == NULL) 4362 return(am->state); 4363 return(to); 4364} 4365 4366/** 4367 * xmlAutomataNewCounterTrans: 4368 * @am: an automata 4369 * @from: the starting point of the transition 4370 * @to: the target point of the transition or NULL 4371 * @counter: the counter associated to that transition 4372 * 4373 * If @to is NULL, this create first a new target state in the automata 4374 * and then adds an epsilon transition from the @from state to the target state 4375 * which will be allowed only if the counter is within the right range. 4376 * 4377 * Returns the target state or NULL in case of error 4378 */ 4379xmlAutomataStatePtr 4380xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4381 xmlAutomataStatePtr to, int counter) { 4382 if ((am == NULL) || (from == NULL) || (counter < 0)) 4383 return(NULL); 4384 xmlFAGenerateCountedTransition(am, from, to, counter); 4385 if (to == NULL) 4386 return(am->state); 4387 return(to); 4388} 4389 4390/** 4391 * xmlAutomataCompile: 4392 * @am: an automata 4393 * 4394 * Compile the automata into a Reg Exp ready for being executed. 4395 * The automata should be free after this point. 4396 * 4397 * Returns the compiled regexp or NULL in case of error 4398 */ 4399xmlRegexpPtr 4400xmlAutomataCompile(xmlAutomataPtr am) { 4401 xmlRegexpPtr ret; 4402 4403 xmlFAEliminateEpsilonTransitions(am); 4404 /* xmlFAComputesDeterminism(am); */ 4405 ret = xmlRegEpxFromParse(am); 4406 4407 return(ret); 4408} 4409 4410/** 4411 * xmlAutomataIsDeterminist: 4412 * @am: an automata 4413 * 4414 * Checks if an automata is determinist. 4415 * 4416 * Returns 1 if true, 0 if not, and -1 in case of error 4417 */ 4418int 4419xmlAutomataIsDeterminist(xmlAutomataPtr am) { 4420 int ret; 4421 4422 if (am == NULL) 4423 return(-1); 4424 4425 ret = xmlFAComputesDeterminism(am); 4426 return(ret); 4427} 4428#endif /* LIBXML_AUTOMATA_ENABLED */ 4429#endif /* LIBXML_REGEXP_ENABLED */ 4430