xmlregexp.c revision 23e73571f8f6918e4ea7be3506ee5bd24ee86c52
1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications thise includes: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22#include <stdio.h> 23#include <string.h> 24#include <libxml/tree.h> 25#include <libxml/parserInternals.h> 26#include <libxml/xmlregexp.h> 27#include <libxml/xmlautomata.h> 28#include <libxml/xmlunicode.h> 29 30/* #define DEBUG_REGEXP_GRAPH */ 31/* #define DEBUG_REGEXP_EXEC */ 32/* #define DEBUG_PUSH */ 33/* #define DEBUG_COMPACTION */ 34 35#define ERROR(str) ctxt->error = 1; \ 36 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur) 37#define NEXT ctxt->cur++ 38#define CUR (*(ctxt->cur)) 39#define NXT(index) (ctxt->cur[index]) 40 41#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 42#define NEXTL(l) ctxt->cur += l; 43 44/** 45 * TODO: 46 * 47 * macro to flag unimplemented blocks 48 */ 49#define TODO \ 50 xmlGenericError(xmlGenericErrorContext, \ 51 "Unimplemented block at %s:%d\n", \ 52 __FILE__, __LINE__); 53 54 55/************************************************************************ 56 * * 57 * Datatypes and structures * 58 * * 59 ************************************************************************/ 60 61typedef enum { 62 XML_REGEXP_EPSILON = 1, 63 XML_REGEXP_CHARVAL, 64 XML_REGEXP_RANGES, 65 XML_REGEXP_SUBREG, 66 XML_REGEXP_STRING, 67 XML_REGEXP_ANYCHAR, /* . */ 68 XML_REGEXP_ANYSPACE, /* \s */ 69 XML_REGEXP_NOTSPACE, /* \S */ 70 XML_REGEXP_INITNAME, /* \l */ 71 XML_REGEXP_NOTINITNAME, /* \l */ 72 XML_REGEXP_NAMECHAR, /* \c */ 73 XML_REGEXP_NOTNAMECHAR, /* \C */ 74 XML_REGEXP_DECIMAL, /* \d */ 75 XML_REGEXP_NOTDECIMAL, /* \d */ 76 XML_REGEXP_REALCHAR, /* \w */ 77 XML_REGEXP_NOTREALCHAR, /* \w */ 78 XML_REGEXP_LETTER, 79 XML_REGEXP_LETTER_UPPERCASE, 80 XML_REGEXP_LETTER_LOWERCASE, 81 XML_REGEXP_LETTER_TITLECASE, 82 XML_REGEXP_LETTER_MODIFIER, 83 XML_REGEXP_LETTER_OTHERS, 84 XML_REGEXP_MARK, 85 XML_REGEXP_MARK_NONSPACING, 86 XML_REGEXP_MARK_SPACECOMBINING, 87 XML_REGEXP_MARK_ENCLOSING, 88 XML_REGEXP_NUMBER, 89 XML_REGEXP_NUMBER_DECIMAL, 90 XML_REGEXP_NUMBER_LETTER, 91 XML_REGEXP_NUMBER_OTHERS, 92 XML_REGEXP_PUNCT, 93 XML_REGEXP_PUNCT_CONNECTOR, 94 XML_REGEXP_PUNCT_DASH, 95 XML_REGEXP_PUNCT_OPEN, 96 XML_REGEXP_PUNCT_CLOSE, 97 XML_REGEXP_PUNCT_INITQUOTE, 98 XML_REGEXP_PUNCT_FINQUOTE, 99 XML_REGEXP_PUNCT_OTHERS, 100 XML_REGEXP_SEPAR, 101 XML_REGEXP_SEPAR_SPACE, 102 XML_REGEXP_SEPAR_LINE, 103 XML_REGEXP_SEPAR_PARA, 104 XML_REGEXP_SYMBOL, 105 XML_REGEXP_SYMBOL_MATH, 106 XML_REGEXP_SYMBOL_CURRENCY, 107 XML_REGEXP_SYMBOL_MODIFIER, 108 XML_REGEXP_SYMBOL_OTHERS, 109 XML_REGEXP_OTHER, 110 XML_REGEXP_OTHER_CONTROL, 111 XML_REGEXP_OTHER_FORMAT, 112 XML_REGEXP_OTHER_PRIVATE, 113 XML_REGEXP_OTHER_NA, 114 XML_REGEXP_BLOCK_NAME 115} xmlRegAtomType; 116 117typedef enum { 118 XML_REGEXP_QUANT_EPSILON = 1, 119 XML_REGEXP_QUANT_ONCE, 120 XML_REGEXP_QUANT_OPT, 121 XML_REGEXP_QUANT_MULT, 122 XML_REGEXP_QUANT_PLUS, 123 XML_REGEXP_QUANT_ONCEONLY, 124 XML_REGEXP_QUANT_ALL, 125 XML_REGEXP_QUANT_RANGE 126} xmlRegQuantType; 127 128typedef enum { 129 XML_REGEXP_START_STATE = 1, 130 XML_REGEXP_FINAL_STATE, 131 XML_REGEXP_TRANS_STATE 132} xmlRegStateType; 133 134typedef enum { 135 XML_REGEXP_MARK_NORMAL = 0, 136 XML_REGEXP_MARK_START, 137 XML_REGEXP_MARK_VISITED 138} xmlRegMarkedType; 139 140typedef struct _xmlRegRange xmlRegRange; 141typedef xmlRegRange *xmlRegRangePtr; 142 143struct _xmlRegRange { 144 int neg; 145 xmlRegAtomType type; 146 int start; 147 int end; 148 xmlChar *blockName; 149}; 150 151typedef struct _xmlRegAtom xmlRegAtom; 152typedef xmlRegAtom *xmlRegAtomPtr; 153 154typedef struct _xmlAutomataState xmlRegState; 155typedef xmlRegState *xmlRegStatePtr; 156 157struct _xmlRegAtom { 158 int no; 159 xmlRegAtomType type; 160 xmlRegQuantType quant; 161 int min; 162 int max; 163 164 void *valuep; 165 void *valuep2; 166 int neg; 167 int codepoint; 168 xmlRegStatePtr start; 169 xmlRegStatePtr stop; 170 int maxRanges; 171 int nbRanges; 172 xmlRegRangePtr *ranges; 173 void *data; 174}; 175 176typedef struct _xmlRegCounter xmlRegCounter; 177typedef xmlRegCounter *xmlRegCounterPtr; 178 179struct _xmlRegCounter { 180 int min; 181 int max; 182}; 183 184typedef struct _xmlRegTrans xmlRegTrans; 185typedef xmlRegTrans *xmlRegTransPtr; 186 187struct _xmlRegTrans { 188 xmlRegAtomPtr atom; 189 int to; 190 int counter; 191 int count; 192}; 193 194struct _xmlAutomataState { 195 xmlRegStateType type; 196 xmlRegMarkedType mark; 197 xmlRegMarkedType reached; 198 int no; 199 200 int maxTrans; 201 int nbTrans; 202 xmlRegTrans *trans; 203}; 204 205typedef struct _xmlAutomata xmlRegParserCtxt; 206typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 207 208struct _xmlAutomata { 209 xmlChar *string; 210 xmlChar *cur; 211 212 int error; 213 int neg; 214 215 xmlRegStatePtr start; 216 xmlRegStatePtr end; 217 xmlRegStatePtr state; 218 219 xmlRegAtomPtr atom; 220 221 int maxAtoms; 222 int nbAtoms; 223 xmlRegAtomPtr *atoms; 224 225 int maxStates; 226 int nbStates; 227 xmlRegStatePtr *states; 228 229 int maxCounters; 230 int nbCounters; 231 xmlRegCounter *counters; 232 233 int determinist; 234}; 235 236struct _xmlRegexp { 237 xmlChar *string; 238 int nbStates; 239 xmlRegStatePtr *states; 240 int nbAtoms; 241 xmlRegAtomPtr *atoms; 242 int nbCounters; 243 xmlRegCounter *counters; 244 int determinist; 245 /* 246 * That's the compact form for determinists automatas 247 */ 248 int nbstates; 249 int *compact; 250 int nbstrings; 251 xmlChar **stringMap; 252}; 253 254typedef struct _xmlRegExecRollback xmlRegExecRollback; 255typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 256 257struct _xmlRegExecRollback { 258 xmlRegStatePtr state;/* the current state */ 259 int index; /* the index in the input stack */ 260 int nextbranch; /* the next transition to explore in that state */ 261 int *counts; /* save the automate state if it has some */ 262}; 263 264typedef struct _xmlRegInputToken xmlRegInputToken; 265typedef xmlRegInputToken *xmlRegInputTokenPtr; 266 267struct _xmlRegInputToken { 268 xmlChar *value; 269 void *data; 270}; 271 272struct _xmlRegExecCtxt { 273 int status; /* execution status != 0 indicate an error */ 274 int determinist; /* did we found an inderterministic behaviour */ 275 xmlRegexpPtr comp; /* the compiled regexp */ 276 xmlRegExecCallbacks callback; 277 void *data; 278 279 xmlRegStatePtr state;/* the current state */ 280 int transno; /* the current transition on that state */ 281 int transcount; /* the number of char in char counted transitions */ 282 283 /* 284 * A stack of rollback states 285 */ 286 int maxRollbacks; 287 int nbRollbacks; 288 xmlRegExecRollback *rollbacks; 289 290 /* 291 * The state of the automata if any 292 */ 293 int *counts; 294 295 /* 296 * The input stack 297 */ 298 int inputStackMax; 299 int inputStackNr; 300 int index; 301 int *charStack; 302 const xmlChar *inputString; /* when operating on characters */ 303 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 304 305}; 306 307#define REGEXP_ALL_COUNTER 0x123456 308#define REGEXP_ALL_LAX_COUNTER 0x123457 309 310static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 311static void xmlRegFreeState(xmlRegStatePtr state); 312static void xmlRegFreeAtom(xmlRegAtomPtr atom); 313 314/************************************************************************ 315 * * 316 * Allocation/Deallocation * 317 * * 318 ************************************************************************/ 319 320static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 321/** 322 * xmlRegEpxFromParse: 323 * @ctxt: the parser context used to build it 324 * 325 * Allocate a new regexp and fill it with the reult from the parser 326 * 327 * Returns the new regexp or NULL in case of error 328 */ 329static xmlRegexpPtr 330xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 331 xmlRegexpPtr ret; 332 333 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 334 if (ret == NULL) 335 return(NULL); 336 memset(ret, 0, sizeof(xmlRegexp)); 337 ret->string = ctxt->string; 338 ctxt->string = NULL; 339 ret->nbStates = ctxt->nbStates; 340 ctxt->nbStates = 0; 341 ret->states = ctxt->states; 342 ctxt->states = NULL; 343 ret->nbAtoms = ctxt->nbAtoms; 344 ctxt->nbAtoms = 0; 345 ret->atoms = ctxt->atoms; 346 ctxt->atoms = NULL; 347 ret->nbCounters = ctxt->nbCounters; 348 ctxt->nbCounters = 0; 349 ret->counters = ctxt->counters; 350 ctxt->counters = NULL; 351 ret->determinist = ctxt->determinist; 352 353 if ((ret->determinist != 0) && 354 (ret->nbCounters == 0) && 355 (ret->atoms[0] != NULL) && 356 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 357 int i, j, nbstates = 0, nbatoms = 0; 358 int *stateRemap; 359 int *stringRemap; 360 int *transitions; 361 xmlChar **stringMap; 362 xmlChar *value; 363 364 /* 365 * Switch to a compact representation 366 * 1/ counting the effective number of states left 367 * 2/ conting the unique number of atoms, and check that 368 * they are all of the string type 369 * 3/ build a table state x atom for the transitions 370 */ 371 372 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 373 for (i = 0;i < ret->nbStates;i++) { 374 if (ret->states[i] != NULL) { 375 stateRemap[i] = nbstates; 376 nbstates++; 377 } else { 378 stateRemap[i] = -1; 379 } 380 } 381#ifdef DEBUG_COMPACTION 382 printf("Final: %d states\n", nbstates); 383#endif 384 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 385 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 386 for (i = 0;i < ret->nbAtoms;i++) { 387 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 388 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 389 value = ret->atoms[i]->valuep; 390 for (j = 0;j < nbatoms;j++) { 391 if (xmlStrEqual(stringMap[j], value)) { 392 stringRemap[i] = j; 393 break; 394 } 395 } 396 if (j >= nbatoms) { 397 stringRemap[i] = nbatoms; 398 stringMap[nbatoms] = xmlStrdup(value); 399 nbatoms++; 400 } 401 } else { 402 xmlFree(stateRemap); 403 xmlFree(stringRemap); 404 for (i = 0;i < nbatoms;i++) 405 xmlFree(stringMap[i]); 406 xmlFree(stringMap); 407 goto fail_compact; 408 } 409 } 410#ifdef DEBUG_COMPACTION 411 printf("Final: %d atoms\n", nbatoms); 412#endif 413 414 /* 415 * Allocate the transition table. The first entry for each 416 * state correspond to the state type. 417 */ 418 transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int)); 419 memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int)); 420 421 for (i = 0;i < ret->nbStates;i++) { 422 int stateno, atomno, targetno, prev; 423 xmlRegStatePtr state; 424 xmlRegTransPtr trans; 425 426 stateno = stateRemap[i]; 427 if (stateno == -1) 428 continue; 429 state = ret->states[i]; 430 431 transitions[stateno * (nbatoms + 1)] = state->type; 432 433 for (j = 0;j < state->nbTrans;j++) { 434 trans = &(state->trans[j]); 435 if ((trans->to == -1) || (trans->atom == NULL)) 436 continue; 437 atomno = stringRemap[trans->atom->no]; 438 targetno = stateRemap[trans->to]; 439 /* 440 * if the same atome can generate transition to 2 different 441 * states then it means the automata is not determinist and 442 * the compact form can't be used ! 443 */ 444 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 445 if (prev != 0) { 446 if (prev != targetno + 1) { 447 printf("not determinist\n"); 448 ret->determinist = 0; 449#ifdef DEBUG_COMPACTION 450 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 451 i, j, trans->atom->no, trans->to, atomno, targetno); 452 printf(" previous to is %d\n", prev); 453#endif 454 ret->determinist = 0; 455 xmlFree(transitions); 456 xmlFree(stateRemap); 457 xmlFree(stringRemap); 458 for (i = 0;i < nbatoms;i++) 459 xmlFree(stringMap[i]); 460 xmlFree(stringMap); 461 goto fail_compact; 462 } 463 } else { 464#if 0 465 printf("State %d trans %d: atom %d to %d : %d to %d\n", 466 i, j, trans->atom->no, trans->to, atomno, targetno); 467#endif 468 transitions[stateno * (nbatoms + 1) + atomno + 1] = 469 targetno + 1;; /* to avoid 0 */ 470 } 471 } 472 } 473 ret->determinist = 1; 474#ifdef DEBUG_COMPACTION 475 /* 476 * Debug 477 */ 478 for (i = 0;i < nbstates;i++) { 479 for (j = 0;j < nbatoms + 1;j++) { 480 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 481 } 482 printf("\n"); 483 } 484 printf("\n"); 485#endif 486 /* 487 * Cleanup of the old data 488 */ 489 if (ret->states != NULL) { 490 for (i = 0;i < ret->nbStates;i++) 491 xmlRegFreeState(ret->states[i]); 492 xmlFree(ret->states); 493 } 494 ret->states = NULL; 495 ret->nbStates = 0; 496 if (ret->atoms != NULL) { 497 for (i = 0;i < ret->nbAtoms;i++) 498 xmlRegFreeAtom(ret->atoms[i]); 499 xmlFree(ret->atoms); 500 } 501 ret->atoms = NULL; 502 ret->nbAtoms = 0; 503 504 ret->compact = transitions; 505 ret->stringMap = stringMap; 506 ret->nbstrings = nbatoms; 507 ret->nbstates = nbstates; 508 xmlFree(stateRemap); 509 xmlFree(stringRemap); 510 } 511fail_compact: 512 return(ret); 513} 514 515/** 516 * xmlRegNewParserCtxt: 517 * @string: the string to parse 518 * 519 * Allocate a new regexp parser context 520 * 521 * Returns the new context or NULL in case of error 522 */ 523static xmlRegParserCtxtPtr 524xmlRegNewParserCtxt(const xmlChar *string) { 525 xmlRegParserCtxtPtr ret; 526 527 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 528 if (ret == NULL) 529 return(NULL); 530 memset(ret, 0, sizeof(xmlRegParserCtxt)); 531 if (string != NULL) 532 ret->string = xmlStrdup(string); 533 ret->cur = ret->string; 534 ret->neg = 0; 535 ret->error = 0; 536 ret->determinist = -1; 537 return(ret); 538} 539 540/** 541 * xmlRegNewRange: 542 * @ctxt: the regexp parser context 543 * @neg: is that negative 544 * @type: the type of range 545 * @start: the start codepoint 546 * @end: the end codepoint 547 * 548 * Allocate a new regexp range 549 * 550 * Returns the new range or NULL in case of error 551 */ 552static xmlRegRangePtr 553xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 554 int neg, xmlRegAtomType type, int start, int end) { 555 xmlRegRangePtr ret; 556 557 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 558 if (ret == NULL) { 559 ERROR("failed to allocate regexp range"); 560 return(NULL); 561 } 562 ret->neg = neg; 563 ret->type = type; 564 ret->start = start; 565 ret->end = end; 566 return(ret); 567} 568 569/** 570 * xmlRegFreeRange: 571 * @range: the regexp range 572 * 573 * Free a regexp range 574 */ 575static void 576xmlRegFreeRange(xmlRegRangePtr range) { 577 if (range == NULL) 578 return; 579 580 if (range->blockName != NULL) 581 xmlFree(range->blockName); 582 xmlFree(range); 583} 584 585/** 586 * xmlRegNewAtom: 587 * @ctxt: the regexp parser context 588 * @type: the type of atom 589 * 590 * Allocate a new regexp range 591 * 592 * Returns the new atom or NULL in case of error 593 */ 594static xmlRegAtomPtr 595xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 596 xmlRegAtomPtr ret; 597 598 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 599 if (ret == NULL) { 600 ERROR("failed to allocate regexp atom"); 601 return(NULL); 602 } 603 memset(ret, 0, sizeof(xmlRegAtom)); 604 ret->type = type; 605 ret->quant = XML_REGEXP_QUANT_ONCE; 606 ret->min = 0; 607 ret->max = 0; 608 return(ret); 609} 610 611/** 612 * xmlRegFreeAtom: 613 * @atom: the regexp atom 614 * 615 * Free a regexp atom 616 */ 617static void 618xmlRegFreeAtom(xmlRegAtomPtr atom) { 619 int i; 620 621 if (atom == NULL) 622 return; 623 624 for (i = 0;i < atom->nbRanges;i++) 625 xmlRegFreeRange(atom->ranges[i]); 626 if (atom->ranges != NULL) 627 xmlFree(atom->ranges); 628 if (atom->type == XML_REGEXP_STRING) 629 xmlFree(atom->valuep); 630 xmlFree(atom); 631} 632 633static xmlRegStatePtr 634xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 635 xmlRegStatePtr ret; 636 637 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 638 if (ret == NULL) { 639 ERROR("failed to allocate regexp state"); 640 return(NULL); 641 } 642 memset(ret, 0, sizeof(xmlRegState)); 643 ret->type = XML_REGEXP_TRANS_STATE; 644 ret->mark = XML_REGEXP_MARK_NORMAL; 645 return(ret); 646} 647 648/** 649 * xmlRegFreeState: 650 * @state: the regexp state 651 * 652 * Free a regexp state 653 */ 654static void 655xmlRegFreeState(xmlRegStatePtr state) { 656 if (state == NULL) 657 return; 658 659 if (state->trans != NULL) 660 xmlFree(state->trans); 661 xmlFree(state); 662} 663 664/** 665 * xmlRegFreeParserCtxt: 666 * @ctxt: the regexp parser context 667 * 668 * Free a regexp parser context 669 */ 670static void 671xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 672 int i; 673 if (ctxt == NULL) 674 return; 675 676 if (ctxt->string != NULL) 677 xmlFree(ctxt->string); 678 if (ctxt->states != NULL) { 679 for (i = 0;i < ctxt->nbStates;i++) 680 xmlRegFreeState(ctxt->states[i]); 681 xmlFree(ctxt->states); 682 } 683 if (ctxt->atoms != NULL) { 684 for (i = 0;i < ctxt->nbAtoms;i++) 685 xmlRegFreeAtom(ctxt->atoms[i]); 686 xmlFree(ctxt->atoms); 687 } 688 if (ctxt->counters != NULL) 689 xmlFree(ctxt->counters); 690 xmlFree(ctxt); 691} 692 693/************************************************************************ 694 * * 695 * Display of Data structures * 696 * * 697 ************************************************************************/ 698 699static void 700xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 701 switch (type) { 702 case XML_REGEXP_EPSILON: 703 fprintf(output, "epsilon "); break; 704 case XML_REGEXP_CHARVAL: 705 fprintf(output, "charval "); break; 706 case XML_REGEXP_RANGES: 707 fprintf(output, "ranges "); break; 708 case XML_REGEXP_SUBREG: 709 fprintf(output, "subexpr "); break; 710 case XML_REGEXP_STRING: 711 fprintf(output, "string "); break; 712 case XML_REGEXP_ANYCHAR: 713 fprintf(output, "anychar "); break; 714 case XML_REGEXP_ANYSPACE: 715 fprintf(output, "anyspace "); break; 716 case XML_REGEXP_NOTSPACE: 717 fprintf(output, "notspace "); break; 718 case XML_REGEXP_INITNAME: 719 fprintf(output, "initname "); break; 720 case XML_REGEXP_NOTINITNAME: 721 fprintf(output, "notinitname "); break; 722 case XML_REGEXP_NAMECHAR: 723 fprintf(output, "namechar "); break; 724 case XML_REGEXP_NOTNAMECHAR: 725 fprintf(output, "notnamechar "); break; 726 case XML_REGEXP_DECIMAL: 727 fprintf(output, "decimal "); break; 728 case XML_REGEXP_NOTDECIMAL: 729 fprintf(output, "notdecimal "); break; 730 case XML_REGEXP_REALCHAR: 731 fprintf(output, "realchar "); break; 732 case XML_REGEXP_NOTREALCHAR: 733 fprintf(output, "notrealchar "); break; 734 case XML_REGEXP_LETTER: 735 fprintf(output, "LETTER "); break; 736 case XML_REGEXP_LETTER_UPPERCASE: 737 fprintf(output, "LETTER_UPPERCASE "); break; 738 case XML_REGEXP_LETTER_LOWERCASE: 739 fprintf(output, "LETTER_LOWERCASE "); break; 740 case XML_REGEXP_LETTER_TITLECASE: 741 fprintf(output, "LETTER_TITLECASE "); break; 742 case XML_REGEXP_LETTER_MODIFIER: 743 fprintf(output, "LETTER_MODIFIER "); break; 744 case XML_REGEXP_LETTER_OTHERS: 745 fprintf(output, "LETTER_OTHERS "); break; 746 case XML_REGEXP_MARK: 747 fprintf(output, "MARK "); break; 748 case XML_REGEXP_MARK_NONSPACING: 749 fprintf(output, "MARK_NONSPACING "); break; 750 case XML_REGEXP_MARK_SPACECOMBINING: 751 fprintf(output, "MARK_SPACECOMBINING "); break; 752 case XML_REGEXP_MARK_ENCLOSING: 753 fprintf(output, "MARK_ENCLOSING "); break; 754 case XML_REGEXP_NUMBER: 755 fprintf(output, "NUMBER "); break; 756 case XML_REGEXP_NUMBER_DECIMAL: 757 fprintf(output, "NUMBER_DECIMAL "); break; 758 case XML_REGEXP_NUMBER_LETTER: 759 fprintf(output, "NUMBER_LETTER "); break; 760 case XML_REGEXP_NUMBER_OTHERS: 761 fprintf(output, "NUMBER_OTHERS "); break; 762 case XML_REGEXP_PUNCT: 763 fprintf(output, "PUNCT "); break; 764 case XML_REGEXP_PUNCT_CONNECTOR: 765 fprintf(output, "PUNCT_CONNECTOR "); break; 766 case XML_REGEXP_PUNCT_DASH: 767 fprintf(output, "PUNCT_DASH "); break; 768 case XML_REGEXP_PUNCT_OPEN: 769 fprintf(output, "PUNCT_OPEN "); break; 770 case XML_REGEXP_PUNCT_CLOSE: 771 fprintf(output, "PUNCT_CLOSE "); break; 772 case XML_REGEXP_PUNCT_INITQUOTE: 773 fprintf(output, "PUNCT_INITQUOTE "); break; 774 case XML_REGEXP_PUNCT_FINQUOTE: 775 fprintf(output, "PUNCT_FINQUOTE "); break; 776 case XML_REGEXP_PUNCT_OTHERS: 777 fprintf(output, "PUNCT_OTHERS "); break; 778 case XML_REGEXP_SEPAR: 779 fprintf(output, "SEPAR "); break; 780 case XML_REGEXP_SEPAR_SPACE: 781 fprintf(output, "SEPAR_SPACE "); break; 782 case XML_REGEXP_SEPAR_LINE: 783 fprintf(output, "SEPAR_LINE "); break; 784 case XML_REGEXP_SEPAR_PARA: 785 fprintf(output, "SEPAR_PARA "); break; 786 case XML_REGEXP_SYMBOL: 787 fprintf(output, "SYMBOL "); break; 788 case XML_REGEXP_SYMBOL_MATH: 789 fprintf(output, "SYMBOL_MATH "); break; 790 case XML_REGEXP_SYMBOL_CURRENCY: 791 fprintf(output, "SYMBOL_CURRENCY "); break; 792 case XML_REGEXP_SYMBOL_MODIFIER: 793 fprintf(output, "SYMBOL_MODIFIER "); break; 794 case XML_REGEXP_SYMBOL_OTHERS: 795 fprintf(output, "SYMBOL_OTHERS "); break; 796 case XML_REGEXP_OTHER: 797 fprintf(output, "OTHER "); break; 798 case XML_REGEXP_OTHER_CONTROL: 799 fprintf(output, "OTHER_CONTROL "); break; 800 case XML_REGEXP_OTHER_FORMAT: 801 fprintf(output, "OTHER_FORMAT "); break; 802 case XML_REGEXP_OTHER_PRIVATE: 803 fprintf(output, "OTHER_PRIVATE "); break; 804 case XML_REGEXP_OTHER_NA: 805 fprintf(output, "OTHER_NA "); break; 806 case XML_REGEXP_BLOCK_NAME: 807 fprintf(output, "BLOCK "); break; 808 } 809} 810 811static void 812xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 813 switch (type) { 814 case XML_REGEXP_QUANT_EPSILON: 815 fprintf(output, "epsilon "); break; 816 case XML_REGEXP_QUANT_ONCE: 817 fprintf(output, "once "); break; 818 case XML_REGEXP_QUANT_OPT: 819 fprintf(output, "? "); break; 820 case XML_REGEXP_QUANT_MULT: 821 fprintf(output, "* "); break; 822 case XML_REGEXP_QUANT_PLUS: 823 fprintf(output, "+ "); break; 824 case XML_REGEXP_QUANT_RANGE: 825 fprintf(output, "range "); break; 826 case XML_REGEXP_QUANT_ONCEONLY: 827 fprintf(output, "onceonly "); break; 828 case XML_REGEXP_QUANT_ALL: 829 fprintf(output, "all "); break; 830 } 831} 832static void 833xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 834 fprintf(output, " range: "); 835 if (range->neg) 836 fprintf(output, "negative "); 837 xmlRegPrintAtomType(output, range->type); 838 fprintf(output, "%c - %c\n", range->start, range->end); 839} 840 841static void 842xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 843 fprintf(output, " atom: "); 844 if (atom == NULL) { 845 fprintf(output, "NULL\n"); 846 return; 847 } 848 xmlRegPrintAtomType(output, atom->type); 849 xmlRegPrintQuantType(output, atom->quant); 850 if (atom->quant == XML_REGEXP_QUANT_RANGE) 851 fprintf(output, "%d-%d ", atom->min, atom->max); 852 if (atom->type == XML_REGEXP_STRING) 853 fprintf(output, "'%s' ", (char *) atom->valuep); 854 if (atom->type == XML_REGEXP_CHARVAL) 855 fprintf(output, "char %c\n", atom->codepoint); 856 else if (atom->type == XML_REGEXP_RANGES) { 857 int i; 858 fprintf(output, "%d entries\n", atom->nbRanges); 859 for (i = 0; i < atom->nbRanges;i++) 860 xmlRegPrintRange(output, atom->ranges[i]); 861 } else if (atom->type == XML_REGEXP_SUBREG) { 862 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 863 } else { 864 fprintf(output, "\n"); 865 } 866} 867 868static void 869xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 870 fprintf(output, " trans: "); 871 if (trans == NULL) { 872 fprintf(output, "NULL\n"); 873 return; 874 } 875 if (trans->to < 0) { 876 fprintf(output, "removed\n"); 877 return; 878 } 879 if (trans->counter >= 0) { 880 fprintf(output, "counted %d, ", trans->counter); 881 } 882 if (trans->count == REGEXP_ALL_COUNTER) { 883 fprintf(output, "all transition, "); 884 } else if (trans->count >= 0) { 885 fprintf(output, "count based %d, ", trans->count); 886 } 887 if (trans->atom == NULL) { 888 fprintf(output, "epsilon to %d\n", trans->to); 889 return; 890 } 891 if (trans->atom->type == XML_REGEXP_CHARVAL) 892 fprintf(output, "char %c ", trans->atom->codepoint); 893 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 894} 895 896static void 897xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 898 int i; 899 900 fprintf(output, " state: "); 901 if (state == NULL) { 902 fprintf(output, "NULL\n"); 903 return; 904 } 905 if (state->type == XML_REGEXP_START_STATE) 906 fprintf(output, "START "); 907 if (state->type == XML_REGEXP_FINAL_STATE) 908 fprintf(output, "FINAL "); 909 910 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 911 for (i = 0;i < state->nbTrans; i++) { 912 xmlRegPrintTrans(output, &(state->trans[i])); 913 } 914} 915 916#ifdef DEBUG_REGEXP_GRAPH 917static void 918xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 919 int i; 920 921 fprintf(output, " ctxt: "); 922 if (ctxt == NULL) { 923 fprintf(output, "NULL\n"); 924 return; 925 } 926 fprintf(output, "'%s' ", ctxt->string); 927 if (ctxt->error) 928 fprintf(output, "error "); 929 if (ctxt->neg) 930 fprintf(output, "neg "); 931 fprintf(output, "\n"); 932 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 933 for (i = 0;i < ctxt->nbAtoms; i++) { 934 fprintf(output, " %02d ", i); 935 xmlRegPrintAtom(output, ctxt->atoms[i]); 936 } 937 if (ctxt->atom != NULL) { 938 fprintf(output, "current atom:\n"); 939 xmlRegPrintAtom(output, ctxt->atom); 940 } 941 fprintf(output, "%d states:", ctxt->nbStates); 942 if (ctxt->start != NULL) 943 fprintf(output, " start: %d", ctxt->start->no); 944 if (ctxt->end != NULL) 945 fprintf(output, " end: %d", ctxt->end->no); 946 fprintf(output, "\n"); 947 for (i = 0;i < ctxt->nbStates; i++) { 948 xmlRegPrintState(output, ctxt->states[i]); 949 } 950 fprintf(output, "%d counters:\n", ctxt->nbCounters); 951 for (i = 0;i < ctxt->nbCounters; i++) { 952 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 953 ctxt->counters[i].max); 954 } 955} 956#endif 957 958/************************************************************************ 959 * * 960 * Finite Automata structures manipulations * 961 * * 962 ************************************************************************/ 963 964static void 965xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 966 int neg, xmlRegAtomType type, int start, int end, 967 xmlChar *blockName) { 968 xmlRegRangePtr range; 969 970 if (atom == NULL) { 971 ERROR("add range: atom is NULL"); 972 return; 973 } 974 if (atom->type != XML_REGEXP_RANGES) { 975 ERROR("add range: atom is not ranges"); 976 return; 977 } 978 if (atom->maxRanges == 0) { 979 atom->maxRanges = 4; 980 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 981 sizeof(xmlRegRangePtr)); 982 if (atom->ranges == NULL) { 983 ERROR("add range: allocation failed"); 984 atom->maxRanges = 0; 985 return; 986 } 987 } else if (atom->nbRanges >= atom->maxRanges) { 988 xmlRegRangePtr *tmp; 989 atom->maxRanges *= 2; 990 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 991 sizeof(xmlRegRangePtr)); 992 if (tmp == NULL) { 993 ERROR("add range: allocation failed"); 994 atom->maxRanges /= 2; 995 return; 996 } 997 atom->ranges = tmp; 998 } 999 range = xmlRegNewRange(ctxt, neg, type, start, end); 1000 if (range == NULL) 1001 return; 1002 range->blockName = blockName; 1003 atom->ranges[atom->nbRanges++] = range; 1004 1005} 1006 1007static int 1008xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1009 if (ctxt->maxCounters == 0) { 1010 ctxt->maxCounters = 4; 1011 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1012 sizeof(xmlRegCounter)); 1013 if (ctxt->counters == NULL) { 1014 ERROR("reg counter: allocation failed"); 1015 ctxt->maxCounters = 0; 1016 return(-1); 1017 } 1018 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1019 xmlRegCounter *tmp; 1020 ctxt->maxCounters *= 2; 1021 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1022 sizeof(xmlRegCounter)); 1023 if (tmp == NULL) { 1024 ERROR("reg counter: allocation failed"); 1025 ctxt->maxCounters /= 2; 1026 return(-1); 1027 } 1028 ctxt->counters = tmp; 1029 } 1030 ctxt->counters[ctxt->nbCounters].min = -1; 1031 ctxt->counters[ctxt->nbCounters].max = -1; 1032 return(ctxt->nbCounters++); 1033} 1034 1035static void 1036xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1037 if (atom == NULL) { 1038 ERROR("atom push: atom is NULL"); 1039 return; 1040 } 1041 if (ctxt->maxAtoms == 0) { 1042 ctxt->maxAtoms = 4; 1043 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1044 sizeof(xmlRegAtomPtr)); 1045 if (ctxt->atoms == NULL) { 1046 ERROR("atom push: allocation failed"); 1047 ctxt->maxAtoms = 0; 1048 return; 1049 } 1050 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1051 xmlRegAtomPtr *tmp; 1052 ctxt->maxAtoms *= 2; 1053 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1054 sizeof(xmlRegAtomPtr)); 1055 if (tmp == NULL) { 1056 ERROR("atom push: allocation failed"); 1057 ctxt->maxAtoms /= 2; 1058 return; 1059 } 1060 ctxt->atoms = tmp; 1061 } 1062 atom->no = ctxt->nbAtoms; 1063 ctxt->atoms[ctxt->nbAtoms++] = atom; 1064} 1065 1066static void 1067xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1068 xmlRegAtomPtr atom, xmlRegStatePtr target, 1069 int counter, int count) { 1070 if (state == NULL) { 1071 ERROR("add state: state is NULL"); 1072 return; 1073 } 1074 if (target == NULL) { 1075 ERROR("add state: target is NULL"); 1076 return; 1077 } 1078 if (state->maxTrans == 0) { 1079 state->maxTrans = 4; 1080 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1081 sizeof(xmlRegTrans)); 1082 if (state->trans == NULL) { 1083 ERROR("add range: allocation failed"); 1084 state->maxTrans = 0; 1085 return; 1086 } 1087 } else if (state->nbTrans >= state->maxTrans) { 1088 xmlRegTrans *tmp; 1089 state->maxTrans *= 2; 1090 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1091 sizeof(xmlRegTrans)); 1092 if (tmp == NULL) { 1093 ERROR("add range: allocation failed"); 1094 state->maxTrans /= 2; 1095 return; 1096 } 1097 state->trans = tmp; 1098 } 1099#ifdef DEBUG_REGEXP_GRAPH 1100 printf("Add trans from %d to %d ", state->no, target->no); 1101 if (count == REGEXP_ALL_COUNTER) 1102 printf("all transition"); 1103 else if (count >= 0) 1104 printf("count based %d", count); 1105 else if (counter >= 0) 1106 printf("counted %d", counter); 1107 else if (atom == NULL) 1108 printf("epsilon transition"); 1109 printf("\n"); 1110#endif 1111 1112 state->trans[state->nbTrans].atom = atom; 1113 state->trans[state->nbTrans].to = target->no; 1114 state->trans[state->nbTrans].counter = counter; 1115 state->trans[state->nbTrans].count = count; 1116 state->nbTrans++; 1117} 1118 1119static void 1120xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1121 if (ctxt->maxStates == 0) { 1122 ctxt->maxStates = 4; 1123 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1124 sizeof(xmlRegStatePtr)); 1125 if (ctxt->states == NULL) { 1126 ERROR("add range: allocation failed"); 1127 ctxt->maxStates = 0; 1128 return; 1129 } 1130 } else if (ctxt->nbStates >= ctxt->maxStates) { 1131 xmlRegStatePtr *tmp; 1132 ctxt->maxStates *= 2; 1133 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1134 sizeof(xmlRegStatePtr)); 1135 if (tmp == NULL) { 1136 ERROR("add range: allocation failed"); 1137 ctxt->maxStates /= 2; 1138 return; 1139 } 1140 ctxt->states = tmp; 1141 } 1142 state->no = ctxt->nbStates; 1143 ctxt->states[ctxt->nbStates++] = state; 1144} 1145 1146/** 1147 * xmlFAGenerateAllTransition: 1148 * @ctxt: a regexp parser context 1149 * @from: the from state 1150 * @to: the target state or NULL for building a new one 1151 * @lax: 1152 * 1153 */ 1154static void 1155xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1156 xmlRegStatePtr from, xmlRegStatePtr to, 1157 int lax) { 1158 if (to == NULL) { 1159 to = xmlRegNewState(ctxt); 1160 xmlRegStatePush(ctxt, to); 1161 ctxt->state = to; 1162 } 1163 if (lax) 1164 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1165 else 1166 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1167} 1168 1169/** 1170 * xmlFAGenerateEpsilonTransition: 1171 * @ctxt: a regexp parser context 1172 * @from: the from state 1173 * @to: the target state or NULL for building a new one 1174 * 1175 */ 1176static void 1177xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1178 xmlRegStatePtr from, xmlRegStatePtr to) { 1179 if (to == NULL) { 1180 to = xmlRegNewState(ctxt); 1181 xmlRegStatePush(ctxt, to); 1182 ctxt->state = to; 1183 } 1184 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1185} 1186 1187/** 1188 * xmlFAGenerateCountedEpsilonTransition: 1189 * @ctxt: a regexp parser context 1190 * @from: the from state 1191 * @to: the target state or NULL for building a new one 1192 * counter: the counter for that transition 1193 * 1194 */ 1195static void 1196xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1197 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1198 if (to == NULL) { 1199 to = xmlRegNewState(ctxt); 1200 xmlRegStatePush(ctxt, to); 1201 ctxt->state = to; 1202 } 1203 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1204} 1205 1206/** 1207 * xmlFAGenerateCountedTransition: 1208 * @ctxt: a regexp parser context 1209 * @from: the from state 1210 * @to: the target state or NULL for building a new one 1211 * counter: the counter for that transition 1212 * 1213 */ 1214static void 1215xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1216 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1217 if (to == NULL) { 1218 to = xmlRegNewState(ctxt); 1219 xmlRegStatePush(ctxt, to); 1220 ctxt->state = to; 1221 } 1222 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1223} 1224 1225/** 1226 * xmlFAGenerateTransitions: 1227 * @ctxt: a regexp parser context 1228 * @from: the from state 1229 * @to: the target state or NULL for building a new one 1230 * @atom: the atom generating the transition 1231 * 1232 */ 1233static void 1234xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1235 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1236 if (atom == NULL) { 1237 ERROR("genrate transition: atom == NULL"); 1238 return; 1239 } 1240 if (atom->type == XML_REGEXP_SUBREG) { 1241 /* 1242 * this is a subexpression handling one should not need to 1243 * create a new node excep for XML_REGEXP_QUANT_RANGE. 1244 */ 1245 xmlRegAtomPush(ctxt, atom); 1246 if ((to != NULL) && (atom->stop != to) && 1247 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1248 /* 1249 * Generate an epsilon transition to link to the target 1250 */ 1251 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1252 } 1253 switch (atom->quant) { 1254 case XML_REGEXP_QUANT_OPT: 1255 atom->quant = XML_REGEXP_QUANT_ONCE; 1256 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1257 break; 1258 case XML_REGEXP_QUANT_MULT: 1259 atom->quant = XML_REGEXP_QUANT_ONCE; 1260 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1261 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1262 break; 1263 case XML_REGEXP_QUANT_PLUS: 1264 atom->quant = XML_REGEXP_QUANT_ONCE; 1265 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1266 break; 1267 case XML_REGEXP_QUANT_RANGE: { 1268 int counter; 1269 xmlRegStatePtr newstate; 1270 1271 /* 1272 * This one is nasty: 1273 * 1/ register a new counter 1274 * 2/ register an epsilon transition associated to 1275 * this counter going from atom->stop to atom->start 1276 * 3/ create a new state 1277 * 4/ generate a counted transition from atom->stop to 1278 * that state 1279 */ 1280 counter = xmlRegGetCounter(ctxt); 1281 ctxt->counters[counter].min = atom->min - 1; 1282 ctxt->counters[counter].max = atom->max - 1; 1283 atom->min = 0; 1284 atom->max = 0; 1285 atom->quant = XML_REGEXP_QUANT_ONCE; 1286 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1287 atom->start, counter); 1288 if (to != NULL) { 1289 newstate = to; 1290 } else { 1291 newstate = xmlRegNewState(ctxt); 1292 xmlRegStatePush(ctxt, newstate); 1293 ctxt->state = newstate; 1294 } 1295 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1296 newstate, counter); 1297 } 1298 default: 1299 break; 1300 } 1301 return; 1302 } else { 1303 if (to == NULL) { 1304 to = xmlRegNewState(ctxt); 1305 xmlRegStatePush(ctxt, to); 1306 } 1307 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1308 xmlRegAtomPush(ctxt, atom); 1309 ctxt->state = to; 1310 } 1311 switch (atom->quant) { 1312 case XML_REGEXP_QUANT_OPT: 1313 atom->quant = XML_REGEXP_QUANT_ONCE; 1314 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1315 break; 1316 case XML_REGEXP_QUANT_MULT: 1317 atom->quant = XML_REGEXP_QUANT_ONCE; 1318 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1319 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1320 break; 1321 case XML_REGEXP_QUANT_PLUS: 1322 atom->quant = XML_REGEXP_QUANT_ONCE; 1323 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1324 break; 1325 default: 1326 break; 1327 } 1328} 1329 1330/** 1331 * xmlFAReduceEpsilonTransitions: 1332 * @ctxt: a regexp parser context 1333 * @fromnr: the from state 1334 * @tonr: the to state 1335 * @cpunter: should that transition be associted to a counted 1336 * 1337 */ 1338static void 1339xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1340 int tonr, int counter) { 1341 int transnr; 1342 xmlRegStatePtr from; 1343 xmlRegStatePtr to; 1344 1345#ifdef DEBUG_REGEXP_GRAPH 1346 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1347#endif 1348 from = ctxt->states[fromnr]; 1349 if (from == NULL) 1350 return; 1351 to = ctxt->states[tonr]; 1352 if (to == NULL) 1353 return; 1354 if ((to->mark == XML_REGEXP_MARK_START) || 1355 (to->mark == XML_REGEXP_MARK_VISITED)) 1356 return; 1357 1358 to->mark = XML_REGEXP_MARK_VISITED; 1359 if (to->type == XML_REGEXP_FINAL_STATE) { 1360#ifdef DEBUG_REGEXP_GRAPH 1361 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1362#endif 1363 from->type = XML_REGEXP_FINAL_STATE; 1364 } 1365 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1366 if (to->trans[transnr].atom == NULL) { 1367 /* 1368 * Don't remove counted transitions 1369 * Don't loop either 1370 */ 1371 if (to->trans[transnr].to != fromnr) { 1372 if (to->trans[transnr].count >= 0) { 1373 int newto = to->trans[transnr].to; 1374 1375 xmlRegStateAddTrans(ctxt, from, NULL, 1376 ctxt->states[newto], 1377 -1, to->trans[transnr].count); 1378 } else { 1379#ifdef DEBUG_REGEXP_GRAPH 1380 printf("Found epsilon trans %d from %d to %d\n", 1381 transnr, tonr, to->trans[transnr].to); 1382#endif 1383 if (to->trans[transnr].counter >= 0) { 1384 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1385 to->trans[transnr].to, 1386 to->trans[transnr].counter); 1387 } else { 1388 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1389 to->trans[transnr].to, 1390 counter); 1391 } 1392 } 1393 } 1394 } else { 1395 int newto = to->trans[transnr].to; 1396 1397 if (to->trans[transnr].counter >= 0) { 1398 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1399 ctxt->states[newto], 1400 to->trans[transnr].counter, -1); 1401 } else { 1402 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1403 ctxt->states[newto], counter, -1); 1404 } 1405 } 1406 } 1407 to->mark = XML_REGEXP_MARK_NORMAL; 1408} 1409 1410/** 1411 * xmlFAEliminateEpsilonTransitions: 1412 * @ctxt: a regexp parser context 1413 * 1414 */ 1415static void 1416xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1417 int statenr, transnr; 1418 xmlRegStatePtr state; 1419 1420 /* 1421 * build the completed transitions bypassing the epsilons 1422 * Use a marking algorithm to avoid loops 1423 */ 1424 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1425 state = ctxt->states[statenr]; 1426 if (state == NULL) 1427 continue; 1428 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1429 if ((state->trans[transnr].atom == NULL) && 1430 (state->trans[transnr].to >= 0)) { 1431 if (state->trans[transnr].to == statenr) { 1432 state->trans[transnr].to = -1; 1433#ifdef DEBUG_REGEXP_GRAPH 1434 printf("Removed loopback epsilon trans %d on %d\n", 1435 transnr, statenr); 1436#endif 1437 } else if (state->trans[transnr].count < 0) { 1438 int newto = state->trans[transnr].to; 1439 1440#ifdef DEBUG_REGEXP_GRAPH 1441 printf("Found epsilon trans %d from %d to %d\n", 1442 transnr, statenr, newto); 1443#endif 1444 state->mark = XML_REGEXP_MARK_START; 1445 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1446 newto, state->trans[transnr].counter); 1447 state->mark = XML_REGEXP_MARK_NORMAL; 1448#ifdef DEBUG_REGEXP_GRAPH 1449 } else { 1450 printf("Found counted transition %d on %d\n", 1451 transnr, statenr); 1452#endif 1453 } 1454 } 1455 } 1456 } 1457 /* 1458 * Eliminate the epsilon transitions 1459 */ 1460 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1461 state = ctxt->states[statenr]; 1462 if (state == NULL) 1463 continue; 1464 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1465 if ((state->trans[transnr].atom == NULL) && 1466 (state->trans[transnr].count < 0) && 1467 (state->trans[transnr].to >= 0)) { 1468 state->trans[transnr].to = -1; 1469 } 1470 } 1471 } 1472 1473 /* 1474 * Use this pass to detect unreachable states too 1475 */ 1476 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1477 state = ctxt->states[statenr]; 1478 if (state != NULL) 1479 state->reached = 0; 1480 } 1481 state = ctxt->states[0]; 1482 if (state != NULL) 1483 state->reached = 1; 1484 while (state != NULL) { 1485 xmlRegStatePtr target = NULL; 1486 state->reached = 2; 1487 /* 1488 * Mark all state reachable from the current reachable state 1489 */ 1490 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1491 if ((state->trans[transnr].to >= 0) && 1492 ((state->trans[transnr].atom != NULL) || 1493 (state->trans[transnr].count >= 0))) { 1494 int newto = state->trans[transnr].to; 1495 1496 if (ctxt->states[newto] == NULL) 1497 continue; 1498 if (ctxt->states[newto]->reached == 0) { 1499 ctxt->states[newto]->reached = 1; 1500 target = ctxt->states[newto]; 1501 } 1502 } 1503 } 1504 /* 1505 * find the next accessible state not explored 1506 */ 1507 if (target == NULL) { 1508 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1509 state = ctxt->states[statenr]; 1510 if ((state != NULL) && (state->reached == 1)) { 1511 target = state; 1512 break; 1513 } 1514 } 1515 } 1516 state = target; 1517 } 1518 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1519 state = ctxt->states[statenr]; 1520 if ((state != NULL) && (state->reached == 0)) { 1521#ifdef DEBUG_REGEXP_GRAPH 1522 printf("Removed unreachable state %d\n", statenr); 1523#endif 1524 xmlRegFreeState(state); 1525 ctxt->states[statenr] = NULL; 1526 } 1527 } 1528 1529} 1530 1531/** 1532 * xmlFACompareAtoms: 1533 * @atom1: an atom 1534 * @atom2: an atom 1535 * 1536 * Compares two atoms to check whether they are equivatents 1537 * 1538 * Returns 1 if yes and 0 otherwise 1539 */ 1540static int 1541xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 1542 if (atom1 == atom2) 1543 return(1); 1544 if ((atom1 == NULL) || (atom2 == NULL)) 1545 return(0); 1546 1547 if (atom1->type != atom2->type) 1548 return(0); 1549 switch (atom1->type) { 1550 case XML_REGEXP_STRING: 1551 return(xmlStrEqual((xmlChar *)atom1->valuep, 1552 (xmlChar *)atom2->valuep)); 1553 case XML_REGEXP_EPSILON: 1554 return(1); 1555 case XML_REGEXP_CHARVAL: 1556 return(atom1->codepoint == atom2->codepoint); 1557 case XML_REGEXP_RANGES: 1558 TODO; 1559 return(0); 1560 default: 1561 break; 1562 } 1563 return(1); 1564} 1565 1566/** 1567 * xmlFARecurseDeterminism: 1568 * @ctxt: a regexp parser context 1569 * 1570 * Check whether the associated regexp is determinist, 1571 * should be called after xmlFAEliminateEpsilonTransitions() 1572 * 1573 */ 1574static int 1575xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1576 int to, xmlRegAtomPtr atom) { 1577 int ret = 1; 1578 int transnr; 1579 xmlRegTransPtr t1; 1580 1581 if (state == NULL) 1582 return(ret); 1583 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1584 t1 = &(state->trans[transnr]); 1585 /* 1586 * check transitions conflicting with the one looked at 1587 */ 1588 if (t1->atom == NULL) { 1589 if (t1->to == -1) 1590 continue; 1591 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1592 to, atom); 1593 if (ret == 0) 1594 return(0); 1595 continue; 1596 } 1597 if (t1->to != to) 1598 continue; 1599 if (xmlFACompareAtoms(t1->atom, atom)) 1600 return(0); 1601 } 1602 return(ret); 1603} 1604 1605/** 1606 * xmlFAComputesDeterminism: 1607 * @ctxt: a regexp parser context 1608 * 1609 * Check whether the associated regexp is determinist, 1610 * should be called after xmlFAEliminateEpsilonTransitions() 1611 * 1612 */ 1613static int 1614xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 1615 int statenr, transnr; 1616 xmlRegStatePtr state; 1617 xmlRegTransPtr t1, t2; 1618 int i; 1619 int ret = 1; 1620 1621#ifdef DEBUG_REGEXP_GRAPH 1622 printf("xmlFAComputesDeterminism\n"); 1623 xmlRegPrintCtxt(stdout, ctxt); 1624#endif 1625 if (ctxt->determinist != -1) 1626 return(ctxt->determinist); 1627 1628 /* 1629 * Check for all states that there isn't 2 transitions 1630 * with the same atom and a different target. 1631 */ 1632 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1633 state = ctxt->states[statenr]; 1634 if (state == NULL) 1635 continue; 1636 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1637 t1 = &(state->trans[transnr]); 1638 /* 1639 * Determinism checks in case of counted or all transitions 1640 * will have to be handled separately 1641 */ 1642 if (t1->atom == NULL) 1643 continue; 1644 if (t1->to == -1) /* eliminated */ 1645 continue; 1646 for (i = 0;i < transnr;i++) { 1647 t2 = &(state->trans[i]); 1648 if (t2->to == -1) /* eliminated */ 1649 continue; 1650 if (t2->atom != NULL) { 1651 if (t1->to == t2->to) { 1652 if (xmlFACompareAtoms(t1->atom, t2->atom)) 1653 t2->to = -1; /* eliminate */ 1654 } else { 1655 /* not determinist ! */ 1656 if (xmlFACompareAtoms(t1->atom, t2->atom)) 1657 ret = 0; 1658 } 1659 } else if (t1->to != -1) { 1660 /* 1661 * do the closure in case of remaining specific 1662 * epsilon transitions like choices or all 1663 */ 1664 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 1665 t2->to, t2->atom); 1666 if (ret == 0) 1667 return(0); 1668 } 1669 } 1670 if (ret == 0) 1671 break; 1672 } 1673 if (ret == 0) 1674 break; 1675 } 1676 ctxt->determinist = ret; 1677 return(ret); 1678} 1679 1680/************************************************************************ 1681 * * 1682 * Routines to check input against transition atoms * 1683 * * 1684 ************************************************************************/ 1685 1686static int 1687xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 1688 int start, int end, const xmlChar *blockName) { 1689 int ret = 0; 1690 1691 switch (type) { 1692 case XML_REGEXP_STRING: 1693 case XML_REGEXP_SUBREG: 1694 case XML_REGEXP_RANGES: 1695 case XML_REGEXP_EPSILON: 1696 return(-1); 1697 case XML_REGEXP_ANYCHAR: 1698 ret = ((codepoint != '\n') && (codepoint != '\r')); 1699 break; 1700 case XML_REGEXP_CHARVAL: 1701 ret = ((codepoint >= start) && (codepoint <= end)); 1702 break; 1703 case XML_REGEXP_NOTSPACE: 1704 neg = !neg; 1705 case XML_REGEXP_ANYSPACE: 1706 ret = ((codepoint == '\n') || (codepoint == '\r') || 1707 (codepoint == '\t') || (codepoint == ' ')); 1708 break; 1709 case XML_REGEXP_NOTINITNAME: 1710 neg = !neg; 1711 case XML_REGEXP_INITNAME: 1712 ret = (xmlIsLetter(codepoint) || 1713 (codepoint == '_') || (codepoint == ':')); 1714 break; 1715 case XML_REGEXP_NOTNAMECHAR: 1716 neg = !neg; 1717 case XML_REGEXP_NAMECHAR: 1718 ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) || 1719 (codepoint == '.') || (codepoint == '-') || 1720 (codepoint == '_') || (codepoint == ':') || 1721 xmlIsCombining(codepoint) || xmlIsExtender(codepoint)); 1722 break; 1723 case XML_REGEXP_NOTDECIMAL: 1724 neg = !neg; 1725 case XML_REGEXP_DECIMAL: 1726 ret = xmlUCSIsCatNd(codepoint); 1727 break; 1728 case XML_REGEXP_REALCHAR: 1729 neg = !neg; 1730 case XML_REGEXP_NOTREALCHAR: 1731 ret = xmlUCSIsCatP(codepoint); 1732 if (ret == 0) 1733 ret = xmlUCSIsCatZ(codepoint); 1734 if (ret == 0) 1735 ret = xmlUCSIsCatC(codepoint); 1736 break; 1737 case XML_REGEXP_LETTER: 1738 ret = xmlUCSIsCatL(codepoint); 1739 break; 1740 case XML_REGEXP_LETTER_UPPERCASE: 1741 ret = xmlUCSIsCatLu(codepoint); 1742 break; 1743 case XML_REGEXP_LETTER_LOWERCASE: 1744 ret = xmlUCSIsCatLl(codepoint); 1745 break; 1746 case XML_REGEXP_LETTER_TITLECASE: 1747 ret = xmlUCSIsCatLt(codepoint); 1748 break; 1749 case XML_REGEXP_LETTER_MODIFIER: 1750 ret = xmlUCSIsCatLm(codepoint); 1751 break; 1752 case XML_REGEXP_LETTER_OTHERS: 1753 ret = xmlUCSIsCatLo(codepoint); 1754 break; 1755 case XML_REGEXP_MARK: 1756 ret = xmlUCSIsCatM(codepoint); 1757 break; 1758 case XML_REGEXP_MARK_NONSPACING: 1759 ret = xmlUCSIsCatMn(codepoint); 1760 break; 1761 case XML_REGEXP_MARK_SPACECOMBINING: 1762 ret = xmlUCSIsCatMc(codepoint); 1763 break; 1764 case XML_REGEXP_MARK_ENCLOSING: 1765 ret = xmlUCSIsCatMe(codepoint); 1766 break; 1767 case XML_REGEXP_NUMBER: 1768 ret = xmlUCSIsCatN(codepoint); 1769 break; 1770 case XML_REGEXP_NUMBER_DECIMAL: 1771 ret = xmlUCSIsCatNd(codepoint); 1772 break; 1773 case XML_REGEXP_NUMBER_LETTER: 1774 ret = xmlUCSIsCatNl(codepoint); 1775 break; 1776 case XML_REGEXP_NUMBER_OTHERS: 1777 ret = xmlUCSIsCatNo(codepoint); 1778 break; 1779 case XML_REGEXP_PUNCT: 1780 ret = xmlUCSIsCatP(codepoint); 1781 break; 1782 case XML_REGEXP_PUNCT_CONNECTOR: 1783 ret = xmlUCSIsCatPc(codepoint); 1784 break; 1785 case XML_REGEXP_PUNCT_DASH: 1786 ret = xmlUCSIsCatPd(codepoint); 1787 break; 1788 case XML_REGEXP_PUNCT_OPEN: 1789 ret = xmlUCSIsCatPs(codepoint); 1790 break; 1791 case XML_REGEXP_PUNCT_CLOSE: 1792 ret = xmlUCSIsCatPe(codepoint); 1793 break; 1794 case XML_REGEXP_PUNCT_INITQUOTE: 1795 ret = xmlUCSIsCatPi(codepoint); 1796 break; 1797 case XML_REGEXP_PUNCT_FINQUOTE: 1798 ret = xmlUCSIsCatPf(codepoint); 1799 break; 1800 case XML_REGEXP_PUNCT_OTHERS: 1801 ret = xmlUCSIsCatPo(codepoint); 1802 break; 1803 case XML_REGEXP_SEPAR: 1804 ret = xmlUCSIsCatZ(codepoint); 1805 break; 1806 case XML_REGEXP_SEPAR_SPACE: 1807 ret = xmlUCSIsCatZs(codepoint); 1808 break; 1809 case XML_REGEXP_SEPAR_LINE: 1810 ret = xmlUCSIsCatZl(codepoint); 1811 break; 1812 case XML_REGEXP_SEPAR_PARA: 1813 ret = xmlUCSIsCatZp(codepoint); 1814 break; 1815 case XML_REGEXP_SYMBOL: 1816 ret = xmlUCSIsCatS(codepoint); 1817 break; 1818 case XML_REGEXP_SYMBOL_MATH: 1819 ret = xmlUCSIsCatSm(codepoint); 1820 break; 1821 case XML_REGEXP_SYMBOL_CURRENCY: 1822 ret = xmlUCSIsCatSc(codepoint); 1823 break; 1824 case XML_REGEXP_SYMBOL_MODIFIER: 1825 ret = xmlUCSIsCatSk(codepoint); 1826 break; 1827 case XML_REGEXP_SYMBOL_OTHERS: 1828 ret = xmlUCSIsCatSo(codepoint); 1829 break; 1830 case XML_REGEXP_OTHER: 1831 ret = xmlUCSIsCatC(codepoint); 1832 break; 1833 case XML_REGEXP_OTHER_CONTROL: 1834 ret = xmlUCSIsCatCc(codepoint); 1835 break; 1836 case XML_REGEXP_OTHER_FORMAT: 1837 ret = xmlUCSIsCatCf(codepoint); 1838 break; 1839 case XML_REGEXP_OTHER_PRIVATE: 1840 ret = xmlUCSIsCatCo(codepoint); 1841 break; 1842 case XML_REGEXP_OTHER_NA: 1843 /* ret = xmlUCSIsCatCn(codepoint); */ 1844 /* Seems it doesn't exist anymore in recent Unicode releases */ 1845 ret = 0; 1846 break; 1847 case XML_REGEXP_BLOCK_NAME: 1848 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 1849 break; 1850 } 1851 if (neg) 1852 return(!ret); 1853 return(ret); 1854} 1855 1856static int 1857xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 1858 int i, ret = 0; 1859 xmlRegRangePtr range; 1860 1861 if ((atom == NULL) || (!xmlIsChar(codepoint))) 1862 return(-1); 1863 1864 switch (atom->type) { 1865 case XML_REGEXP_SUBREG: 1866 case XML_REGEXP_EPSILON: 1867 return(-1); 1868 case XML_REGEXP_CHARVAL: 1869 return(codepoint == atom->codepoint); 1870 case XML_REGEXP_RANGES: { 1871 int accept = 0; 1872 for (i = 0;i < atom->nbRanges;i++) { 1873 range = atom->ranges[i]; 1874 if (range->neg) { 1875 ret = xmlRegCheckCharacterRange(range->type, codepoint, 1876 0, range->start, range->end, 1877 range->blockName); 1878 if (ret != 0) 1879 return(0); /* excluded char */ 1880 } else { 1881 ret = xmlRegCheckCharacterRange(range->type, codepoint, 1882 0, range->start, range->end, 1883 range->blockName); 1884 if (ret != 0) 1885 accept = 1; /* might still be excluded */ 1886 } 1887 } 1888 return(accept); 1889 } 1890 case XML_REGEXP_STRING: 1891 printf("TODO: XML_REGEXP_STRING\n"); 1892 return(-1); 1893 case XML_REGEXP_ANYCHAR: 1894 case XML_REGEXP_ANYSPACE: 1895 case XML_REGEXP_NOTSPACE: 1896 case XML_REGEXP_INITNAME: 1897 case XML_REGEXP_NOTINITNAME: 1898 case XML_REGEXP_NAMECHAR: 1899 case XML_REGEXP_NOTNAMECHAR: 1900 case XML_REGEXP_DECIMAL: 1901 case XML_REGEXP_NOTDECIMAL: 1902 case XML_REGEXP_REALCHAR: 1903 case XML_REGEXP_NOTREALCHAR: 1904 case XML_REGEXP_LETTER: 1905 case XML_REGEXP_LETTER_UPPERCASE: 1906 case XML_REGEXP_LETTER_LOWERCASE: 1907 case XML_REGEXP_LETTER_TITLECASE: 1908 case XML_REGEXP_LETTER_MODIFIER: 1909 case XML_REGEXP_LETTER_OTHERS: 1910 case XML_REGEXP_MARK: 1911 case XML_REGEXP_MARK_NONSPACING: 1912 case XML_REGEXP_MARK_SPACECOMBINING: 1913 case XML_REGEXP_MARK_ENCLOSING: 1914 case XML_REGEXP_NUMBER: 1915 case XML_REGEXP_NUMBER_DECIMAL: 1916 case XML_REGEXP_NUMBER_LETTER: 1917 case XML_REGEXP_NUMBER_OTHERS: 1918 case XML_REGEXP_PUNCT: 1919 case XML_REGEXP_PUNCT_CONNECTOR: 1920 case XML_REGEXP_PUNCT_DASH: 1921 case XML_REGEXP_PUNCT_OPEN: 1922 case XML_REGEXP_PUNCT_CLOSE: 1923 case XML_REGEXP_PUNCT_INITQUOTE: 1924 case XML_REGEXP_PUNCT_FINQUOTE: 1925 case XML_REGEXP_PUNCT_OTHERS: 1926 case XML_REGEXP_SEPAR: 1927 case XML_REGEXP_SEPAR_SPACE: 1928 case XML_REGEXP_SEPAR_LINE: 1929 case XML_REGEXP_SEPAR_PARA: 1930 case XML_REGEXP_SYMBOL: 1931 case XML_REGEXP_SYMBOL_MATH: 1932 case XML_REGEXP_SYMBOL_CURRENCY: 1933 case XML_REGEXP_SYMBOL_MODIFIER: 1934 case XML_REGEXP_SYMBOL_OTHERS: 1935 case XML_REGEXP_OTHER: 1936 case XML_REGEXP_OTHER_CONTROL: 1937 case XML_REGEXP_OTHER_FORMAT: 1938 case XML_REGEXP_OTHER_PRIVATE: 1939 case XML_REGEXP_OTHER_NA: 1940 case XML_REGEXP_BLOCK_NAME: 1941 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 1942 (const xmlChar *)atom->valuep); 1943 if (atom->neg) 1944 ret = !ret; 1945 break; 1946 } 1947 return(ret); 1948} 1949 1950/************************************************************************ 1951 * * 1952 * Saving an restoring state of an execution context * 1953 * * 1954 ************************************************************************/ 1955 1956#ifdef DEBUG_REGEXP_EXEC 1957static void 1958xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 1959 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 1960 if (exec->inputStack != NULL) { 1961 int i; 1962 printf(": "); 1963 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 1964 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 1965 } else { 1966 printf(": %s", &(exec->inputString[exec->index])); 1967 } 1968 printf("\n"); 1969} 1970#endif 1971 1972static void 1973xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 1974#ifdef DEBUG_REGEXP_EXEC 1975 printf("saving "); 1976 exec->transno++; 1977 xmlFARegDebugExec(exec); 1978 exec->transno--; 1979#endif 1980 1981 if (exec->maxRollbacks == 0) { 1982 exec->maxRollbacks = 4; 1983 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 1984 sizeof(xmlRegExecRollback)); 1985 if (exec->rollbacks == NULL) { 1986 fprintf(stderr, "exec save: allocation failed"); 1987 exec->maxRollbacks = 0; 1988 return; 1989 } 1990 memset(exec->rollbacks, 0, 1991 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 1992 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 1993 xmlRegExecRollback *tmp; 1994 int len = exec->maxRollbacks; 1995 1996 exec->maxRollbacks *= 2; 1997 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 1998 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 1999 if (tmp == NULL) { 2000 fprintf(stderr, "exec save: allocation failed"); 2001 exec->maxRollbacks /= 2; 2002 return; 2003 } 2004 exec->rollbacks = tmp; 2005 tmp = &exec->rollbacks[len]; 2006 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2007 } 2008 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2009 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2010 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2011 if (exec->comp->nbCounters > 0) { 2012 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2013 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2014 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2015 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2016 fprintf(stderr, "exec save: allocation failed"); 2017 exec->status = -5; 2018 return; 2019 } 2020 } 2021 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2022 exec->comp->nbCounters * sizeof(int)); 2023 } 2024 exec->nbRollbacks++; 2025} 2026 2027static void 2028xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2029 if (exec->nbRollbacks <= 0) { 2030 exec->status = -1; 2031#ifdef DEBUG_REGEXP_EXEC 2032 printf("rollback failed on empty stack\n"); 2033#endif 2034 return; 2035 } 2036 exec->nbRollbacks--; 2037 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2038 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2039 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2040 if (exec->comp->nbCounters > 0) { 2041 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2042 fprintf(stderr, "exec save: allocation failed"); 2043 exec->status = -6; 2044 return; 2045 } 2046 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2047 exec->comp->nbCounters * sizeof(int)); 2048 } 2049 2050#ifdef DEBUG_REGEXP_EXEC 2051 printf("restored "); 2052 xmlFARegDebugExec(exec); 2053#endif 2054} 2055 2056/************************************************************************ 2057 * * 2058 * Verifyer, running an input against a compiled regexp * 2059 * * 2060 ************************************************************************/ 2061 2062static int 2063xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2064 xmlRegExecCtxt execval; 2065 xmlRegExecCtxtPtr exec = &execval; 2066 int ret, codepoint, len; 2067 2068 exec->inputString = content; 2069 exec->index = 0; 2070 exec->determinist = 1; 2071 exec->maxRollbacks = 0; 2072 exec->nbRollbacks = 0; 2073 exec->rollbacks = NULL; 2074 exec->status = 0; 2075 exec->comp = comp; 2076 exec->state = comp->states[0]; 2077 exec->transno = 0; 2078 exec->transcount = 0; 2079 if (comp->nbCounters > 0) { 2080 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2081 if (exec->counts == NULL) 2082 return(-1); 2083 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2084 } else 2085 exec->counts = NULL; 2086 while ((exec->status == 0) && 2087 ((exec->inputString[exec->index] != 0) || 2088 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2089 xmlRegTransPtr trans; 2090 xmlRegAtomPtr atom; 2091 2092 /* 2093 * End of input on non-terminal state, rollback, however we may 2094 * still have epsilon like transition for counted transitions 2095 * on counters, in that case don't break too early. 2096 */ 2097 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2098 goto rollback; 2099 2100 exec->transcount = 0; 2101 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2102 trans = &exec->state->trans[exec->transno]; 2103 if (trans->to < 0) 2104 continue; 2105 atom = trans->atom; 2106 ret = 0; 2107 if (trans->count >= 0) { 2108 int count; 2109 xmlRegCounterPtr counter; 2110 2111 /* 2112 * A counted transition. 2113 */ 2114 2115 count = exec->counts[trans->count]; 2116 counter = &exec->comp->counters[trans->count]; 2117#ifdef DEBUG_REGEXP_EXEC 2118 printf("testing count %d: val %d, min %d, max %d\n", 2119 trans->count, count, counter->min, counter->max); 2120#endif 2121 ret = ((count >= counter->min) && (count <= counter->max)); 2122 } else if (atom == NULL) { 2123 fprintf(stderr, "epsilon transition left at runtime\n"); 2124 exec->status = -2; 2125 break; 2126 } else if (exec->inputString[exec->index] != 0) { 2127 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2128 ret = xmlRegCheckCharacter(atom, codepoint); 2129 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2130 xmlRegStatePtr to = comp->states[trans->to]; 2131 2132 /* 2133 * this is a multiple input sequence 2134 */ 2135 if (exec->state->nbTrans > exec->transno + 1) { 2136 xmlFARegExecSave(exec); 2137 } 2138 exec->transcount = 1; 2139 do { 2140 /* 2141 * Try to progress as much as possible on the input 2142 */ 2143 if (exec->transcount == atom->max) { 2144 break; 2145 } 2146 exec->index += len; 2147 /* 2148 * End of input: stop here 2149 */ 2150 if (exec->inputString[exec->index] == 0) { 2151 exec->index -= len; 2152 break; 2153 } 2154 if (exec->transcount >= atom->min) { 2155 int transno = exec->transno; 2156 xmlRegStatePtr state = exec->state; 2157 2158 /* 2159 * The transition is acceptable save it 2160 */ 2161 exec->transno = -1; /* trick */ 2162 exec->state = to; 2163 xmlFARegExecSave(exec); 2164 exec->transno = transno; 2165 exec->state = state; 2166 } 2167 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2168 len); 2169 ret = xmlRegCheckCharacter(atom, codepoint); 2170 exec->transcount++; 2171 } while (ret == 1); 2172 if (exec->transcount < atom->min) 2173 ret = 0; 2174 2175 /* 2176 * If the last check failed but one transition was found 2177 * possible, rollback 2178 */ 2179 if (ret < 0) 2180 ret = 0; 2181 if (ret == 0) { 2182 goto rollback; 2183 } 2184 } 2185 } 2186 if (ret == 1) { 2187 if (exec->state->nbTrans > exec->transno + 1) { 2188 xmlFARegExecSave(exec); 2189 } 2190 if (trans->counter >= 0) { 2191#ifdef DEBUG_REGEXP_EXEC 2192 printf("Increasing count %d\n", trans->counter); 2193#endif 2194 exec->counts[trans->counter]++; 2195 } 2196#ifdef DEBUG_REGEXP_EXEC 2197 printf("entering state %d\n", trans->to); 2198#endif 2199 exec->state = comp->states[trans->to]; 2200 exec->transno = 0; 2201 if (trans->atom != NULL) { 2202 exec->index += len; 2203 } 2204 goto progress; 2205 } else if (ret < 0) { 2206 exec->status = -4; 2207 break; 2208 } 2209 } 2210 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2211rollback: 2212 /* 2213 * Failed to find a way out 2214 */ 2215 exec->determinist = 0; 2216 xmlFARegExecRollBack(exec); 2217 } 2218progress: 2219 continue; 2220 } 2221 if (exec->rollbacks != NULL) { 2222 if (exec->counts != NULL) { 2223 int i; 2224 2225 for (i = 0;i < exec->maxRollbacks;i++) 2226 if (exec->rollbacks[i].counts != NULL) 2227 xmlFree(exec->rollbacks[i].counts); 2228 } 2229 xmlFree(exec->rollbacks); 2230 } 2231 if (exec->counts != NULL) 2232 xmlFree(exec->counts); 2233 if (exec->status == 0) 2234 return(1); 2235 if (exec->status == -1) 2236 return(0); 2237 return(exec->status); 2238} 2239 2240/************************************************************************ 2241 * * 2242 * Progressive interface to the verifyer one atom at a time * 2243 * * 2244 ************************************************************************/ 2245 2246/** 2247 * xmlRegExecCtxtPtr: 2248 * @comp: a precompiled regular expression 2249 * @callback: a callback function used for handling progresses in the 2250 * automata matching phase 2251 * @data: the context data associated to the callback in this context 2252 * 2253 * Build a context used for progressive evaluation of a regexp. 2254 */ 2255xmlRegExecCtxtPtr 2256xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 2257 xmlRegExecCtxtPtr exec; 2258 2259 if (comp == NULL) 2260 return(NULL); 2261 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 2262 if (exec == NULL) { 2263 return(NULL); 2264 } 2265 memset(exec, 0, sizeof(xmlRegExecCtxt)); 2266 exec->inputString = NULL; 2267 exec->index = 0; 2268 exec->determinist = 1; 2269 exec->maxRollbacks = 0; 2270 exec->nbRollbacks = 0; 2271 exec->rollbacks = NULL; 2272 exec->status = 0; 2273 exec->comp = comp; 2274 if (comp->compact == NULL) 2275 exec->state = comp->states[0]; 2276 exec->transno = 0; 2277 exec->transcount = 0; 2278 exec->callback = callback; 2279 exec->data = data; 2280 if (comp->nbCounters > 0) { 2281 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 2282 if (exec->counts == NULL) { 2283 xmlFree(exec); 2284 return(NULL); 2285 } 2286 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 2287 } else 2288 exec->counts = NULL; 2289 exec->inputStackMax = 0; 2290 exec->inputStackNr = 0; 2291 exec->inputStack = NULL; 2292 return(exec); 2293} 2294 2295/** 2296 * xmlRegFreeExecCtxt: 2297 * @exec: a regular expression evaulation context 2298 * 2299 * Free the structures associated to a regular expression evaulation context. 2300 */ 2301void 2302xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 2303 if (exec == NULL) 2304 return; 2305 2306 if (exec->rollbacks != NULL) { 2307 if (exec->counts != NULL) { 2308 int i; 2309 2310 for (i = 0;i < exec->maxRollbacks;i++) 2311 if (exec->rollbacks[i].counts != NULL) 2312 xmlFree(exec->rollbacks[i].counts); 2313 } 2314 xmlFree(exec->rollbacks); 2315 } 2316 if (exec->counts != NULL) 2317 xmlFree(exec->counts); 2318 if (exec->inputStack != NULL) { 2319 int i; 2320 2321 for (i = 0;i < exec->inputStackNr;i++) 2322 xmlFree(exec->inputStack[i].value); 2323 xmlFree(exec->inputStack); 2324 } 2325 xmlFree(exec); 2326} 2327 2328static void 2329xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2330 void *data) { 2331#ifdef DEBUG_PUSH 2332 printf("saving value: %d:%s\n", exec->inputStackNr, value); 2333#endif 2334 if (exec->inputStackMax == 0) { 2335 exec->inputStackMax = 4; 2336 exec->inputStack = (xmlRegInputTokenPtr) 2337 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 2338 if (exec->inputStack == NULL) { 2339 fprintf(stderr, "push input: allocation failed"); 2340 exec->inputStackMax = 0; 2341 return; 2342 } 2343 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 2344 xmlRegInputTokenPtr tmp; 2345 2346 exec->inputStackMax *= 2; 2347 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 2348 exec->inputStackMax * sizeof(xmlRegInputToken)); 2349 if (tmp == NULL) { 2350 fprintf(stderr, "push input: allocation failed"); 2351 exec->inputStackMax /= 2; 2352 return; 2353 } 2354 exec->inputStack = tmp; 2355 } 2356 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 2357 exec->inputStack[exec->inputStackNr].data = data; 2358 exec->inputStackNr++; 2359 exec->inputStack[exec->inputStackNr].value = NULL; 2360 exec->inputStack[exec->inputStackNr].data = NULL; 2361} 2362 2363 2364/** 2365 * xmlRegCompactPushString: 2366 * @exec: a regexp execution context 2367 * @comp: the precompiled exec with a compact table 2368 * @value: a string token input 2369 * @data: data associated to the token to reuse in callbacks 2370 * 2371 * Push one input token in the execution context 2372 * 2373 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2374 * a negative value in case of error. 2375 */ 2376static int 2377xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 2378 xmlRegexpPtr comp, 2379 const xmlChar *value, 2380 void *data) { 2381 int state = exec->index; 2382 int i, target; 2383 2384 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 2385 return(-1); 2386 2387 if (value == NULL) { 2388 /* 2389 * are we at a final state ? 2390 */ 2391 if (comp->compact[state * (comp->nbstrings + 1)] == 2392 XML_REGEXP_FINAL_STATE) 2393 return(1); 2394 return(0); 2395 } 2396 2397#ifdef DEBUG_PUSH 2398 printf("value pushed: %s\n", value); 2399#endif 2400 2401 /* 2402 * Examine all outside transition from current state 2403 */ 2404 for (i = 0;i < comp->nbstrings;i++) { 2405 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 2406 if ((target > 0) && (target <= comp->nbstates)) { 2407 target--; /* to avoid 0 */ 2408 if (xmlStrEqual(comp->stringMap[i], value)) { 2409 exec->index = target; 2410#ifdef DEBUG_PUSH 2411 printf("entering state %d\n", target); 2412#endif 2413 if (comp->compact[target * (comp->nbstrings + 1)] == 2414 XML_REGEXP_FINAL_STATE) 2415 return(1); 2416 return(0); 2417 } 2418 } 2419 } 2420 /* 2421 * Failed to find an exit transition out from current state for the 2422 * current token 2423 */ 2424#ifdef DEBUG_PUSH 2425 printf("failed to find a transition for %s on state %d\n", value, state); 2426#endif 2427 exec->status = -1; 2428 return(-1); 2429} 2430 2431/** 2432 * xmlRegExecPushString: 2433 * @exec: a regexp execution context 2434 * @value: a string token input 2435 * @data: data associated to the token to reuse in callbacks 2436 * 2437 * Push one input token in the execution context 2438 * 2439 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 2440 * a negative value in case of error. 2441 */ 2442int 2443xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 2444 void *data) { 2445 xmlRegTransPtr trans; 2446 xmlRegAtomPtr atom; 2447 int ret; 2448 int final = 0; 2449 2450 if (exec == NULL) 2451 return(-1); 2452 if (exec->comp == NULL) 2453 return(-1); 2454 if (exec->status != 0) 2455 return(exec->status); 2456 2457 if (exec->comp->compact != NULL) 2458 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 2459 2460 if (value == NULL) { 2461 if (exec->state->type == XML_REGEXP_FINAL_STATE) 2462 return(1); 2463 final = 1; 2464 } 2465 2466#ifdef DEBUG_PUSH 2467 printf("value pushed: %s\n", value); 2468#endif 2469 /* 2470 * If we have an active rollback stack push the new value there 2471 * and get back to where we were left 2472 */ 2473 if ((value != NULL) && (exec->inputStackNr > 0)) { 2474 xmlFARegExecSaveInputString(exec, value, data); 2475 value = exec->inputStack[exec->index].value; 2476 data = exec->inputStack[exec->index].data; 2477#ifdef DEBUG_PUSH 2478 printf("value loaded: %s\n", value); 2479#endif 2480 } 2481 2482 while ((exec->status == 0) && 2483 ((value != NULL) || 2484 ((final == 1) && 2485 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 2486 2487 /* 2488 * End of input on non-terminal state, rollback, however we may 2489 * still have epsilon like transition for counted transitions 2490 * on counters, in that case don't break too early. 2491 */ 2492 if ((value == NULL) && (exec->counts == NULL)) 2493 goto rollback; 2494 2495 exec->transcount = 0; 2496 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2497 trans = &exec->state->trans[exec->transno]; 2498 if (trans->to < 0) 2499 continue; 2500 atom = trans->atom; 2501 ret = 0; 2502 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 2503 int i; 2504 int count; 2505 xmlRegTransPtr t; 2506 xmlRegCounterPtr counter; 2507 2508 ret = 0; 2509 2510#ifdef DEBUG_PUSH 2511 printf("testing all lax %d\n", trans->count); 2512#endif 2513 /* 2514 * Check all counted transitions from the current state 2515 */ 2516 if ((value == NULL) && (final)) { 2517 ret = 1; 2518 } else if (value != NULL) { 2519 for (i = 0;i < exec->state->nbTrans;i++) { 2520 t = &exec->state->trans[i]; 2521 if ((t->counter < 0) || (t == trans)) 2522 continue; 2523 counter = &exec->comp->counters[t->counter]; 2524 count = exec->counts[t->counter]; 2525 if ((count < counter->max) && 2526 (t->atom != NULL) && 2527 (xmlStrEqual(value, t->atom->valuep))) { 2528 ret = 0; 2529 break; 2530 } 2531 if ((count >= counter->min) && 2532 (count < counter->max) && 2533 (xmlStrEqual(value, t->atom->valuep))) { 2534 ret = 1; 2535 break; 2536 } 2537 } 2538 } 2539 } else if (trans->count == REGEXP_ALL_COUNTER) { 2540 int i; 2541 int count; 2542 xmlRegTransPtr t; 2543 xmlRegCounterPtr counter; 2544 2545 ret = 1; 2546 2547#ifdef DEBUG_PUSH 2548 printf("testing all %d\n", trans->count); 2549#endif 2550 /* 2551 * Check all counted transitions from the current state 2552 */ 2553 for (i = 0;i < exec->state->nbTrans;i++) { 2554 t = &exec->state->trans[i]; 2555 if ((t->counter < 0) || (t == trans)) 2556 continue; 2557 counter = &exec->comp->counters[t->counter]; 2558 count = exec->counts[t->counter]; 2559 if ((count < counter->min) || (count > counter->max)) { 2560 ret = 0; 2561 break; 2562 } 2563 } 2564 } else if (trans->count >= 0) { 2565 int count; 2566 xmlRegCounterPtr counter; 2567 2568 /* 2569 * A counted transition. 2570 */ 2571 2572 count = exec->counts[trans->count]; 2573 counter = &exec->comp->counters[trans->count]; 2574#ifdef DEBUG_PUSH 2575 printf("testing count %d: val %d, min %d, max %d\n", 2576 trans->count, count, counter->min, counter->max); 2577#endif 2578 ret = ((count >= counter->min) && (count <= counter->max)); 2579 } else if (atom == NULL) { 2580 fprintf(stderr, "epsilon transition left at runtime\n"); 2581 exec->status = -2; 2582 break; 2583 } else if (value != NULL) { 2584 ret = xmlStrEqual(value, atom->valuep); 2585 if ((ret == 1) && (trans->counter >= 0)) { 2586 xmlRegCounterPtr counter; 2587 int count; 2588 2589 count = exec->counts[trans->counter]; 2590 counter = &exec->comp->counters[trans->counter]; 2591 if (count >= counter->max) 2592 ret = 0; 2593 } 2594 2595 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2596 xmlRegStatePtr to = exec->comp->states[trans->to]; 2597 2598 /* 2599 * this is a multiple input sequence 2600 */ 2601 if (exec->state->nbTrans > exec->transno + 1) { 2602 if (exec->inputStackNr <= 0) { 2603 xmlFARegExecSaveInputString(exec, value, data); 2604 } 2605 xmlFARegExecSave(exec); 2606 } 2607 exec->transcount = 1; 2608 do { 2609 /* 2610 * Try to progress as much as possible on the input 2611 */ 2612 if (exec->transcount == atom->max) { 2613 break; 2614 } 2615 exec->index++; 2616 value = exec->inputStack[exec->index].value; 2617 data = exec->inputStack[exec->index].data; 2618#ifdef DEBUG_PUSH 2619 printf("value loaded: %s\n", value); 2620#endif 2621 2622 /* 2623 * End of input: stop here 2624 */ 2625 if (value == NULL) { 2626 exec->index --; 2627 break; 2628 } 2629 if (exec->transcount >= atom->min) { 2630 int transno = exec->transno; 2631 xmlRegStatePtr state = exec->state; 2632 2633 /* 2634 * The transition is acceptable save it 2635 */ 2636 exec->transno = -1; /* trick */ 2637 exec->state = to; 2638 if (exec->inputStackNr <= 0) { 2639 xmlFARegExecSaveInputString(exec, value, data); 2640 } 2641 xmlFARegExecSave(exec); 2642 exec->transno = transno; 2643 exec->state = state; 2644 } 2645 ret = xmlStrEqual(value, atom->valuep); 2646 exec->transcount++; 2647 } while (ret == 1); 2648 if (exec->transcount < atom->min) 2649 ret = 0; 2650 2651 /* 2652 * If the last check failed but one transition was found 2653 * possible, rollback 2654 */ 2655 if (ret < 0) 2656 ret = 0; 2657 if (ret == 0) { 2658 goto rollback; 2659 } 2660 } 2661 } 2662 if (ret == 1) { 2663 if ((exec->callback != NULL) && (atom != NULL)) { 2664 exec->callback(exec->data, atom->valuep, 2665 atom->data, data); 2666 } 2667 if (exec->state->nbTrans > exec->transno + 1) { 2668 if (exec->inputStackNr <= 0) { 2669 xmlFARegExecSaveInputString(exec, value, data); 2670 } 2671 xmlFARegExecSave(exec); 2672 } 2673 if (trans->counter >= 0) { 2674#ifdef DEBUG_PUSH 2675 printf("Increasing count %d\n", trans->counter); 2676#endif 2677 exec->counts[trans->counter]++; 2678 } 2679#ifdef DEBUG_PUSH 2680 printf("entering state %d\n", trans->to); 2681#endif 2682 exec->state = exec->comp->states[trans->to]; 2683 exec->transno = 0; 2684 if (trans->atom != NULL) { 2685 if (exec->inputStack != NULL) { 2686 exec->index++; 2687 if (exec->index < exec->inputStackNr) { 2688 value = exec->inputStack[exec->index].value; 2689 data = exec->inputStack[exec->index].data; 2690#ifdef DEBUG_PUSH 2691 printf("value loaded: %s\n", value); 2692#endif 2693 } else { 2694 value = NULL; 2695 data = NULL; 2696#ifdef DEBUG_PUSH 2697 printf("end of input\n"); 2698#endif 2699 } 2700 } else { 2701 value = NULL; 2702 data = NULL; 2703#ifdef DEBUG_PUSH 2704 printf("end of input\n"); 2705#endif 2706 } 2707 } 2708 goto progress; 2709 } else if (ret < 0) { 2710 exec->status = -4; 2711 break; 2712 } 2713 } 2714 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2715rollback: 2716 /* 2717 * Failed to find a way out 2718 */ 2719 exec->determinist = 0; 2720 xmlFARegExecRollBack(exec); 2721 if (exec->status == 0) { 2722 value = exec->inputStack[exec->index].value; 2723 data = exec->inputStack[exec->index].data; 2724#ifdef DEBUG_PUSH 2725 printf("value loaded: %s\n", value); 2726#endif 2727 } 2728 } 2729progress: 2730 continue; 2731 } 2732 if (exec->status == 0) { 2733 return(exec->state->type == XML_REGEXP_FINAL_STATE); 2734 } 2735 return(exec->status); 2736} 2737 2738#if 0 2739static int 2740xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 2741 xmlRegTransPtr trans; 2742 xmlRegAtomPtr atom; 2743 int ret; 2744 int codepoint, len; 2745 2746 if (exec == NULL) 2747 return(-1); 2748 if (exec->status != 0) 2749 return(exec->status); 2750 2751 while ((exec->status == 0) && 2752 ((exec->inputString[exec->index] != 0) || 2753 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2754 2755 /* 2756 * End of input on non-terminal state, rollback, however we may 2757 * still have epsilon like transition for counted transitions 2758 * on counters, in that case don't break too early. 2759 */ 2760 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2761 goto rollback; 2762 2763 exec->transcount = 0; 2764 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2765 trans = &exec->state->trans[exec->transno]; 2766 if (trans->to < 0) 2767 continue; 2768 atom = trans->atom; 2769 ret = 0; 2770 if (trans->count >= 0) { 2771 int count; 2772 xmlRegCounterPtr counter; 2773 2774 /* 2775 * A counted transition. 2776 */ 2777 2778 count = exec->counts[trans->count]; 2779 counter = &exec->comp->counters[trans->count]; 2780#ifdef DEBUG_REGEXP_EXEC 2781 printf("testing count %d: val %d, min %d, max %d\n", 2782 trans->count, count, counter->min, counter->max); 2783#endif 2784 ret = ((count >= counter->min) && (count <= counter->max)); 2785 } else if (atom == NULL) { 2786 fprintf(stderr, "epsilon transition left at runtime\n"); 2787 exec->status = -2; 2788 break; 2789 } else if (exec->inputString[exec->index] != 0) { 2790 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2791 ret = xmlRegCheckCharacter(atom, codepoint); 2792 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2793 xmlRegStatePtr to = exec->comp->states[trans->to]; 2794 2795 /* 2796 * this is a multiple input sequence 2797 */ 2798 if (exec->state->nbTrans > exec->transno + 1) { 2799 xmlFARegExecSave(exec); 2800 } 2801 exec->transcount = 1; 2802 do { 2803 /* 2804 * Try to progress as much as possible on the input 2805 */ 2806 if (exec->transcount == atom->max) { 2807 break; 2808 } 2809 exec->index += len; 2810 /* 2811 * End of input: stop here 2812 */ 2813 if (exec->inputString[exec->index] == 0) { 2814 exec->index -= len; 2815 break; 2816 } 2817 if (exec->transcount >= atom->min) { 2818 int transno = exec->transno; 2819 xmlRegStatePtr state = exec->state; 2820 2821 /* 2822 * The transition is acceptable save it 2823 */ 2824 exec->transno = -1; /* trick */ 2825 exec->state = to; 2826 xmlFARegExecSave(exec); 2827 exec->transno = transno; 2828 exec->state = state; 2829 } 2830 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2831 len); 2832 ret = xmlRegCheckCharacter(atom, codepoint); 2833 exec->transcount++; 2834 } while (ret == 1); 2835 if (exec->transcount < atom->min) 2836 ret = 0; 2837 2838 /* 2839 * If the last check failed but one transition was found 2840 * possible, rollback 2841 */ 2842 if (ret < 0) 2843 ret = 0; 2844 if (ret == 0) { 2845 goto rollback; 2846 } 2847 } 2848 } 2849 if (ret == 1) { 2850 if (exec->state->nbTrans > exec->transno + 1) { 2851 xmlFARegExecSave(exec); 2852 } 2853 if (trans->counter >= 0) { 2854#ifdef DEBUG_REGEXP_EXEC 2855 printf("Increasing count %d\n", trans->counter); 2856#endif 2857 exec->counts[trans->counter]++; 2858 } 2859#ifdef DEBUG_REGEXP_EXEC 2860 printf("entering state %d\n", trans->to); 2861#endif 2862 exec->state = exec->comp->states[trans->to]; 2863 exec->transno = 0; 2864 if (trans->atom != NULL) { 2865 exec->index += len; 2866 } 2867 goto progress; 2868 } else if (ret < 0) { 2869 exec->status = -4; 2870 break; 2871 } 2872 } 2873 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2874rollback: 2875 /* 2876 * Failed to find a way out 2877 */ 2878 exec->determinist = 0; 2879 xmlFARegExecRollBack(exec); 2880 } 2881progress: 2882 continue; 2883 } 2884} 2885#endif 2886/************************************************************************ 2887 * * 2888 * Parser for the Shemas Datatype Regular Expressions * 2889 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 2890 * * 2891 ************************************************************************/ 2892 2893/** 2894 * xmlFAIsChar: 2895 * @ctxt: a regexp parser context 2896 * 2897 * [10] Char ::= [^.\?*+()|#x5B#x5D] 2898 */ 2899static int 2900xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 2901 int cur; 2902 int len; 2903 2904 cur = CUR_SCHAR(ctxt->cur, len); 2905 if ((cur == '.') || (cur == '\\') || (cur == '?') || 2906 (cur == '*') || (cur == '+') || (cur == '(') || 2907 (cur == ')') || (cur == '|') || (cur == 0x5B) || 2908 (cur == 0x5D) || (cur == 0)) 2909 return(-1); 2910 return(cur); 2911} 2912 2913/** 2914 * xmlFAParseCharProp: 2915 * @ctxt: a regexp parser context 2916 * 2917 * [27] charProp ::= IsCategory | IsBlock 2918 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 2919 * Separators | Symbols | Others 2920 * [29] Letters ::= 'L' [ultmo]? 2921 * [30] Marks ::= 'M' [nce]? 2922 * [31] Numbers ::= 'N' [dlo]? 2923 * [32] Punctuation ::= 'P' [cdseifo]? 2924 * [33] Separators ::= 'Z' [slp]? 2925 * [34] Symbols ::= 'S' [mcko]? 2926 * [35] Others ::= 'C' [cfon]? 2927 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 2928 */ 2929static void 2930xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 2931 int cur; 2932 xmlRegAtomType type = 0; 2933 xmlChar *blockName = NULL; 2934 2935 cur = CUR; 2936 if (cur == 'L') { 2937 NEXT; 2938 cur = CUR; 2939 if (cur == 'u') { 2940 NEXT; 2941 type = XML_REGEXP_LETTER_UPPERCASE; 2942 } else if (cur == 'l') { 2943 NEXT; 2944 type = XML_REGEXP_LETTER_LOWERCASE; 2945 } else if (cur == 't') { 2946 NEXT; 2947 type = XML_REGEXP_LETTER_TITLECASE; 2948 } else if (cur == 'm') { 2949 NEXT; 2950 type = XML_REGEXP_LETTER_MODIFIER; 2951 } else if (cur == 'o') { 2952 NEXT; 2953 type = XML_REGEXP_LETTER_OTHERS; 2954 } else { 2955 type = XML_REGEXP_LETTER; 2956 } 2957 } else if (cur == 'M') { 2958 NEXT; 2959 cur = CUR; 2960 if (cur == 'n') { 2961 NEXT; 2962 /* nonspacing */ 2963 type = XML_REGEXP_MARK_NONSPACING; 2964 } else if (cur == 'c') { 2965 NEXT; 2966 /* spacing combining */ 2967 type = XML_REGEXP_MARK_SPACECOMBINING; 2968 } else if (cur == 'e') { 2969 NEXT; 2970 /* enclosing */ 2971 type = XML_REGEXP_MARK_ENCLOSING; 2972 } else { 2973 /* all marks */ 2974 type = XML_REGEXP_MARK; 2975 } 2976 } else if (cur == 'N') { 2977 NEXT; 2978 cur = CUR; 2979 if (cur == 'd') { 2980 NEXT; 2981 /* digital */ 2982 type = XML_REGEXP_NUMBER_DECIMAL; 2983 } else if (cur == 'l') { 2984 NEXT; 2985 /* letter */ 2986 type = XML_REGEXP_NUMBER_LETTER; 2987 } else if (cur == 'o') { 2988 NEXT; 2989 /* other */ 2990 type = XML_REGEXP_NUMBER_OTHERS; 2991 } else { 2992 /* all numbers */ 2993 type = XML_REGEXP_NUMBER; 2994 } 2995 } else if (cur == 'P') { 2996 NEXT; 2997 cur = CUR; 2998 if (cur == 'c') { 2999 NEXT; 3000 /* connector */ 3001 type = XML_REGEXP_PUNCT_CONNECTOR; 3002 } else if (cur == 'd') { 3003 NEXT; 3004 /* dash */ 3005 type = XML_REGEXP_PUNCT_DASH; 3006 } else if (cur == 's') { 3007 NEXT; 3008 /* open */ 3009 type = XML_REGEXP_PUNCT_OPEN; 3010 } else if (cur == 'e') { 3011 NEXT; 3012 /* close */ 3013 type = XML_REGEXP_PUNCT_CLOSE; 3014 } else if (cur == 'i') { 3015 NEXT; 3016 /* initial quote */ 3017 type = XML_REGEXP_PUNCT_INITQUOTE; 3018 } else if (cur == 'f') { 3019 NEXT; 3020 /* final quote */ 3021 type = XML_REGEXP_PUNCT_FINQUOTE; 3022 } else if (cur == 'o') { 3023 NEXT; 3024 /* other */ 3025 type = XML_REGEXP_PUNCT_OTHERS; 3026 } else { 3027 /* all punctuation */ 3028 type = XML_REGEXP_PUNCT; 3029 } 3030 } else if (cur == 'Z') { 3031 NEXT; 3032 cur = CUR; 3033 if (cur == 's') { 3034 NEXT; 3035 /* space */ 3036 type = XML_REGEXP_SEPAR_SPACE; 3037 } else if (cur == 'l') { 3038 NEXT; 3039 /* line */ 3040 type = XML_REGEXP_SEPAR_LINE; 3041 } else if (cur == 'p') { 3042 NEXT; 3043 /* paragraph */ 3044 type = XML_REGEXP_SEPAR_PARA; 3045 } else { 3046 /* all separators */ 3047 type = XML_REGEXP_SEPAR; 3048 } 3049 } else if (cur == 'S') { 3050 NEXT; 3051 cur = CUR; 3052 if (cur == 'm') { 3053 NEXT; 3054 type = XML_REGEXP_SYMBOL_MATH; 3055 /* math */ 3056 } else if (cur == 'c') { 3057 NEXT; 3058 type = XML_REGEXP_SYMBOL_CURRENCY; 3059 /* currency */ 3060 } else if (cur == 'k') { 3061 NEXT; 3062 type = XML_REGEXP_SYMBOL_MODIFIER; 3063 /* modifiers */ 3064 } else if (cur == 'o') { 3065 NEXT; 3066 type = XML_REGEXP_SYMBOL_OTHERS; 3067 /* other */ 3068 } else { 3069 /* all symbols */ 3070 type = XML_REGEXP_SYMBOL; 3071 } 3072 } else if (cur == 'C') { 3073 NEXT; 3074 cur = CUR; 3075 if (cur == 'c') { 3076 NEXT; 3077 /* control */ 3078 type = XML_REGEXP_OTHER_CONTROL; 3079 } else if (cur == 'f') { 3080 NEXT; 3081 /* format */ 3082 type = XML_REGEXP_OTHER_FORMAT; 3083 } else if (cur == 'o') { 3084 NEXT; 3085 /* private use */ 3086 type = XML_REGEXP_OTHER_PRIVATE; 3087 } else if (cur == 'n') { 3088 NEXT; 3089 /* not assigned */ 3090 type = XML_REGEXP_OTHER_NA; 3091 } else { 3092 /* all others */ 3093 type = XML_REGEXP_OTHER; 3094 } 3095 } else if (cur == 'I') { 3096 const xmlChar *start; 3097 NEXT; 3098 cur = CUR; 3099 if (cur != 's') { 3100 ERROR("IsXXXX expected"); 3101 return; 3102 } 3103 NEXT; 3104 start = ctxt->cur; 3105 cur = CUR; 3106 if (((cur >= 'a') && (cur <= 'z')) || 3107 ((cur >= 'A') && (cur <= 'Z')) || 3108 ((cur >= '0') && (cur <= '9')) || 3109 (cur == 0x2D)) { 3110 NEXT; 3111 cur = CUR; 3112 while (((cur >= 'a') && (cur <= 'z')) || 3113 ((cur >= 'A') && (cur <= 'Z')) || 3114 ((cur >= '0') && (cur <= '9')) || 3115 (cur == 0x2D)) { 3116 NEXT; 3117 cur = CUR; 3118 } 3119 } 3120 type = XML_REGEXP_BLOCK_NAME; 3121 blockName = xmlStrndup(start, ctxt->cur - start); 3122 } else { 3123 ERROR("Unknown char property"); 3124 return; 3125 } 3126 if (ctxt->atom == NULL) { 3127 ctxt->atom = xmlRegNewAtom(ctxt, type); 3128 if (ctxt->atom != NULL) 3129 ctxt->atom->valuep = blockName; 3130 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3131 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3132 type, 0, 0, blockName); 3133 } 3134} 3135 3136/** 3137 * xmlFAParseCharClassEsc: 3138 * @ctxt: a regexp parser context 3139 * 3140 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 3141 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 3142 * [25] catEsc ::= '\p{' charProp '}' 3143 * [26] complEsc ::= '\P{' charProp '}' 3144 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 3145 */ 3146static void 3147xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 3148 int cur; 3149 3150 if (CUR == '.') { 3151 if (ctxt->atom == NULL) { 3152 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 3153 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3154 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3155 XML_REGEXP_ANYCHAR, 0, 0, NULL); 3156 } 3157 NEXT; 3158 return; 3159 } 3160 if (CUR != '\\') { 3161 ERROR("Escaped sequence: expecting \\"); 3162 return; 3163 } 3164 NEXT; 3165 cur = CUR; 3166 if (cur == 'p') { 3167 NEXT; 3168 if (CUR != '{') { 3169 ERROR("Expecting '{'"); 3170 return; 3171 } 3172 NEXT; 3173 xmlFAParseCharProp(ctxt); 3174 if (CUR != '}') { 3175 ERROR("Expecting '}'"); 3176 return; 3177 } 3178 NEXT; 3179 } else if (cur == 'P') { 3180 NEXT; 3181 if (CUR != '{') { 3182 ERROR("Expecting '{'"); 3183 return; 3184 } 3185 NEXT; 3186 xmlFAParseCharProp(ctxt); 3187 ctxt->atom->neg = 1; 3188 if (CUR != '}') { 3189 ERROR("Expecting '}'"); 3190 return; 3191 } 3192 NEXT; 3193 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 3194 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 3195 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 3196 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 3197 (cur == 0x5E)) { 3198 if (ctxt->atom == NULL) { 3199 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3200 if (ctxt->atom != NULL) 3201 ctxt->atom->codepoint = cur; 3202 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3203 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3204 XML_REGEXP_CHARVAL, cur, cur, NULL); 3205 } 3206 NEXT; 3207 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 3208 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 3209 (cur == 'w') || (cur == 'W')) { 3210 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 3211 3212 switch (cur) { 3213 case 's': 3214 type = XML_REGEXP_ANYSPACE; 3215 break; 3216 case 'S': 3217 type = XML_REGEXP_NOTSPACE; 3218 break; 3219 case 'i': 3220 type = XML_REGEXP_INITNAME; 3221 break; 3222 case 'I': 3223 type = XML_REGEXP_NOTINITNAME; 3224 break; 3225 case 'c': 3226 type = XML_REGEXP_NAMECHAR; 3227 break; 3228 case 'C': 3229 type = XML_REGEXP_NOTNAMECHAR; 3230 break; 3231 case 'd': 3232 type = XML_REGEXP_DECIMAL; 3233 break; 3234 case 'D': 3235 type = XML_REGEXP_NOTDECIMAL; 3236 break; 3237 case 'w': 3238 type = XML_REGEXP_REALCHAR; 3239 break; 3240 case 'W': 3241 type = XML_REGEXP_NOTREALCHAR; 3242 break; 3243 } 3244 NEXT; 3245 if (ctxt->atom == NULL) { 3246 ctxt->atom = xmlRegNewAtom(ctxt, type); 3247 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 3248 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3249 type, 0, 0, NULL); 3250 } 3251 } 3252} 3253 3254/** 3255 * xmlFAParseCharRef: 3256 * @ctxt: a regexp parser context 3257 * 3258 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 3259 */ 3260static int 3261xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 3262 int ret = 0, cur; 3263 3264 if ((CUR != '&') || (NXT(1) != '#')) 3265 return(-1); 3266 NEXT; 3267 NEXT; 3268 cur = CUR; 3269 if (cur == 'x') { 3270 NEXT; 3271 cur = CUR; 3272 if (((cur >= '0') && (cur <= '9')) || 3273 ((cur >= 'a') && (cur <= 'f')) || 3274 ((cur >= 'A') && (cur <= 'F'))) { 3275 while (((cur >= '0') && (cur <= '9')) || 3276 ((cur >= 'A') && (cur <= 'F'))) { 3277 if ((cur >= '0') && (cur <= '9')) 3278 ret = ret * 16 + cur - '0'; 3279 else if ((cur >= 'a') && (cur <= 'f')) 3280 ret = ret * 16 + 10 + (cur - 'a'); 3281 else 3282 ret = ret * 16 + 10 + (cur - 'A'); 3283 NEXT; 3284 cur = CUR; 3285 } 3286 } else { 3287 ERROR("Char ref: expecting [0-9A-F]"); 3288 return(-1); 3289 } 3290 } else { 3291 if ((cur >= '0') && (cur <= '9')) { 3292 while ((cur >= '0') && (cur <= '9')) { 3293 ret = ret * 10 + cur - '0'; 3294 NEXT; 3295 cur = CUR; 3296 } 3297 } else { 3298 ERROR("Char ref: expecting [0-9]"); 3299 return(-1); 3300 } 3301 } 3302 if (cur != ';') { 3303 ERROR("Char ref: expecting ';'"); 3304 return(-1); 3305 } else { 3306 NEXT; 3307 } 3308 return(ret); 3309} 3310 3311/** 3312 * xmlFAParseCharRange: 3313 * @ctxt: a regexp parser context 3314 * 3315 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 3316 * [18] seRange ::= charOrEsc '-' charOrEsc 3317 * [20] charOrEsc ::= XmlChar | SingleCharEsc 3318 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 3319 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 3320 */ 3321static void 3322xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 3323 int cur; 3324 int start = -1; 3325 int end = -1; 3326 3327 if ((CUR == '&') && (NXT(1) == '#')) { 3328 end = start = xmlFAParseCharRef(ctxt); 3329 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3330 XML_REGEXP_CHARVAL, start, end, NULL); 3331 return; 3332 } 3333 cur = CUR; 3334 if (cur == '\\') { 3335 NEXT; 3336 cur = CUR; 3337 switch (cur) { 3338 case 'n': start = 0xA; break; 3339 case 'r': start = 0xD; break; 3340 case 't': start = 0x9; break; 3341 case '\\': case '|': case '.': case '-': case '^': case '?': 3342 case '*': case '+': case '{': case '}': case '(': case ')': 3343 case '[': case ']': 3344 start = cur; break; 3345 default: 3346 ERROR("Invalid escape value"); 3347 return; 3348 } 3349 end = start; 3350 } else if ((cur != 0x5B) && (cur != 0x5D)) { 3351 end = start = cur; 3352 } else { 3353 ERROR("Expecting a char range"); 3354 return; 3355 } 3356 NEXT; 3357 if (start == '-') { 3358 return; 3359 } 3360 cur = CUR; 3361 if (cur != '-') { 3362 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3363 XML_REGEXP_CHARVAL, start, end, NULL); 3364 return; 3365 } 3366 NEXT; 3367 cur = CUR; 3368 if (cur == '\\') { 3369 NEXT; 3370 cur = CUR; 3371 switch (cur) { 3372 case 'n': end = 0xA; break; 3373 case 'r': end = 0xD; break; 3374 case 't': end = 0x9; break; 3375 case '\\': case '|': case '.': case '-': case '^': case '?': 3376 case '*': case '+': case '{': case '}': case '(': case ')': 3377 case '[': case ']': 3378 end = cur; break; 3379 default: 3380 ERROR("Invalid escape value"); 3381 return; 3382 } 3383 } else if ((cur != 0x5B) && (cur != 0x5D)) { 3384 end = cur; 3385 } else { 3386 ERROR("Expecting the end of a char range"); 3387 return; 3388 } 3389 NEXT; 3390 /* TODO check that the values are acceptable character ranges for XML */ 3391 if (end < start) { 3392 ERROR("End of range is before start of range"); 3393 } else { 3394 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 3395 XML_REGEXP_CHARVAL, start, end, NULL); 3396 } 3397 return; 3398} 3399 3400/** 3401 * xmlFAParsePosCharGroup: 3402 * @ctxt: a regexp parser context 3403 * 3404 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 3405 */ 3406static void 3407xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 3408 do { 3409 if ((CUR == '\\') || (CUR == '.')) { 3410 xmlFAParseCharClassEsc(ctxt); 3411 } else { 3412 xmlFAParseCharRange(ctxt); 3413 } 3414 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 3415 (ctxt->error == 0)); 3416} 3417 3418/** 3419 * xmlFAParseCharGroup: 3420 * @ctxt: a regexp parser context 3421 * 3422 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 3423 * [15] negCharGroup ::= '^' posCharGroup 3424 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 3425 * [12] charClassExpr ::= '[' charGroup ']' 3426 */ 3427static void 3428xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 3429 int n = ctxt->neg; 3430 while ((CUR != ']') && (ctxt->error == 0)) { 3431 if (CUR == '^') { 3432 int neg = ctxt->neg; 3433 3434 NEXT; 3435 ctxt->neg = !ctxt->neg; 3436 xmlFAParsePosCharGroup(ctxt); 3437 ctxt->neg = neg; 3438 } else if (CUR == '-') { 3439 NEXT; 3440 ctxt->neg = !ctxt->neg; 3441 if (CUR != '[') { 3442 ERROR("charClassExpr: '[' expected"); 3443 break; 3444 } 3445 NEXT; 3446 xmlFAParseCharGroup(ctxt); 3447 if (CUR == ']') { 3448 NEXT; 3449 } else { 3450 ERROR("charClassExpr: ']' expected"); 3451 break; 3452 } 3453 break; 3454 } else if (CUR != ']') { 3455 xmlFAParsePosCharGroup(ctxt); 3456 } 3457 } 3458 ctxt->neg = n; 3459} 3460 3461/** 3462 * xmlFAParseCharClass: 3463 * @ctxt: a regexp parser context 3464 * 3465 * [11] charClass ::= charClassEsc | charClassExpr 3466 * [12] charClassExpr ::= '[' charGroup ']' 3467 */ 3468static void 3469xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 3470 if (CUR == '[') { 3471 NEXT; 3472 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 3473 if (ctxt->atom == NULL) 3474 return; 3475 xmlFAParseCharGroup(ctxt); 3476 if (CUR == ']') { 3477 NEXT; 3478 } else { 3479 ERROR("xmlFAParseCharClass: ']' expected"); 3480 } 3481 } else { 3482 xmlFAParseCharClassEsc(ctxt); 3483 } 3484} 3485 3486/** 3487 * xmlFAParseQuantExact: 3488 * @ctxt: a regexp parser context 3489 * 3490 * [8] QuantExact ::= [0-9]+ 3491 */ 3492static int 3493xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 3494 int ret = 0; 3495 int ok = 0; 3496 3497 while ((CUR >= '0') && (CUR <= '9')) { 3498 ret = ret * 10 + (CUR - '0'); 3499 ok = 1; 3500 NEXT; 3501 } 3502 if (ok != 1) { 3503 return(-1); 3504 } 3505 return(ret); 3506} 3507 3508/** 3509 * xmlFAParseQuantifier: 3510 * @ctxt: a regexp parser context 3511 * 3512 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 3513 * [5] quantity ::= quantRange | quantMin | QuantExact 3514 * [6] quantRange ::= QuantExact ',' QuantExact 3515 * [7] quantMin ::= QuantExact ',' 3516 * [8] QuantExact ::= [0-9]+ 3517 */ 3518static int 3519xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 3520 int cur; 3521 3522 cur = CUR; 3523 if ((cur == '?') || (cur == '*') || (cur == '+')) { 3524 if (ctxt->atom != NULL) { 3525 if (cur == '?') 3526 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 3527 else if (cur == '*') 3528 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 3529 else if (cur == '+') 3530 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 3531 } 3532 NEXT; 3533 return(1); 3534 } 3535 if (cur == '{') { 3536 int min = 0, max = 0; 3537 3538 NEXT; 3539 cur = xmlFAParseQuantExact(ctxt); 3540 if (cur >= 0) 3541 min = cur; 3542 if (CUR == ',') { 3543 NEXT; 3544 cur = xmlFAParseQuantExact(ctxt); 3545 if (cur >= 0) 3546 max = cur; 3547 } 3548 if (CUR == '}') { 3549 NEXT; 3550 } else { 3551 ERROR("Unterminated quantifier"); 3552 } 3553 if (max == 0) 3554 max = min; 3555 if (ctxt->atom != NULL) { 3556 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 3557 ctxt->atom->min = min; 3558 ctxt->atom->max = max; 3559 } 3560 return(1); 3561 } 3562 return(0); 3563} 3564 3565/** 3566 * xmlFAParseAtom: 3567 * @ctxt: a regexp parser context 3568 * 3569 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 3570 */ 3571static int 3572xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 3573 int codepoint, len; 3574 3575 codepoint = xmlFAIsChar(ctxt); 3576 if (codepoint > 0) { 3577 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3578 if (ctxt->atom == NULL) 3579 return(-1); 3580 codepoint = CUR_SCHAR(ctxt->cur, len); 3581 ctxt->atom->codepoint = codepoint; 3582 NEXTL(len); 3583 return(1); 3584 } else if (CUR == '|') { 3585 return(0); 3586 } else if (CUR == 0) { 3587 return(0); 3588 } else if (CUR == ')') { 3589 return(0); 3590 } else if (CUR == '(') { 3591 xmlRegStatePtr start, oldend; 3592 3593 NEXT; 3594 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 3595 start = ctxt->state; 3596 oldend = ctxt->end; 3597 ctxt->end = NULL; 3598 ctxt->atom = NULL; 3599 xmlFAParseRegExp(ctxt, 0); 3600 if (CUR == ')') { 3601 NEXT; 3602 } else { 3603 ERROR("xmlFAParseAtom: expecting ')'"); 3604 } 3605 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 3606 if (ctxt->atom == NULL) 3607 return(-1); 3608 ctxt->atom->start = start; 3609 ctxt->atom->stop = ctxt->state; 3610 ctxt->end = oldend; 3611 return(1); 3612 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 3613 xmlFAParseCharClass(ctxt); 3614 return(1); 3615 } 3616 return(0); 3617} 3618 3619/** 3620 * xmlFAParsePiece: 3621 * @ctxt: a regexp parser context 3622 * 3623 * [3] piece ::= atom quantifier? 3624 */ 3625static int 3626xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 3627 int ret; 3628 3629 ctxt->atom = NULL; 3630 ret = xmlFAParseAtom(ctxt); 3631 if (ret == 0) 3632 return(0); 3633 if (ctxt->atom == NULL) { 3634 ERROR("internal: no atom generated"); 3635 } 3636 xmlFAParseQuantifier(ctxt); 3637 return(1); 3638} 3639 3640/** 3641 * xmlFAParseBranch: 3642 * @ctxt: a regexp parser context 3643 * @first: is taht the first 3644 * 3645 * [2] branch ::= piece* 3646 */ 3647static void 3648xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) { 3649 xmlRegStatePtr previous; 3650 xmlRegAtomPtr prevatom = NULL; 3651 int ret; 3652 3653 previous = ctxt->state; 3654 ret = xmlFAParsePiece(ctxt); 3655 if (ret != 0) { 3656 if (first) { 3657 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); 3658 previous = ctxt->state; 3659 } else { 3660 prevatom = ctxt->atom; 3661 } 3662 ctxt->atom = NULL; 3663 } 3664 while ((ret != 0) && (ctxt->error == 0)) { 3665 ret = xmlFAParsePiece(ctxt); 3666 if (ret != 0) { 3667 if (first) { 3668 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); 3669 } else { 3670 xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom); 3671 prevatom = ctxt->atom; 3672 } 3673 previous = ctxt->state; 3674 ctxt->atom = NULL; 3675 } 3676 } 3677 if (!first) { 3678 xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom); 3679 } 3680} 3681 3682/** 3683 * xmlFAParseRegExp: 3684 * @ctxt: a regexp parser context 3685 * @top: is that the top-level expressions ? 3686 * 3687 * [1] regExp ::= branch ( '|' branch )* 3688 */ 3689static void 3690xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 3691 xmlRegStatePtr start, end, oldend; 3692 3693 oldend = ctxt->end; 3694 3695 start = ctxt->state; 3696 xmlFAParseBranch(ctxt, (ctxt->end == NULL)); 3697 if (CUR != '|') { 3698 ctxt->end = ctxt->state; 3699 return; 3700 } 3701 end = ctxt->state; 3702 while ((CUR == '|') && (ctxt->error == 0)) { 3703 NEXT; 3704 ctxt->state = start; 3705 ctxt->end = end; 3706 xmlFAParseBranch(ctxt, 0); 3707 } 3708 if (!top) 3709 ctxt->end = oldend; 3710} 3711 3712/************************************************************************ 3713 * * 3714 * The basic API * 3715 * * 3716 ************************************************************************/ 3717 3718/** 3719 * xmlRegexpPrint: 3720 * @output: the file for the output debug 3721 * @regexp: the compiled regexp 3722 * 3723 * Print the content of the compiled regular expression 3724 */ 3725void 3726xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 3727 int i; 3728 3729 fprintf(output, " regexp: "); 3730 if (regexp == NULL) { 3731 fprintf(output, "NULL\n"); 3732 return; 3733 } 3734 fprintf(output, "'%s' ", regexp->string); 3735 fprintf(output, "\n"); 3736 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 3737 for (i = 0;i < regexp->nbAtoms; i++) { 3738 fprintf(output, " %02d ", i); 3739 xmlRegPrintAtom(output, regexp->atoms[i]); 3740 } 3741 fprintf(output, "%d states:", regexp->nbStates); 3742 fprintf(output, "\n"); 3743 for (i = 0;i < regexp->nbStates; i++) { 3744 xmlRegPrintState(output, regexp->states[i]); 3745 } 3746 fprintf(output, "%d counters:\n", regexp->nbCounters); 3747 for (i = 0;i < regexp->nbCounters; i++) { 3748 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 3749 regexp->counters[i].max); 3750 } 3751} 3752 3753/** 3754 * xmlRegexpCompile: 3755 * @regexp: a regular expression string 3756 * 3757 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 3758 * Appendix F and build an automata suitable for testing strings against 3759 * that regular expression 3760 * 3761 * Returns the compiled expression or NULL in case of error 3762 */ 3763xmlRegexpPtr 3764xmlRegexpCompile(const xmlChar *regexp) { 3765 xmlRegexpPtr ret; 3766 xmlRegParserCtxtPtr ctxt; 3767 3768 ctxt = xmlRegNewParserCtxt(regexp); 3769 if (ctxt == NULL) 3770 return(NULL); 3771 3772 /* initialize the parser */ 3773 ctxt->end = NULL; 3774 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 3775 xmlRegStatePush(ctxt, ctxt->start); 3776 3777 /* parse the expression building an automata */ 3778 xmlFAParseRegExp(ctxt, 1); 3779 if (CUR != 0) { 3780 ERROR("xmlFAParseRegExp: extra characters"); 3781 } 3782 ctxt->end = ctxt->state; 3783 ctxt->start->type = XML_REGEXP_START_STATE; 3784 ctxt->end->type = XML_REGEXP_FINAL_STATE; 3785 3786 /* remove the Epsilon except for counted transitions */ 3787 xmlFAEliminateEpsilonTransitions(ctxt); 3788 3789 3790 if (ctxt->error != 0) { 3791 xmlRegFreeParserCtxt(ctxt); 3792 return(NULL); 3793 } 3794 ret = xmlRegEpxFromParse(ctxt); 3795 xmlRegFreeParserCtxt(ctxt); 3796 return(ret); 3797} 3798 3799/** 3800 * xmlRegexpExec: 3801 * @comp: the compiled regular expression 3802 * @content: the value to check against the regular expression 3803 * 3804 * Check if the regular expression generate the value 3805 * 3806 * Returns 1 if it matches, 0 if not and a negativa value in case of error 3807 */ 3808int 3809xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 3810 if ((comp == NULL) || (content == NULL)) 3811 return(-1); 3812 return(xmlFARegExec(comp, content)); 3813} 3814 3815/** 3816 * xmlRegexpIsDeterminist: 3817 * @comp: the compiled regular expression 3818 * 3819 * Check if the regular expression is determinist 3820 * 3821 * Returns 1 if it yes, 0 if not and a negativa value in case of error 3822 */ 3823int 3824xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 3825 xmlAutomataPtr am; 3826 int ret; 3827 3828 if (comp == NULL) 3829 return(-1); 3830 if (comp->determinist != -1) 3831 return(comp->determinist); 3832 3833 am = xmlNewAutomata(); 3834 am->nbAtoms = comp->nbAtoms; 3835 am->atoms = comp->atoms; 3836 am->nbStates = comp->nbStates; 3837 am->states = comp->states; 3838 am->determinist = -1; 3839 ret = xmlFAComputesDeterminism(am); 3840 am->atoms = NULL; 3841 am->states = NULL; 3842 xmlFreeAutomata(am); 3843 return(ret); 3844} 3845 3846/** 3847 * xmlRegFreeRegexp: 3848 * @regexp: the regexp 3849 * 3850 * Free a regexp 3851 */ 3852void 3853xmlRegFreeRegexp(xmlRegexpPtr regexp) { 3854 int i; 3855 if (regexp == NULL) 3856 return; 3857 3858 if (regexp->string != NULL) 3859 xmlFree(regexp->string); 3860 if (regexp->states != NULL) { 3861 for (i = 0;i < regexp->nbStates;i++) 3862 xmlRegFreeState(regexp->states[i]); 3863 xmlFree(regexp->states); 3864 } 3865 if (regexp->atoms != NULL) { 3866 for (i = 0;i < regexp->nbAtoms;i++) 3867 xmlRegFreeAtom(regexp->atoms[i]); 3868 xmlFree(regexp->atoms); 3869 } 3870 if (regexp->counters != NULL) 3871 xmlFree(regexp->counters); 3872 if (regexp->compact != NULL) 3873 xmlFree(regexp->compact); 3874 if (regexp->stringMap != NULL) { 3875 for (i = 0; i < regexp->nbstrings;i++) 3876 xmlFree(regexp->stringMap[i]); 3877 xmlFree(regexp->stringMap); 3878 } 3879 3880 xmlFree(regexp); 3881} 3882 3883#ifdef LIBXML_AUTOMATA_ENABLED 3884/************************************************************************ 3885 * * 3886 * The Automata interface * 3887 * * 3888 ************************************************************************/ 3889 3890/** 3891 * xmlNewAutomata: 3892 * 3893 * Create a new automata 3894 * 3895 * Returns the new object or NULL in case of failure 3896 */ 3897xmlAutomataPtr 3898xmlNewAutomata(void) { 3899 xmlAutomataPtr ctxt; 3900 3901 ctxt = xmlRegNewParserCtxt(NULL); 3902 if (ctxt == NULL) 3903 return(NULL); 3904 3905 /* initialize the parser */ 3906 ctxt->end = NULL; 3907 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 3908 xmlRegStatePush(ctxt, ctxt->start); 3909 3910 return(ctxt); 3911} 3912 3913/** 3914 * xmlFreeAutomata: 3915 * @am: an automata 3916 * 3917 * Free an automata 3918 */ 3919void 3920xmlFreeAutomata(xmlAutomataPtr am) { 3921 if (am == NULL) 3922 return; 3923 xmlRegFreeParserCtxt(am); 3924} 3925 3926/** 3927 * xmlAutomataGetInitState: 3928 * @am: an automata 3929 * 3930 * Returns the initial state of the automata 3931 */ 3932xmlAutomataStatePtr 3933xmlAutomataGetInitState(xmlAutomataPtr am) { 3934 if (am == NULL) 3935 return(NULL); 3936 return(am->start); 3937} 3938 3939/** 3940 * xmlAutomataSetFinalState: 3941 * @am: an automata 3942 * @state: a state in this automata 3943 * 3944 * Makes that state a final state 3945 * 3946 * Returns 0 or -1 in case of error 3947 */ 3948int 3949xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 3950 if ((am == NULL) || (state == NULL)) 3951 return(-1); 3952 state->type = XML_REGEXP_FINAL_STATE; 3953 return(0); 3954} 3955 3956/** 3957 * xmlAutomataNewTransition: 3958 * @am: an automata 3959 * @from: the starting point of the transition 3960 * @to: the target point of the transition or NULL 3961 * @token: the input string associated to that transition 3962 * @data: data passed to the callback function if the transition is activated 3963 * 3964 * If @to is NULL, this create first a new target state in the automata 3965 * and then adds a transition from the @from state to the target state 3966 * activated by the value of @token 3967 * 3968 * Returns the target state or NULL in case of error 3969 */ 3970xmlAutomataStatePtr 3971xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 3972 xmlAutomataStatePtr to, const xmlChar *token, 3973 void *data) { 3974 xmlRegAtomPtr atom; 3975 3976 if ((am == NULL) || (from == NULL) || (token == NULL)) 3977 return(NULL); 3978 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 3979 atom->data = data; 3980 if (atom == NULL) 3981 return(NULL); 3982 atom->valuep = xmlStrdup(token); 3983 3984 xmlFAGenerateTransitions(am, from, to, atom); 3985 if (to == NULL) 3986 return(am->state); 3987 return(to); 3988} 3989 3990/** 3991 * xmlAutomataNewCountTrans: 3992 * @am: an automata 3993 * @from: the starting point of the transition 3994 * @to: the target point of the transition or NULL 3995 * @token: the input string associated to that transition 3996 * @min: the minimum successive occurences of token 3997 * @min: the maximum successive occurences of token 3998 * 3999 * If @to is NULL, this create first a new target state in the automata 4000 * and then adds a transition from the @from state to the target state 4001 * activated by a succession of input of value @token and whose number 4002 * is between @min and @max 4003 * 4004 * Returns the target state or NULL in case of error 4005 */ 4006xmlAutomataStatePtr 4007xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4008 xmlAutomataStatePtr to, const xmlChar *token, 4009 int min, int max, void *data) { 4010 xmlRegAtomPtr atom; 4011 4012 if ((am == NULL) || (from == NULL) || (token == NULL)) 4013 return(NULL); 4014 if (min < 0) 4015 return(NULL); 4016 if ((max < min) || (max < 1)) 4017 return(NULL); 4018 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4019 if (atom == NULL) 4020 return(NULL); 4021 atom->valuep = xmlStrdup(token); 4022 atom->data = data; 4023 if (min == 0) 4024 atom->min = 1; 4025 else 4026 atom->min = min; 4027 atom->max = max; 4028 4029 xmlFAGenerateTransitions(am, from, to, atom); 4030 if (to == NULL) 4031 to = am->state; 4032 if (to == NULL) 4033 return(NULL); 4034 if (min == 0) 4035 xmlFAGenerateEpsilonTransition(am, from, to); 4036 return(to); 4037} 4038 4039/** 4040 * xmlAutomataNewOnceTrans: 4041 * @am: an automata 4042 * @from: the starting point of the transition 4043 * @to: the target point of the transition or NULL 4044 * @token: the input string associated to that transition 4045 * @min: the minimum successive occurences of token 4046 * @min: the maximum successive occurences of token 4047 * 4048 * If @to is NULL, this create first a new target state in the automata 4049 * and then adds a transition from the @from state to the target state 4050 * activated by a succession of input of value @token and whose number 4051 * is between @min and @max, moreover that transistion can only be crossed 4052 * once. 4053 * 4054 * Returns the target state or NULL in case of error 4055 */ 4056xmlAutomataStatePtr 4057xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4058 xmlAutomataStatePtr to, const xmlChar *token, 4059 int min, int max, void *data) { 4060 xmlRegAtomPtr atom; 4061 int counter; 4062 4063 if ((am == NULL) || (from == NULL) || (token == NULL)) 4064 return(NULL); 4065 if (min < 1) 4066 return(NULL); 4067 if ((max < min) || (max < 1)) 4068 return(NULL); 4069 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 4070 if (atom == NULL) 4071 return(NULL); 4072 atom->valuep = xmlStrdup(token); 4073 atom->data = data; 4074 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 4075 if (min == 0) 4076 atom->min = 1; 4077 else 4078 atom->min = min; 4079 atom->max = max; 4080 /* 4081 * associate a counter to the transition. 4082 */ 4083 counter = xmlRegGetCounter(am); 4084 am->counters[counter].min = 1; 4085 am->counters[counter].max = 1; 4086 4087 /* xmlFAGenerateTransitions(am, from, to, atom); */ 4088 if (to == NULL) { 4089 to = xmlRegNewState(am); 4090 xmlRegStatePush(am, to); 4091 } 4092 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 4093 xmlRegAtomPush(am, atom); 4094 am->state = to; 4095 if (to == NULL) 4096 to = am->state; 4097 if (to == NULL) 4098 return(NULL); 4099 return(to); 4100} 4101 4102/** 4103 * xmlAutomataNewState: 4104 * @am: an automata 4105 * 4106 * Create a new disconnected state in the automata 4107 * 4108 * Returns the new state or NULL in case of error 4109 */ 4110xmlAutomataStatePtr 4111xmlAutomataNewState(xmlAutomataPtr am) { 4112 xmlAutomataStatePtr to; 4113 4114 if (am == NULL) 4115 return(NULL); 4116 to = xmlRegNewState(am); 4117 xmlRegStatePush(am, to); 4118 return(to); 4119} 4120 4121/** 4122 * xmlAutomataNewTransition: 4123 * @am: an automata 4124 * @from: the starting point of the transition 4125 * @to: the target point of the transition or NULL 4126 * 4127 * If @to is NULL, this create first a new target state in the automata 4128 * and then adds a an epsilon transition from the @from state to the 4129 * target state 4130 * 4131 * Returns the target state or NULL in case of error 4132 */ 4133xmlAutomataStatePtr 4134xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 4135 xmlAutomataStatePtr to) { 4136 if ((am == NULL) || (from == NULL)) 4137 return(NULL); 4138 xmlFAGenerateEpsilonTransition(am, from, to); 4139 if (to == NULL) 4140 return(am->state); 4141 return(to); 4142} 4143 4144/** 4145 * xmlAutomataNewAllTrans: 4146 * @am: an automata 4147 * @from: the starting point of the transition 4148 * @to: the target point of the transition or NULL 4149 * 4150 * If @to is NULL, this create first a new target state in the automata 4151 * and then adds a an ALL transition from the @from state to the 4152 * target state. That transition is an epsilon transition allowed only when 4153 * all transitions from the @from node have been activated. 4154 * 4155 * Returns the target state or NULL in case of error 4156 */ 4157xmlAutomataStatePtr 4158xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4159 xmlAutomataStatePtr to, int lax) { 4160 if ((am == NULL) || (from == NULL)) 4161 return(NULL); 4162 xmlFAGenerateAllTransition(am, from, to, lax); 4163 if (to == NULL) 4164 return(am->state); 4165 return(to); 4166} 4167 4168/** 4169 * xmlAutomataNewCounter: 4170 * @am: an automata 4171 * @min: the minimal value on the counter 4172 * @max: the maximal value on the counter 4173 * 4174 * Create a new counter 4175 * 4176 * Returns the counter number or -1 in case of error 4177 */ 4178int 4179xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 4180 int ret; 4181 4182 if (am == NULL) 4183 return(-1); 4184 4185 ret = xmlRegGetCounter(am); 4186 if (ret < 0) 4187 return(-1); 4188 am->counters[ret].min = min; 4189 am->counters[ret].max = max; 4190 return(ret); 4191} 4192 4193/** 4194 * xmlAutomataNewCountedTrans: 4195 * @am: an automata 4196 * @from: the starting point of the transition 4197 * @to: the target point of the transition or NULL 4198 * @counter: the counter associated to that transition 4199 * 4200 * If @to is NULL, this create first a new target state in the automata 4201 * and then adds an epsilon transition from the @from state to the target state 4202 * which will increment the counter provided 4203 * 4204 * Returns the target state or NULL in case of error 4205 */ 4206xmlAutomataStatePtr 4207xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4208 xmlAutomataStatePtr to, int counter) { 4209 if ((am == NULL) || (from == NULL) || (counter < 0)) 4210 return(NULL); 4211 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 4212 if (to == NULL) 4213 return(am->state); 4214 return(to); 4215} 4216 4217/** 4218 * xmlAutomataNewCounterTrans: 4219 * @am: an automata 4220 * @from: the starting point of the transition 4221 * @to: the target point of the transition or NULL 4222 * @counter: the counter associated to that transition 4223 * 4224 * If @to is NULL, this create first a new target state in the automata 4225 * and then adds an epsilon transition from the @from state to the target state 4226 * which will be allowed only if the counter is within the right range. 4227 * 4228 * Returns the target state or NULL in case of error 4229 */ 4230xmlAutomataStatePtr 4231xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 4232 xmlAutomataStatePtr to, int counter) { 4233 if ((am == NULL) || (from == NULL) || (counter < 0)) 4234 return(NULL); 4235 xmlFAGenerateCountedTransition(am, from, to, counter); 4236 if (to == NULL) 4237 return(am->state); 4238 return(to); 4239} 4240 4241/** 4242 * xmlAutomataCompile: 4243 * @am: an automata 4244 * 4245 * Compile the automata into a Reg Exp ready for being executed. 4246 * The automata should be free after this point. 4247 * 4248 * Returns the compiled regexp or NULL in case of error 4249 */ 4250xmlRegexpPtr 4251xmlAutomataCompile(xmlAutomataPtr am) { 4252 xmlRegexpPtr ret; 4253 4254 xmlFAEliminateEpsilonTransitions(am); 4255 /* xmlFAComputesDeterminism(am); */ 4256 ret = xmlRegEpxFromParse(am); 4257 4258 return(ret); 4259} 4260 4261/** 4262 * xmlAutomataIsDeterminist: 4263 * @am: an automata 4264 * 4265 * Checks if an automata is determinist. 4266 * 4267 * Returns 1 if true, 0 if not, and -1 in case of error 4268 */ 4269int 4270xmlAutomataIsDeterminist(xmlAutomataPtr am) { 4271 int ret; 4272 4273 if (am == NULL) 4274 return(-1); 4275 4276 ret = xmlFAComputesDeterminism(am); 4277 return(ret); 4278} 4279#endif /* LIBXML_AUTOMATA_ENABLED */ 4280#endif /* LIBXML_REGEXP_ENABLED */ 4281