xmlregexp.c revision 7646b18d64b6c739d04ca453493070e88c4aab13
1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications thise includes: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22#include <stdio.h> 23#include <string.h> 24#include <libxml/tree.h> 25#include <libxml/parserInternals.h> 26#include <libxml/xmlregexp.h> 27#include <libxml/xmlautomata.h> 28#include <libxml/xmlunicode.h> 29 30/* #define DEBUG_REGEXP_GRAPH */ 31/* #define DEBUG_REGEXP_EXEC */ 32/* #define DEBUG_PUSH */ 33 34#define ERROR(str) ctxt->error = 1; \ 35 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur) 36#define NEXT ctxt->cur++ 37#define CUR (*(ctxt->cur)) 38#define NXT(index) (ctxt->cur[index]) 39 40#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 41#define NEXTL(l) ctxt->cur += l; 42 43 44/************************************************************************ 45 * * 46 * Datatypes and structures * 47 * * 48 ************************************************************************/ 49 50typedef enum { 51 XML_REGEXP_EPSILON = 1, 52 XML_REGEXP_CHARVAL, 53 XML_REGEXP_RANGES, 54 XML_REGEXP_SUBREG, 55 XML_REGEXP_STRING, 56 XML_REGEXP_ANYCHAR, /* . */ 57 XML_REGEXP_ANYSPACE, /* \s */ 58 XML_REGEXP_NOTSPACE, /* \S */ 59 XML_REGEXP_INITNAME, /* \l */ 60 XML_REGEXP_NOTINITNAME, /* \l */ 61 XML_REGEXP_NAMECHAR, /* \c */ 62 XML_REGEXP_NOTNAMECHAR, /* \C */ 63 XML_REGEXP_DECIMAL, /* \d */ 64 XML_REGEXP_NOTDECIMAL, /* \d */ 65 XML_REGEXP_REALCHAR, /* \w */ 66 XML_REGEXP_NOTREALCHAR, /* \w */ 67 XML_REGEXP_LETTER, 68 XML_REGEXP_LETTER_UPPERCASE, 69 XML_REGEXP_LETTER_LOWERCASE, 70 XML_REGEXP_LETTER_TITLECASE, 71 XML_REGEXP_LETTER_MODIFIER, 72 XML_REGEXP_LETTER_OTHERS, 73 XML_REGEXP_MARK, 74 XML_REGEXP_MARK_NONSPACING, 75 XML_REGEXP_MARK_SPACECOMBINING, 76 XML_REGEXP_MARK_ENCLOSING, 77 XML_REGEXP_NUMBER, 78 XML_REGEXP_NUMBER_DECIMAL, 79 XML_REGEXP_NUMBER_LETTER, 80 XML_REGEXP_NUMBER_OTHERS, 81 XML_REGEXP_PUNCT, 82 XML_REGEXP_PUNCT_CONNECTOR, 83 XML_REGEXP_PUNCT_DASH, 84 XML_REGEXP_PUNCT_OPEN, 85 XML_REGEXP_PUNCT_CLOSE, 86 XML_REGEXP_PUNCT_INITQUOTE, 87 XML_REGEXP_PUNCT_FINQUOTE, 88 XML_REGEXP_PUNCT_OTHERS, 89 XML_REGEXP_SEPAR, 90 XML_REGEXP_SEPAR_SPACE, 91 XML_REGEXP_SEPAR_LINE, 92 XML_REGEXP_SEPAR_PARA, 93 XML_REGEXP_SYMBOL, 94 XML_REGEXP_SYMBOL_MATH, 95 XML_REGEXP_SYMBOL_CURRENCY, 96 XML_REGEXP_SYMBOL_MODIFIER, 97 XML_REGEXP_SYMBOL_OTHERS, 98 XML_REGEXP_OTHER, 99 XML_REGEXP_OTHER_CONTROL, 100 XML_REGEXP_OTHER_FORMAT, 101 XML_REGEXP_OTHER_PRIVATE, 102 XML_REGEXP_OTHER_NA, 103 XML_REGEXP_BLOCK_NAME 104} xmlRegAtomType; 105 106typedef enum { 107 XML_REGEXP_QUANT_EPSILON = 1, 108 XML_REGEXP_QUANT_ONCE, 109 XML_REGEXP_QUANT_OPT, 110 XML_REGEXP_QUANT_MULT, 111 XML_REGEXP_QUANT_PLUS, 112 XML_REGEXP_QUANT_ONCEONLY, 113 XML_REGEXP_QUANT_ALL, 114 XML_REGEXP_QUANT_RANGE 115} xmlRegQuantType; 116 117typedef enum { 118 XML_REGEXP_START_STATE = 1, 119 XML_REGEXP_FINAL_STATE, 120 XML_REGEXP_TRANS_STATE 121} xmlRegStateType; 122 123typedef enum { 124 XML_REGEXP_MARK_NORMAL = 0, 125 XML_REGEXP_MARK_START, 126 XML_REGEXP_MARK_VISITED 127} xmlRegMarkedType; 128 129typedef struct _xmlRegRange xmlRegRange; 130typedef xmlRegRange *xmlRegRangePtr; 131 132struct _xmlRegRange { 133 int neg; 134 xmlRegAtomType type; 135 int start; 136 int end; 137 xmlChar *blockName; 138}; 139 140typedef struct _xmlRegAtom xmlRegAtom; 141typedef xmlRegAtom *xmlRegAtomPtr; 142 143typedef struct _xmlAutomataState xmlRegState; 144typedef xmlRegState *xmlRegStatePtr; 145 146struct _xmlRegAtom { 147 int no; 148 xmlRegAtomType type; 149 xmlRegQuantType quant; 150 int min; 151 int max; 152 153 void *valuep; 154 int neg; 155 int codepoint; 156 xmlRegStatePtr start; 157 xmlRegStatePtr stop; 158 int maxRanges; 159 int nbRanges; 160 xmlRegRangePtr *ranges; 161 void *data; 162}; 163 164typedef struct _xmlRegCounter xmlRegCounter; 165typedef xmlRegCounter *xmlRegCounterPtr; 166 167struct _xmlRegCounter { 168 int min; 169 int max; 170}; 171 172typedef struct _xmlRegTrans xmlRegTrans; 173typedef xmlRegTrans *xmlRegTransPtr; 174 175struct _xmlRegTrans { 176 xmlRegAtomPtr atom; 177 int to; 178 int counter; 179 int count; 180}; 181 182struct _xmlAutomataState { 183 xmlRegStateType type; 184 xmlRegMarkedType mark; 185 int no; 186 187 int maxTrans; 188 int nbTrans; 189 xmlRegTrans *trans; 190}; 191 192typedef struct _xmlAutomata xmlRegParserCtxt; 193typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 194 195struct _xmlAutomata { 196 xmlChar *string; 197 xmlChar *cur; 198 199 int error; 200 int neg; 201 202 xmlRegStatePtr start; 203 xmlRegStatePtr end; 204 xmlRegStatePtr state; 205 206 xmlRegAtomPtr atom; 207 208 int maxAtoms; 209 int nbAtoms; 210 xmlRegAtomPtr *atoms; 211 212 int maxStates; 213 int nbStates; 214 xmlRegStatePtr *states; 215 216 int maxCounters; 217 int nbCounters; 218 xmlRegCounter *counters; 219}; 220 221struct _xmlRegexp { 222 xmlChar *string; 223 int nbStates; 224 xmlRegStatePtr *states; 225 int nbAtoms; 226 xmlRegAtomPtr *atoms; 227 int nbCounters; 228 xmlRegCounter *counters; 229}; 230 231typedef struct _xmlRegExecRollback xmlRegExecRollback; 232typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 233 234struct _xmlRegExecRollback { 235 xmlRegStatePtr state;/* the current state */ 236 int index; /* the index in the input stack */ 237 int nextbranch; /* the next transition to explore in that state */ 238 int *counts; /* save the automate state if it has some */ 239}; 240 241typedef struct _xmlRegInputToken xmlRegInputToken; 242typedef xmlRegInputToken *xmlRegInputTokenPtr; 243 244struct _xmlRegInputToken { 245 xmlChar *value; 246 void *data; 247}; 248 249struct _xmlRegExecCtxt { 250 int status; /* execution status != 0 indicate an error */ 251 int determinist; /* did we found an inderterministic behaviour */ 252 xmlRegexpPtr comp; /* the compiled regexp */ 253 xmlRegExecCallbacks callback; 254 void *data; 255 256 xmlRegStatePtr state;/* the current state */ 257 int transno; /* the current transition on that state */ 258 int transcount; /* the number of char in char counted transitions */ 259 260 /* 261 * A stack of rollback states 262 */ 263 int maxRollbacks; 264 int nbRollbacks; 265 xmlRegExecRollback *rollbacks; 266 267 /* 268 * The state of the automata if any 269 */ 270 int *counts; 271 272 /* 273 * The input stack 274 */ 275 int inputStackMax; 276 int inputStackNr; 277 int index; 278 int *charStack; 279 const xmlChar *inputString; /* when operating on characters */ 280 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 281 282}; 283 284#define REGEXP_ALL_COUNTER 0x123456 285 286static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 287 288/************************************************************************ 289 * * 290 * Allocation/Deallocation * 291 * * 292 ************************************************************************/ 293 294/** 295 * xmlRegEpxFromParse: 296 * @ctxt: the parser context used to build it 297 * 298 * Allocate a new regexp and fill it with the reult from the parser 299 * 300 * Returns the new regexp or NULL in case of error 301 */ 302static xmlRegexpPtr 303xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 304 xmlRegexpPtr ret; 305 306 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 307 if (ret == NULL) 308 return(NULL); 309 memset(ret, 0, sizeof(xmlRegexp)); 310 ret->string = ctxt->string; 311 ctxt->string = NULL; 312 ret->nbStates = ctxt->nbStates; 313 ctxt->nbStates = 0; 314 ret->states = ctxt->states; 315 ctxt->states = NULL; 316 ret->nbAtoms = ctxt->nbAtoms; 317 ctxt->nbAtoms = 0; 318 ret->atoms = ctxt->atoms; 319 ctxt->atoms = NULL; 320 ret->nbCounters = ctxt->nbCounters; 321 ctxt->nbCounters = 0; 322 ret->counters = ctxt->counters; 323 ctxt->counters = NULL; 324 return(ret); 325} 326 327/** 328 * xmlRegNewParserCtxt: 329 * @string: the string to parse 330 * 331 * Allocate a new regexp parser context 332 * 333 * Returns the new context or NULL in case of error 334 */ 335static xmlRegParserCtxtPtr 336xmlRegNewParserCtxt(const xmlChar *string) { 337 xmlRegParserCtxtPtr ret; 338 339 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 340 if (ret == NULL) 341 return(NULL); 342 memset(ret, 0, sizeof(xmlRegParserCtxt)); 343 if (string != NULL) 344 ret->string = xmlStrdup(string); 345 ret->cur = ret->string; 346 ret->neg = 0; 347 ret->error = 0; 348 return(ret); 349} 350 351/** 352 * xmlRegNewRange: 353 * @ctxt: the regexp parser context 354 * @neg: is that negative 355 * @type: the type of range 356 * @start: the start codepoint 357 * @end: the end codepoint 358 * 359 * Allocate a new regexp range 360 * 361 * Returns the new range or NULL in case of error 362 */ 363static xmlRegRangePtr 364xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 365 int neg, xmlRegAtomType type, int start, int end) { 366 xmlRegRangePtr ret; 367 368 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 369 if (ret == NULL) { 370 ERROR("failed to allocate regexp range"); 371 return(NULL); 372 } 373 ret->neg = neg; 374 ret->type = type; 375 ret->start = start; 376 ret->end = end; 377 return(ret); 378} 379 380/** 381 * xmlRegFreeRange: 382 * @range: the regexp range 383 * 384 * Free a regexp range 385 */ 386static void 387xmlRegFreeRange(xmlRegRangePtr range) { 388 if (range == NULL) 389 return; 390 391 if (range->blockName != NULL) 392 xmlFree(range->blockName); 393 xmlFree(range); 394} 395 396/** 397 * xmlRegNewAtom: 398 * @ctxt: the regexp parser context 399 * @type: the type of atom 400 * 401 * Allocate a new regexp range 402 * 403 * Returns the new atom or NULL in case of error 404 */ 405static xmlRegAtomPtr 406xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 407 xmlRegAtomPtr ret; 408 409 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 410 if (ret == NULL) { 411 ERROR("failed to allocate regexp atom"); 412 return(NULL); 413 } 414 memset(ret, 0, sizeof(xmlRegAtom)); 415 ret->type = type; 416 ret->quant = XML_REGEXP_QUANT_ONCE; 417 ret->min = 0; 418 ret->max = 0; 419 return(ret); 420} 421 422/** 423 * xmlRegFreeAtom: 424 * @atom: the regexp atom 425 * 426 * Free a regexp atom 427 */ 428static void 429xmlRegFreeAtom(xmlRegAtomPtr atom) { 430 int i; 431 432 if (atom == NULL) 433 return; 434 435 for (i = 0;i < atom->nbRanges;i++) 436 xmlRegFreeRange(atom->ranges[i]); 437 if (atom->ranges != NULL) 438 xmlFree(atom->ranges); 439 if (atom->type == XML_REGEXP_STRING) 440 xmlFree(atom->valuep); 441 xmlFree(atom); 442} 443 444static xmlRegStatePtr 445xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 446 xmlRegStatePtr ret; 447 448 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 449 if (ret == NULL) { 450 ERROR("failed to allocate regexp state"); 451 return(NULL); 452 } 453 memset(ret, 0, sizeof(xmlRegState)); 454 ret->type = XML_REGEXP_TRANS_STATE; 455 ret->mark = XML_REGEXP_MARK_NORMAL; 456 return(ret); 457} 458 459/** 460 * xmlRegFreeState: 461 * @state: the regexp state 462 * 463 * Free a regexp state 464 */ 465static void 466xmlRegFreeState(xmlRegStatePtr state) { 467 if (state == NULL) 468 return; 469 470 if (state->trans != NULL) 471 xmlFree(state->trans); 472 xmlFree(state); 473} 474 475/** 476 * xmlRegFreeParserCtxt: 477 * @ctxt: the regexp parser context 478 * 479 * Free a regexp parser context 480 */ 481static void 482xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 483 int i; 484 if (ctxt == NULL) 485 return; 486 487 if (ctxt->string != NULL) 488 xmlFree(ctxt->string); 489 if (ctxt->states != NULL) { 490 for (i = 0;i < ctxt->nbStates;i++) 491 xmlRegFreeState(ctxt->states[i]); 492 xmlFree(ctxt->states); 493 } 494 if (ctxt->atoms != NULL) { 495 for (i = 0;i < ctxt->nbAtoms;i++) 496 xmlRegFreeAtom(ctxt->atoms[i]); 497 xmlFree(ctxt->atoms); 498 } 499 if (ctxt->counters != NULL) 500 xmlFree(ctxt->counters); 501 xmlFree(ctxt); 502} 503 504/************************************************************************ 505 * * 506 * Display of Data structures * 507 * * 508 ************************************************************************/ 509 510static void 511xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 512 switch (type) { 513 case XML_REGEXP_EPSILON: 514 fprintf(output, "epsilon "); break; 515 case XML_REGEXP_CHARVAL: 516 fprintf(output, "charval "); break; 517 case XML_REGEXP_RANGES: 518 fprintf(output, "ranges "); break; 519 case XML_REGEXP_SUBREG: 520 fprintf(output, "subexpr "); break; 521 case XML_REGEXP_STRING: 522 fprintf(output, "string "); break; 523 case XML_REGEXP_ANYCHAR: 524 fprintf(output, "anychar "); break; 525 case XML_REGEXP_ANYSPACE: 526 fprintf(output, "anyspace "); break; 527 case XML_REGEXP_NOTSPACE: 528 fprintf(output, "notspace "); break; 529 case XML_REGEXP_INITNAME: 530 fprintf(output, "initname "); break; 531 case XML_REGEXP_NOTINITNAME: 532 fprintf(output, "notinitname "); break; 533 case XML_REGEXP_NAMECHAR: 534 fprintf(output, "namechar "); break; 535 case XML_REGEXP_NOTNAMECHAR: 536 fprintf(output, "notnamechar "); break; 537 case XML_REGEXP_DECIMAL: 538 fprintf(output, "decimal "); break; 539 case XML_REGEXP_NOTDECIMAL: 540 fprintf(output, "notdecimal "); break; 541 case XML_REGEXP_REALCHAR: 542 fprintf(output, "realchar "); break; 543 case XML_REGEXP_NOTREALCHAR: 544 fprintf(output, "notrealchar "); break; 545 case XML_REGEXP_LETTER: 546 fprintf(output, "LETTER "); break; 547 case XML_REGEXP_LETTER_UPPERCASE: 548 fprintf(output, "LETTER_UPPERCASE "); break; 549 case XML_REGEXP_LETTER_LOWERCASE: 550 fprintf(output, "LETTER_LOWERCASE "); break; 551 case XML_REGEXP_LETTER_TITLECASE: 552 fprintf(output, "LETTER_TITLECASE "); break; 553 case XML_REGEXP_LETTER_MODIFIER: 554 fprintf(output, "LETTER_MODIFIER "); break; 555 case XML_REGEXP_LETTER_OTHERS: 556 fprintf(output, "LETTER_OTHERS "); break; 557 case XML_REGEXP_MARK: 558 fprintf(output, "MARK "); break; 559 case XML_REGEXP_MARK_NONSPACING: 560 fprintf(output, "MARK_NONSPACING "); break; 561 case XML_REGEXP_MARK_SPACECOMBINING: 562 fprintf(output, "MARK_SPACECOMBINING "); break; 563 case XML_REGEXP_MARK_ENCLOSING: 564 fprintf(output, "MARK_ENCLOSING "); break; 565 case XML_REGEXP_NUMBER: 566 fprintf(output, "NUMBER "); break; 567 case XML_REGEXP_NUMBER_DECIMAL: 568 fprintf(output, "NUMBER_DECIMAL "); break; 569 case XML_REGEXP_NUMBER_LETTER: 570 fprintf(output, "NUMBER_LETTER "); break; 571 case XML_REGEXP_NUMBER_OTHERS: 572 fprintf(output, "NUMBER_OTHERS "); break; 573 case XML_REGEXP_PUNCT: 574 fprintf(output, "PUNCT "); break; 575 case XML_REGEXP_PUNCT_CONNECTOR: 576 fprintf(output, "PUNCT_CONNECTOR "); break; 577 case XML_REGEXP_PUNCT_DASH: 578 fprintf(output, "PUNCT_DASH "); break; 579 case XML_REGEXP_PUNCT_OPEN: 580 fprintf(output, "PUNCT_OPEN "); break; 581 case XML_REGEXP_PUNCT_CLOSE: 582 fprintf(output, "PUNCT_CLOSE "); break; 583 case XML_REGEXP_PUNCT_INITQUOTE: 584 fprintf(output, "PUNCT_INITQUOTE "); break; 585 case XML_REGEXP_PUNCT_FINQUOTE: 586 fprintf(output, "PUNCT_FINQUOTE "); break; 587 case XML_REGEXP_PUNCT_OTHERS: 588 fprintf(output, "PUNCT_OTHERS "); break; 589 case XML_REGEXP_SEPAR: 590 fprintf(output, "SEPAR "); break; 591 case XML_REGEXP_SEPAR_SPACE: 592 fprintf(output, "SEPAR_SPACE "); break; 593 case XML_REGEXP_SEPAR_LINE: 594 fprintf(output, "SEPAR_LINE "); break; 595 case XML_REGEXP_SEPAR_PARA: 596 fprintf(output, "SEPAR_PARA "); break; 597 case XML_REGEXP_SYMBOL: 598 fprintf(output, "SYMBOL "); break; 599 case XML_REGEXP_SYMBOL_MATH: 600 fprintf(output, "SYMBOL_MATH "); break; 601 case XML_REGEXP_SYMBOL_CURRENCY: 602 fprintf(output, "SYMBOL_CURRENCY "); break; 603 case XML_REGEXP_SYMBOL_MODIFIER: 604 fprintf(output, "SYMBOL_MODIFIER "); break; 605 case XML_REGEXP_SYMBOL_OTHERS: 606 fprintf(output, "SYMBOL_OTHERS "); break; 607 case XML_REGEXP_OTHER: 608 fprintf(output, "OTHER "); break; 609 case XML_REGEXP_OTHER_CONTROL: 610 fprintf(output, "OTHER_CONTROL "); break; 611 case XML_REGEXP_OTHER_FORMAT: 612 fprintf(output, "OTHER_FORMAT "); break; 613 case XML_REGEXP_OTHER_PRIVATE: 614 fprintf(output, "OTHER_PRIVATE "); break; 615 case XML_REGEXP_OTHER_NA: 616 fprintf(output, "OTHER_NA "); break; 617 case XML_REGEXP_BLOCK_NAME: 618 fprintf(output, "BLOCK "); break; 619 } 620} 621 622static void 623xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 624 switch (type) { 625 case XML_REGEXP_QUANT_EPSILON: 626 fprintf(output, "epsilon "); break; 627 case XML_REGEXP_QUANT_ONCE: 628 fprintf(output, "once "); break; 629 case XML_REGEXP_QUANT_OPT: 630 fprintf(output, "? "); break; 631 case XML_REGEXP_QUANT_MULT: 632 fprintf(output, "* "); break; 633 case XML_REGEXP_QUANT_PLUS: 634 fprintf(output, "+ "); break; 635 case XML_REGEXP_QUANT_RANGE: 636 fprintf(output, "range "); break; 637 case XML_REGEXP_QUANT_ONCEONLY: 638 fprintf(output, "onceonly "); break; 639 case XML_REGEXP_QUANT_ALL: 640 fprintf(output, "all "); break; 641 } 642} 643static void 644xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 645 fprintf(output, " range: "); 646 if (range->neg) 647 fprintf(output, "negative "); 648 xmlRegPrintAtomType(output, range->type); 649 fprintf(output, "%c - %c\n", range->start, range->end); 650} 651 652static void 653xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 654 fprintf(output, " atom: "); 655 if (atom == NULL) { 656 fprintf(output, "NULL\n"); 657 return; 658 } 659 xmlRegPrintAtomType(output, atom->type); 660 xmlRegPrintQuantType(output, atom->quant); 661 if (atom->quant == XML_REGEXP_QUANT_RANGE) 662 fprintf(output, "%d-%d ", atom->min, atom->max); 663 if (atom->type == XML_REGEXP_STRING) 664 fprintf(output, "'%s' ", (char *) atom->valuep); 665 if (atom->type == XML_REGEXP_CHARVAL) 666 fprintf(output, "char %c\n", atom->codepoint); 667 else if (atom->type == XML_REGEXP_RANGES) { 668 int i; 669 fprintf(output, "%d entries\n", atom->nbRanges); 670 for (i = 0; i < atom->nbRanges;i++) 671 xmlRegPrintRange(output, atom->ranges[i]); 672 } else if (atom->type == XML_REGEXP_SUBREG) { 673 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 674 } else { 675 fprintf(output, "\n"); 676 } 677} 678 679static void 680xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 681 fprintf(output, " trans: "); 682 if (trans == NULL) { 683 fprintf(output, "NULL\n"); 684 return; 685 } 686 if (trans->to < 0) { 687 fprintf(output, "removed\n"); 688 return; 689 } 690 if (trans->counter >= 0) { 691 fprintf(output, "counted %d, ", trans->counter); 692 } 693 if (trans->count >= 0) { 694 fprintf(output, "count based %d, ", trans->count); 695 } 696 if (trans->atom == NULL) { 697 fprintf(output, "epsilon to %d\n", trans->to); 698 return; 699 } 700 if (trans->atom->type == XML_REGEXP_CHARVAL) 701 fprintf(output, "char %c ", trans->atom->codepoint); 702 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 703} 704 705static void 706xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 707 int i; 708 709 fprintf(output, " state: "); 710 if (state == NULL) { 711 fprintf(output, "NULL\n"); 712 return; 713 } 714 if (state->type == XML_REGEXP_START_STATE) 715 fprintf(output, "START "); 716 if (state->type == XML_REGEXP_FINAL_STATE) 717 fprintf(output, "FINAL "); 718 719 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 720 for (i = 0;i < state->nbTrans; i++) { 721 xmlRegPrintTrans(output, &(state->trans[i])); 722 } 723} 724 725#if 0 726static void 727xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 728 int i; 729 730 fprintf(output, " ctxt: "); 731 if (ctxt == NULL) { 732 fprintf(output, "NULL\n"); 733 return; 734 } 735 fprintf(output, "'%s' ", ctxt->string); 736 if (ctxt->error) 737 fprintf(output, "error "); 738 if (ctxt->neg) 739 fprintf(output, "neg "); 740 fprintf(output, "\n"); 741 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 742 for (i = 0;i < ctxt->nbAtoms; i++) { 743 fprintf(output, " %02d ", i); 744 xmlRegPrintAtom(output, ctxt->atoms[i]); 745 } 746 if (ctxt->atom != NULL) { 747 fprintf(output, "current atom:\n"); 748 xmlRegPrintAtom(output, ctxt->atom); 749 } 750 fprintf(output, "%d states:", ctxt->nbStates); 751 if (ctxt->start != NULL) 752 fprintf(output, " start: %d", ctxt->start->no); 753 if (ctxt->end != NULL) 754 fprintf(output, " end: %d", ctxt->end->no); 755 fprintf(output, "\n"); 756 for (i = 0;i < ctxt->nbStates; i++) { 757 xmlRegPrintState(output, ctxt->states[i]); 758 } 759 fprintf(output, "%d counters:\n", ctxt->nbCounters); 760 for (i = 0;i < ctxt->nbCounters; i++) { 761 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 762 ctxt->counters[i].max); 763 } 764} 765#endif 766 767/************************************************************************ 768 * * 769 * Finite Automata structures manipulations * 770 * * 771 ************************************************************************/ 772 773static void 774xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 775 int neg, xmlRegAtomType type, int start, int end, 776 xmlChar *blockName) { 777 xmlRegRangePtr range; 778 779 if (atom == NULL) { 780 ERROR("add range: atom is NULL"); 781 return; 782 } 783 if (atom->type != XML_REGEXP_RANGES) { 784 ERROR("add range: atom is not ranges"); 785 return; 786 } 787 if (atom->maxRanges == 0) { 788 atom->maxRanges = 4; 789 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 790 sizeof(xmlRegRangePtr)); 791 if (atom->ranges == NULL) { 792 ERROR("add range: allocation failed"); 793 atom->maxRanges = 0; 794 return; 795 } 796 } else if (atom->nbRanges >= atom->maxRanges) { 797 xmlRegRangePtr *tmp; 798 atom->maxRanges *= 2; 799 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 800 sizeof(xmlRegRangePtr)); 801 if (tmp == NULL) { 802 ERROR("add range: allocation failed"); 803 atom->maxRanges /= 2; 804 return; 805 } 806 atom->ranges = tmp; 807 } 808 range = xmlRegNewRange(ctxt, neg, type, start, end); 809 if (range == NULL) 810 return; 811 range->blockName = blockName; 812 atom->ranges[atom->nbRanges++] = range; 813 814} 815 816static int 817xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 818 if (ctxt->maxCounters == 0) { 819 ctxt->maxCounters = 4; 820 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 821 sizeof(xmlRegCounter)); 822 if (ctxt->counters == NULL) { 823 ERROR("reg counter: allocation failed"); 824 ctxt->maxCounters = 0; 825 return(-1); 826 } 827 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 828 xmlRegCounter *tmp; 829 ctxt->maxCounters *= 2; 830 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 831 sizeof(xmlRegCounter)); 832 if (tmp == NULL) { 833 ERROR("reg counter: allocation failed"); 834 ctxt->maxCounters /= 2; 835 return(-1); 836 } 837 ctxt->counters = tmp; 838 } 839 ctxt->counters[ctxt->nbCounters].min = -1; 840 ctxt->counters[ctxt->nbCounters].max = -1; 841 return(ctxt->nbCounters++); 842} 843 844static void 845xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 846 if (atom == NULL) { 847 ERROR("atom push: atom is NULL"); 848 return; 849 } 850 if (ctxt->maxAtoms == 0) { 851 ctxt->maxAtoms = 4; 852 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 853 sizeof(xmlRegAtomPtr)); 854 if (ctxt->atoms == NULL) { 855 ERROR("atom push: allocation failed"); 856 ctxt->maxAtoms = 0; 857 return; 858 } 859 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 860 xmlRegAtomPtr *tmp; 861 ctxt->maxAtoms *= 2; 862 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 863 sizeof(xmlRegAtomPtr)); 864 if (tmp == NULL) { 865 ERROR("atom push: allocation failed"); 866 ctxt->maxAtoms /= 2; 867 return; 868 } 869 ctxt->atoms = tmp; 870 } 871 atom->no = ctxt->nbAtoms; 872 ctxt->atoms[ctxt->nbAtoms++] = atom; 873} 874 875static void 876xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 877 xmlRegAtomPtr atom, xmlRegStatePtr target, 878 int counter, int count) { 879 if (state == NULL) { 880 ERROR("add state: state is NULL"); 881 return; 882 } 883 if (target == NULL) { 884 ERROR("add state: target is NULL"); 885 return; 886 } 887 if (state->maxTrans == 0) { 888 state->maxTrans = 4; 889 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 890 sizeof(xmlRegTrans)); 891 if (state->trans == NULL) { 892 ERROR("add range: allocation failed"); 893 state->maxTrans = 0; 894 return; 895 } 896 } else if (state->nbTrans >= state->maxTrans) { 897 xmlRegTrans *tmp; 898 state->maxTrans *= 2; 899 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 900 sizeof(xmlRegTrans)); 901 if (tmp == NULL) { 902 ERROR("add range: allocation failed"); 903 state->maxTrans /= 2; 904 return; 905 } 906 state->trans = tmp; 907 } 908#ifdef DEBUG_REGEXP_GRAPH 909 printf("Add trans from %d to %d ", state->no, target->no); 910 if (count >= 0) 911 printf("count based %d", count); 912 else if (counter >= 0) 913 printf("counted %d", counter); 914 else if (atom == NULL) 915 printf("epsilon transition"); 916 printf("\n"); 917#endif 918 919 state->trans[state->nbTrans].atom = atom; 920 state->trans[state->nbTrans].to = target->no; 921 state->trans[state->nbTrans].counter = counter; 922 state->trans[state->nbTrans].count = count; 923 state->nbTrans++; 924} 925 926static void 927xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 928 if (ctxt->maxStates == 0) { 929 ctxt->maxStates = 4; 930 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 931 sizeof(xmlRegStatePtr)); 932 if (ctxt->states == NULL) { 933 ERROR("add range: allocation failed"); 934 ctxt->maxStates = 0; 935 return; 936 } 937 } else if (ctxt->nbStates >= ctxt->maxStates) { 938 xmlRegStatePtr *tmp; 939 ctxt->maxStates *= 2; 940 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 941 sizeof(xmlRegStatePtr)); 942 if (tmp == NULL) { 943 ERROR("add range: allocation failed"); 944 ctxt->maxStates /= 2; 945 return; 946 } 947 ctxt->states = tmp; 948 } 949 state->no = ctxt->nbStates; 950 ctxt->states[ctxt->nbStates++] = state; 951} 952 953/** 954 * xmlFAGenerateAllTransition: 955 * ctxt: a regexp parser context 956 * from: the from state 957 * to: the target state or NULL for building a new one 958 * 959 */ 960static void 961xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 962 xmlRegStatePtr from, xmlRegStatePtr to) { 963 if (to == NULL) { 964 to = xmlRegNewState(ctxt); 965 xmlRegStatePush(ctxt, to); 966 ctxt->state = to; 967 } 968 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 969} 970 971/** 972 * xmlFAGenerateEpsilonTransition: 973 * ctxt: a regexp parser context 974 * from: the from state 975 * to: the target state or NULL for building a new one 976 * 977 */ 978static void 979xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 980 xmlRegStatePtr from, xmlRegStatePtr to) { 981 if (to == NULL) { 982 to = xmlRegNewState(ctxt); 983 xmlRegStatePush(ctxt, to); 984 ctxt->state = to; 985 } 986 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 987} 988 989/** 990 * xmlFAGenerateCountedEpsilonTransition: 991 * ctxt: a regexp parser context 992 * from: the from state 993 * to: the target state or NULL for building a new one 994 * counter: the counter for that transition 995 * 996 */ 997static void 998xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 999 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1000 if (to == NULL) { 1001 to = xmlRegNewState(ctxt); 1002 xmlRegStatePush(ctxt, to); 1003 ctxt->state = to; 1004 } 1005 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1006} 1007 1008/** 1009 * xmlFAGenerateCountedTransition: 1010 * ctxt: a regexp parser context 1011 * from: the from state 1012 * to: the target state or NULL for building a new one 1013 * counter: the counter for that transition 1014 * 1015 */ 1016static void 1017xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1018 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1019 if (to == NULL) { 1020 to = xmlRegNewState(ctxt); 1021 xmlRegStatePush(ctxt, to); 1022 ctxt->state = to; 1023 } 1024 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1025} 1026 1027/** 1028 * xmlFAGenerateTransitions: 1029 * ctxt: a regexp parser context 1030 * from: the from state 1031 * to: the target state or NULL for building a new one 1032 * atom: the atom generating the transition 1033 * 1034 */ 1035static void 1036xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1037 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1038 if (atom == NULL) { 1039 ERROR("genrate transition: atom == NULL"); 1040 return; 1041 } 1042 if (atom->type == XML_REGEXP_SUBREG) { 1043 /* 1044 * this is a subexpression handling one should not need to 1045 * create a new node excep for XML_REGEXP_QUANT_RANGE. 1046 */ 1047 xmlRegAtomPush(ctxt, atom); 1048 if ((to != NULL) && (atom->stop != to) && 1049 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1050 /* 1051 * Generate an epsilon transition to link to the target 1052 */ 1053 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1054 } 1055 switch (atom->quant) { 1056 case XML_REGEXP_QUANT_OPT: 1057 atom->quant = XML_REGEXP_QUANT_ONCE; 1058 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1059 break; 1060 case XML_REGEXP_QUANT_MULT: 1061 atom->quant = XML_REGEXP_QUANT_ONCE; 1062 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1063 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1064 break; 1065 case XML_REGEXP_QUANT_PLUS: 1066 atom->quant = XML_REGEXP_QUANT_ONCE; 1067 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1068 break; 1069 case XML_REGEXP_QUANT_RANGE: { 1070 int counter; 1071 xmlRegStatePtr newstate; 1072 1073 /* 1074 * This one is nasty: 1075 * 1/ register a new counter 1076 * 2/ register an epsilon transition associated to 1077 * this counter going from atom->stop to atom->start 1078 * 3/ create a new state 1079 * 4/ generate a counted transition from atom->stop to 1080 * that state 1081 */ 1082 counter = xmlRegGetCounter(ctxt); 1083 ctxt->counters[counter].min = atom->min - 1; 1084 ctxt->counters[counter].max = atom->max - 1; 1085 atom->min = 0; 1086 atom->max = 0; 1087 atom->quant = XML_REGEXP_QUANT_ONCE; 1088 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1089 atom->start, counter); 1090 if (to != NULL) { 1091 newstate = to; 1092 } else { 1093 newstate = xmlRegNewState(ctxt); 1094 xmlRegStatePush(ctxt, newstate); 1095 ctxt->state = newstate; 1096 } 1097 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1098 newstate, counter); 1099 } 1100 default: 1101 break; 1102 } 1103 return; 1104 } else { 1105 if (to == NULL) { 1106 to = xmlRegNewState(ctxt); 1107 xmlRegStatePush(ctxt, to); 1108 } 1109 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1110 xmlRegAtomPush(ctxt, atom); 1111 ctxt->state = to; 1112 } 1113 switch (atom->quant) { 1114 case XML_REGEXP_QUANT_OPT: 1115 atom->quant = XML_REGEXP_QUANT_ONCE; 1116 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1117 break; 1118 case XML_REGEXP_QUANT_MULT: 1119 atom->quant = XML_REGEXP_QUANT_ONCE; 1120 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1121 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1122 break; 1123 case XML_REGEXP_QUANT_PLUS: 1124 atom->quant = XML_REGEXP_QUANT_ONCE; 1125 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1126 break; 1127 default: 1128 break; 1129 } 1130} 1131 1132/** 1133 * xmlFAReduceEpsilonTransitions: 1134 * ctxt: a regexp parser context 1135 * @fromnr: the from state 1136 * @tonr: the to state 1137 * @cpunter: should that transition be associted to a counted 1138 * 1139 */ 1140static void 1141xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1142 int tonr, int counter) { 1143 int transnr; 1144 xmlRegStatePtr from; 1145 xmlRegStatePtr to; 1146 1147#ifdef DEBUG_REGEXP_GRAPH 1148 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1149#endif 1150 from = ctxt->states[fromnr]; 1151 if (from == NULL) 1152 return; 1153 to = ctxt->states[tonr]; 1154 if (to == NULL) 1155 return; 1156 if ((to->mark == XML_REGEXP_MARK_START) || 1157 (to->mark == XML_REGEXP_MARK_VISITED)) 1158 return; 1159 1160 to->mark = XML_REGEXP_MARK_VISITED; 1161 if (to->type == XML_REGEXP_FINAL_STATE) { 1162#ifdef DEBUG_REGEXP_GRAPH 1163 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1164#endif 1165 from->type = XML_REGEXP_FINAL_STATE; 1166 } 1167 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1168 if (to->trans[transnr].atom == NULL) { 1169 /* 1170 * Don't remove counted transitions 1171 * Don't loop either 1172 */ 1173 if (to->trans[transnr].to != fromnr) { 1174 if (to->trans[transnr].count >= 0) { 1175 int newto = to->trans[transnr].to; 1176 1177 xmlRegStateAddTrans(ctxt, from, NULL, 1178 ctxt->states[newto], 1179 -1, to->trans[transnr].count); 1180 } else { 1181#ifdef DEBUG_REGEXP_GRAPH 1182 printf("Found epsilon trans %d from %d to %d\n", 1183 transnr, tonr, to->trans[transnr].to); 1184#endif 1185 if (to->trans[transnr].counter >= 0) { 1186 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1187 to->trans[transnr].to, 1188 to->trans[transnr].counter); 1189 } else { 1190 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1191 to->trans[transnr].to, 1192 counter); 1193 } 1194 } 1195 } 1196 } else { 1197 int newto = to->trans[transnr].to; 1198 1199 if (to->trans[transnr].counter >= 0) { 1200 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1201 ctxt->states[newto], 1202 to->trans[transnr].counter, -1); 1203 } else { 1204 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1205 ctxt->states[newto], counter, -1); 1206 } 1207 1208 } 1209 } 1210 to->mark = XML_REGEXP_MARK_NORMAL; 1211} 1212 1213/** 1214 * xmlFAEliminateEpsilonTransitions: 1215 * ctxt: a regexp parser context 1216 * 1217 */ 1218static void 1219xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1220 int statenr, transnr; 1221 xmlRegStatePtr state; 1222 1223 /* 1224 * build the completed transitions bypassing the epsilons 1225 * Use a marking algorithm to avoid loops 1226 */ 1227 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1228 state = ctxt->states[statenr]; 1229 if (state == NULL) 1230 continue; 1231 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1232 if ((state->trans[transnr].atom == NULL) && 1233 (state->trans[transnr].to >= 0)) { 1234 if (state->trans[transnr].to == statenr) { 1235 state->trans[transnr].to = -1; 1236#ifdef DEBUG_REGEXP_GRAPH 1237 printf("Removed loopback epsilon trans %d on %d\n", 1238 transnr, statenr); 1239#endif 1240 } else if (state->trans[transnr].count < 0) { 1241 int newto = state->trans[transnr].to; 1242 1243#ifdef DEBUG_REGEXP_GRAPH 1244 printf("Found epsilon trans %d from %d to %d\n", 1245 transnr, statenr, newto); 1246#endif 1247 state->mark = XML_REGEXP_MARK_START; 1248 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1249 newto, state->trans[transnr].counter); 1250 state->mark = XML_REGEXP_MARK_NORMAL; 1251#ifdef DEBUG_REGEXP_GRAPH 1252 } else { 1253 printf("Found counted transition %d on %d\n", 1254 transnr, statenr); 1255#endif 1256 } 1257 } 1258 } 1259 } 1260 /* 1261 * Eliminate the epsilon transitions 1262 */ 1263 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1264 state = ctxt->states[statenr]; 1265 if (state == NULL) 1266 continue; 1267 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1268 if ((state->trans[transnr].atom == NULL) && 1269 (state->trans[transnr].count < 0) && 1270 (state->trans[transnr].to >= 0)) { 1271 state->trans[transnr].to = -1; 1272 } 1273 } 1274 } 1275} 1276 1277/************************************************************************ 1278 * * 1279 * Routines to check input against transition atoms * 1280 * * 1281 ************************************************************************/ 1282 1283static int 1284xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 1285 int start, int end, const xmlChar *blockName) { 1286 int ret = 0; 1287 1288 switch (type) { 1289 case XML_REGEXP_STRING: 1290 case XML_REGEXP_SUBREG: 1291 case XML_REGEXP_RANGES: 1292 case XML_REGEXP_EPSILON: 1293 return(-1); 1294 case XML_REGEXP_ANYCHAR: 1295 ret = ((codepoint != '\n') && (codepoint != '\r')); 1296 break; 1297 case XML_REGEXP_CHARVAL: 1298 ret = ((codepoint >= start) && (codepoint <= end)); 1299 break; 1300 case XML_REGEXP_NOTSPACE: 1301 neg = !neg; 1302 case XML_REGEXP_ANYSPACE: 1303 ret = ((codepoint == '\n') || (codepoint == '\r') || 1304 (codepoint == '\t') || (codepoint == ' ')); 1305 break; 1306 case XML_REGEXP_NOTINITNAME: 1307 neg = !neg; 1308 case XML_REGEXP_INITNAME: 1309 ret = (xmlIsLetter(codepoint) || 1310 (codepoint == '_') || (codepoint == ':')); 1311 break; 1312 case XML_REGEXP_NOTNAMECHAR: 1313 neg = !neg; 1314 case XML_REGEXP_NAMECHAR: 1315 ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) || 1316 (codepoint == '.') || (codepoint == '-') || 1317 (codepoint == '_') || (codepoint == ':') || 1318 xmlIsCombining(codepoint) || xmlIsExtender(codepoint)); 1319 break; 1320 case XML_REGEXP_NOTDECIMAL: 1321 neg = !neg; 1322 case XML_REGEXP_DECIMAL: 1323 ret = xmlUCSIsCatNd(codepoint); 1324 break; 1325 case XML_REGEXP_REALCHAR: 1326 neg = !neg; 1327 case XML_REGEXP_NOTREALCHAR: 1328 ret = xmlUCSIsCatP(codepoint); 1329 if (ret == 0) 1330 ret = xmlUCSIsCatZ(codepoint); 1331 if (ret == 0) 1332 ret = xmlUCSIsCatC(codepoint); 1333 break; 1334 case XML_REGEXP_LETTER: 1335 ret = xmlUCSIsCatL(codepoint); 1336 break; 1337 case XML_REGEXP_LETTER_UPPERCASE: 1338 ret = xmlUCSIsCatLu(codepoint); 1339 break; 1340 case XML_REGEXP_LETTER_LOWERCASE: 1341 ret = xmlUCSIsCatLl(codepoint); 1342 break; 1343 case XML_REGEXP_LETTER_TITLECASE: 1344 ret = xmlUCSIsCatLt(codepoint); 1345 break; 1346 case XML_REGEXP_LETTER_MODIFIER: 1347 ret = xmlUCSIsCatLm(codepoint); 1348 break; 1349 case XML_REGEXP_LETTER_OTHERS: 1350 ret = xmlUCSIsCatLo(codepoint); 1351 break; 1352 case XML_REGEXP_MARK: 1353 ret = xmlUCSIsCatM(codepoint); 1354 break; 1355 case XML_REGEXP_MARK_NONSPACING: 1356 ret = xmlUCSIsCatMn(codepoint); 1357 break; 1358 case XML_REGEXP_MARK_SPACECOMBINING: 1359 ret = xmlUCSIsCatMc(codepoint); 1360 break; 1361 case XML_REGEXP_MARK_ENCLOSING: 1362 ret = xmlUCSIsCatMe(codepoint); 1363 break; 1364 case XML_REGEXP_NUMBER: 1365 ret = xmlUCSIsCatN(codepoint); 1366 break; 1367 case XML_REGEXP_NUMBER_DECIMAL: 1368 ret = xmlUCSIsCatNd(codepoint); 1369 break; 1370 case XML_REGEXP_NUMBER_LETTER: 1371 ret = xmlUCSIsCatNl(codepoint); 1372 break; 1373 case XML_REGEXP_NUMBER_OTHERS: 1374 ret = xmlUCSIsCatNo(codepoint); 1375 break; 1376 case XML_REGEXP_PUNCT: 1377 ret = xmlUCSIsCatP(codepoint); 1378 break; 1379 case XML_REGEXP_PUNCT_CONNECTOR: 1380 ret = xmlUCSIsCatPc(codepoint); 1381 break; 1382 case XML_REGEXP_PUNCT_DASH: 1383 ret = xmlUCSIsCatPd(codepoint); 1384 break; 1385 case XML_REGEXP_PUNCT_OPEN: 1386 ret = xmlUCSIsCatPs(codepoint); 1387 break; 1388 case XML_REGEXP_PUNCT_CLOSE: 1389 ret = xmlUCSIsCatPe(codepoint); 1390 break; 1391 case XML_REGEXP_PUNCT_INITQUOTE: 1392 ret = xmlUCSIsCatPi(codepoint); 1393 break; 1394 case XML_REGEXP_PUNCT_FINQUOTE: 1395 ret = xmlUCSIsCatPf(codepoint); 1396 break; 1397 case XML_REGEXP_PUNCT_OTHERS: 1398 ret = xmlUCSIsCatPo(codepoint); 1399 break; 1400 case XML_REGEXP_SEPAR: 1401 ret = xmlUCSIsCatZ(codepoint); 1402 break; 1403 case XML_REGEXP_SEPAR_SPACE: 1404 ret = xmlUCSIsCatZs(codepoint); 1405 break; 1406 case XML_REGEXP_SEPAR_LINE: 1407 ret = xmlUCSIsCatZl(codepoint); 1408 break; 1409 case XML_REGEXP_SEPAR_PARA: 1410 ret = xmlUCSIsCatZp(codepoint); 1411 break; 1412 case XML_REGEXP_SYMBOL: 1413 ret = xmlUCSIsCatS(codepoint); 1414 break; 1415 case XML_REGEXP_SYMBOL_MATH: 1416 ret = xmlUCSIsCatSm(codepoint); 1417 break; 1418 case XML_REGEXP_SYMBOL_CURRENCY: 1419 ret = xmlUCSIsCatSc(codepoint); 1420 break; 1421 case XML_REGEXP_SYMBOL_MODIFIER: 1422 ret = xmlUCSIsCatSk(codepoint); 1423 break; 1424 case XML_REGEXP_SYMBOL_OTHERS: 1425 ret = xmlUCSIsCatSo(codepoint); 1426 break; 1427 case XML_REGEXP_OTHER: 1428 ret = xmlUCSIsCatC(codepoint); 1429 break; 1430 case XML_REGEXP_OTHER_CONTROL: 1431 ret = xmlUCSIsCatCc(codepoint); 1432 break; 1433 case XML_REGEXP_OTHER_FORMAT: 1434 ret = xmlUCSIsCatCf(codepoint); 1435 break; 1436 case XML_REGEXP_OTHER_PRIVATE: 1437 ret = xmlUCSIsCatCo(codepoint); 1438 break; 1439 case XML_REGEXP_OTHER_NA: 1440 /* ret = xmlUCSIsCatCn(codepoint); */ 1441 /* Seems it doesn't exist anymore in recent Unicode releases */ 1442 ret = 0; 1443 break; 1444 case XML_REGEXP_BLOCK_NAME: 1445 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 1446 break; 1447 } 1448 if (neg) 1449 return(!ret); 1450 return(ret); 1451} 1452 1453static int 1454xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 1455 int i, ret = 0; 1456 xmlRegRangePtr range; 1457 1458 if ((atom == NULL) || (!xmlIsChar(codepoint))) 1459 return(-1); 1460 1461 switch (atom->type) { 1462 case XML_REGEXP_SUBREG: 1463 case XML_REGEXP_EPSILON: 1464 return(-1); 1465 case XML_REGEXP_CHARVAL: 1466 return(codepoint == atom->codepoint); 1467 case XML_REGEXP_RANGES: { 1468 int accept = 0; 1469 for (i = 0;i < atom->nbRanges;i++) { 1470 range = atom->ranges[i]; 1471 if (range->neg) { 1472 ret = xmlRegCheckCharacterRange(range->type, codepoint, 1473 0, range->start, range->end, 1474 range->blockName); 1475 if (ret != 0) 1476 return(0); /* excluded char */ 1477 } else { 1478 ret = xmlRegCheckCharacterRange(range->type, codepoint, 1479 0, range->start, range->end, 1480 range->blockName); 1481 if (ret != 0) 1482 accept = 1; /* might still be excluded */ 1483 } 1484 } 1485 return(accept); 1486 } 1487 case XML_REGEXP_STRING: 1488 printf("TODO: XML_REGEXP_STRING\n"); 1489 return(-1); 1490 case XML_REGEXP_ANYCHAR: 1491 case XML_REGEXP_ANYSPACE: 1492 case XML_REGEXP_NOTSPACE: 1493 case XML_REGEXP_INITNAME: 1494 case XML_REGEXP_NOTINITNAME: 1495 case XML_REGEXP_NAMECHAR: 1496 case XML_REGEXP_NOTNAMECHAR: 1497 case XML_REGEXP_DECIMAL: 1498 case XML_REGEXP_NOTDECIMAL: 1499 case XML_REGEXP_REALCHAR: 1500 case XML_REGEXP_NOTREALCHAR: 1501 case XML_REGEXP_LETTER: 1502 case XML_REGEXP_LETTER_UPPERCASE: 1503 case XML_REGEXP_LETTER_LOWERCASE: 1504 case XML_REGEXP_LETTER_TITLECASE: 1505 case XML_REGEXP_LETTER_MODIFIER: 1506 case XML_REGEXP_LETTER_OTHERS: 1507 case XML_REGEXP_MARK: 1508 case XML_REGEXP_MARK_NONSPACING: 1509 case XML_REGEXP_MARK_SPACECOMBINING: 1510 case XML_REGEXP_MARK_ENCLOSING: 1511 case XML_REGEXP_NUMBER: 1512 case XML_REGEXP_NUMBER_DECIMAL: 1513 case XML_REGEXP_NUMBER_LETTER: 1514 case XML_REGEXP_NUMBER_OTHERS: 1515 case XML_REGEXP_PUNCT: 1516 case XML_REGEXP_PUNCT_CONNECTOR: 1517 case XML_REGEXP_PUNCT_DASH: 1518 case XML_REGEXP_PUNCT_OPEN: 1519 case XML_REGEXP_PUNCT_CLOSE: 1520 case XML_REGEXP_PUNCT_INITQUOTE: 1521 case XML_REGEXP_PUNCT_FINQUOTE: 1522 case XML_REGEXP_PUNCT_OTHERS: 1523 case XML_REGEXP_SEPAR: 1524 case XML_REGEXP_SEPAR_SPACE: 1525 case XML_REGEXP_SEPAR_LINE: 1526 case XML_REGEXP_SEPAR_PARA: 1527 case XML_REGEXP_SYMBOL: 1528 case XML_REGEXP_SYMBOL_MATH: 1529 case XML_REGEXP_SYMBOL_CURRENCY: 1530 case XML_REGEXP_SYMBOL_MODIFIER: 1531 case XML_REGEXP_SYMBOL_OTHERS: 1532 case XML_REGEXP_OTHER: 1533 case XML_REGEXP_OTHER_CONTROL: 1534 case XML_REGEXP_OTHER_FORMAT: 1535 case XML_REGEXP_OTHER_PRIVATE: 1536 case XML_REGEXP_OTHER_NA: 1537 case XML_REGEXP_BLOCK_NAME: 1538 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 1539 (const xmlChar *)atom->valuep); 1540 if (atom->neg) 1541 ret = !ret; 1542 break; 1543 } 1544 return(ret); 1545} 1546 1547/************************************************************************ 1548 * * 1549 * Saving an restoring state of an execution context * 1550 * * 1551 ************************************************************************/ 1552 1553#ifdef DEBUG_REGEXP_EXEC 1554static void 1555xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 1556 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 1557 if (exec->inputStack != NULL) { 1558 int i; 1559 printf(": "); 1560 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 1561 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 1562 } else { 1563 printf(": %s", &(exec->inputString[exec->index])); 1564 } 1565 printf("\n"); 1566} 1567#endif 1568 1569static void 1570xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 1571#ifdef DEBUG_REGEXP_EXEC 1572 printf("saving "); 1573 exec->transno++; 1574 xmlFARegDebugExec(exec); 1575 exec->transno--; 1576#endif 1577 1578 if (exec->maxRollbacks == 0) { 1579 exec->maxRollbacks = 4; 1580 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 1581 sizeof(xmlRegExecRollback)); 1582 if (exec->rollbacks == NULL) { 1583 fprintf(stderr, "exec save: allocation failed"); 1584 exec->maxRollbacks = 0; 1585 return; 1586 } 1587 memset(exec->rollbacks, 0, 1588 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 1589 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 1590 xmlRegExecRollback *tmp; 1591 int len = exec->maxRollbacks; 1592 1593 exec->maxRollbacks *= 2; 1594 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 1595 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 1596 if (tmp == NULL) { 1597 fprintf(stderr, "exec save: allocation failed"); 1598 exec->maxRollbacks /= 2; 1599 return; 1600 } 1601 exec->rollbacks = tmp; 1602 tmp = &exec->rollbacks[len]; 1603 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 1604 } 1605 exec->rollbacks[exec->nbRollbacks].state = exec->state; 1606 exec->rollbacks[exec->nbRollbacks].index = exec->index; 1607 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 1608 if (exec->comp->nbCounters > 0) { 1609 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 1610 exec->rollbacks[exec->nbRollbacks].counts = (int *) 1611 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 1612 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 1613 fprintf(stderr, "exec save: allocation failed"); 1614 exec->status = -5; 1615 return; 1616 } 1617 } 1618 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 1619 exec->comp->nbCounters * sizeof(int)); 1620 } 1621 exec->nbRollbacks++; 1622} 1623 1624static void 1625xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 1626 if (exec->nbRollbacks <= 0) { 1627 exec->status = -1; 1628#ifdef DEBUG_REGEXP_EXEC 1629 printf("rollback failed on empty stack\n"); 1630#endif 1631 return; 1632 } 1633 exec->nbRollbacks--; 1634 exec->state = exec->rollbacks[exec->nbRollbacks].state; 1635 exec->index = exec->rollbacks[exec->nbRollbacks].index; 1636 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 1637 if (exec->comp->nbCounters > 0) { 1638 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 1639 fprintf(stderr, "exec save: allocation failed"); 1640 exec->status = -6; 1641 return; 1642 } 1643 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 1644 exec->comp->nbCounters * sizeof(int)); 1645 } 1646 1647#ifdef DEBUG_REGEXP_EXEC 1648 printf("restored "); 1649 xmlFARegDebugExec(exec); 1650#endif 1651} 1652 1653/************************************************************************ 1654 * * 1655 * Verifyer, running an input against a compiled regexp * 1656 * * 1657 ************************************************************************/ 1658 1659static int 1660xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 1661 xmlRegExecCtxt execval; 1662 xmlRegExecCtxtPtr exec = &execval; 1663 int ret, codepoint, len; 1664 1665 exec->inputString = content; 1666 exec->index = 0; 1667 exec->determinist = 1; 1668 exec->maxRollbacks = 0; 1669 exec->nbRollbacks = 0; 1670 exec->rollbacks = NULL; 1671 exec->status = 0; 1672 exec->comp = comp; 1673 exec->state = comp->states[0]; 1674 exec->transno = 0; 1675 exec->transcount = 0; 1676 if (comp->nbCounters > 0) { 1677 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 1678 if (exec->counts == NULL) 1679 return(-1); 1680 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 1681 } else 1682 exec->counts = NULL; 1683 while ((exec->status == 0) && 1684 ((exec->inputString[exec->index] != 0) || 1685 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 1686 xmlRegTransPtr trans; 1687 xmlRegAtomPtr atom; 1688 1689 /* 1690 * End of input on non-terminal state, rollback, however we may 1691 * still have epsilon like transition for counted transitions 1692 * on counters, in that case don't break too early. 1693 */ 1694 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 1695 goto rollback; 1696 1697 exec->transcount = 0; 1698 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 1699 trans = &exec->state->trans[exec->transno]; 1700 if (trans->to < 0) 1701 continue; 1702 atom = trans->atom; 1703 ret = 0; 1704 if (trans->count >= 0) { 1705 int count; 1706 xmlRegCounterPtr counter; 1707 1708 /* 1709 * A counted transition. 1710 */ 1711 1712 count = exec->counts[trans->count]; 1713 counter = &exec->comp->counters[trans->count]; 1714#ifdef DEBUG_REGEXP_EXEC 1715 printf("testing count %d: val %d, min %d, max %d\n", 1716 trans->count, count, counter->min, counter->max); 1717#endif 1718 ret = ((count >= counter->min) && (count <= counter->max)); 1719 } else if (atom == NULL) { 1720 fprintf(stderr, "epsilon transition left at runtime\n"); 1721 exec->status = -2; 1722 break; 1723 } else if (exec->inputString[exec->index] != 0) { 1724 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 1725 ret = xmlRegCheckCharacter(atom, codepoint); 1726 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 1727 xmlRegStatePtr to = comp->states[trans->to]; 1728 1729 /* 1730 * this is a multiple input sequence 1731 */ 1732 if (exec->state->nbTrans > exec->transno + 1) { 1733 xmlFARegExecSave(exec); 1734 } 1735 exec->transcount = 1; 1736 do { 1737 /* 1738 * Try to progress as much as possible on the input 1739 */ 1740 if (exec->transcount == atom->max) { 1741 break; 1742 } 1743 exec->index += len; 1744 /* 1745 * End of input: stop here 1746 */ 1747 if (exec->inputString[exec->index] == 0) { 1748 exec->index -= len; 1749 break; 1750 } 1751 if (exec->transcount >= atom->min) { 1752 int transno = exec->transno; 1753 xmlRegStatePtr state = exec->state; 1754 1755 /* 1756 * The transition is acceptable save it 1757 */ 1758 exec->transno = -1; /* trick */ 1759 exec->state = to; 1760 xmlFARegExecSave(exec); 1761 exec->transno = transno; 1762 exec->state = state; 1763 } 1764 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 1765 len); 1766 ret = xmlRegCheckCharacter(atom, codepoint); 1767 exec->transcount++; 1768 } while (ret == 1); 1769 if (exec->transcount < atom->min) 1770 ret = 0; 1771 1772 /* 1773 * If the last check failed but one transition was found 1774 * possible, rollback 1775 */ 1776 if (ret < 0) 1777 ret = 0; 1778 if (ret == 0) { 1779 goto rollback; 1780 } 1781 } 1782 } 1783 if (ret == 1) { 1784 if (exec->state->nbTrans > exec->transno + 1) { 1785 xmlFARegExecSave(exec); 1786 } 1787 if (trans->counter >= 0) { 1788#ifdef DEBUG_REGEXP_EXEC 1789 printf("Increasing count %d\n", trans->counter); 1790#endif 1791 exec->counts[trans->counter]++; 1792 } 1793#ifdef DEBUG_REGEXP_EXEC 1794 printf("entering state %d\n", trans->to); 1795#endif 1796 exec->state = comp->states[trans->to]; 1797 exec->transno = 0; 1798 if (trans->atom != NULL) { 1799 exec->index += len; 1800 } 1801 goto progress; 1802 } else if (ret < 0) { 1803 exec->status = -4; 1804 break; 1805 } 1806 } 1807 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 1808rollback: 1809 /* 1810 * Failed to find a way out 1811 */ 1812 exec->determinist = 0; 1813 xmlFARegExecRollBack(exec); 1814 } 1815progress: 1816 continue; 1817 } 1818 if (exec->rollbacks != NULL) { 1819 if (exec->counts != NULL) { 1820 int i; 1821 1822 for (i = 0;i < exec->maxRollbacks;i++) 1823 if (exec->rollbacks[i].counts != NULL) 1824 xmlFree(exec->rollbacks[i].counts); 1825 } 1826 xmlFree(exec->rollbacks); 1827 } 1828 if (exec->counts != NULL) 1829 xmlFree(exec->counts); 1830 if (exec->status == 0) 1831 return(1); 1832 if (exec->status == -1) 1833 return(0); 1834 return(exec->status); 1835} 1836 1837/************************************************************************ 1838 * * 1839 * Progressive interface to the verifyer one atom at a time * 1840 * * 1841 ************************************************************************/ 1842 1843/** 1844 * xmlRegExecCtxtPtr: 1845 * @comp: a precompiled regular expression 1846 * @callback: a callback function used for handling progresses in the 1847 * automata matching phase 1848 * @data: the context data associated to the callback in this context 1849 * 1850 * Build a context used for progressive evaluation of a regexp. 1851 */ 1852xmlRegExecCtxtPtr 1853xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 1854 xmlRegExecCtxtPtr exec; 1855 1856 if (comp == NULL) 1857 return(NULL); 1858 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 1859 if (exec == NULL) { 1860 return(NULL); 1861 } 1862 memset(exec, 0, sizeof(xmlRegExecCtxt)); 1863 exec->inputString = NULL; 1864 exec->index = 0; 1865 exec->determinist = 1; 1866 exec->maxRollbacks = 0; 1867 exec->nbRollbacks = 0; 1868 exec->rollbacks = NULL; 1869 exec->status = 0; 1870 exec->comp = comp; 1871 exec->state = comp->states[0]; 1872 exec->transno = 0; 1873 exec->transcount = 0; 1874 exec->callback = callback; 1875 exec->data = data; 1876 if (comp->nbCounters > 0) { 1877 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 1878 if (exec->counts == NULL) { 1879 xmlFree(exec); 1880 return(NULL); 1881 } 1882 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 1883 } else 1884 exec->counts = NULL; 1885 exec->inputStackMax = 0; 1886 exec->inputStackNr = 0; 1887 exec->inputStack = NULL; 1888 return(exec); 1889} 1890 1891/** 1892 * xmlRegFreeExecCtxt: 1893 * @exec: a regular expression evaulation context 1894 * 1895 * Free the structures associated to a regular expression evaulation context. 1896 */ 1897void 1898xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 1899 if (exec == NULL) 1900 return; 1901 1902 if (exec->rollbacks != NULL) { 1903 if (exec->counts != NULL) { 1904 int i; 1905 1906 for (i = 0;i < exec->maxRollbacks;i++) 1907 if (exec->rollbacks[i].counts != NULL) 1908 xmlFree(exec->rollbacks[i].counts); 1909 } 1910 xmlFree(exec->rollbacks); 1911 } 1912 if (exec->counts != NULL) 1913 xmlFree(exec->counts); 1914 if (exec->inputStack != NULL) { 1915 int i; 1916 1917 for (i = 0;i < exec->inputStackNr;i++) 1918 xmlFree(exec->inputStack[i].value); 1919 xmlFree(exec->inputStack); 1920 } 1921 xmlFree(exec); 1922} 1923 1924static void 1925xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 1926 void *data) { 1927#ifdef DEBUG_PUSH 1928 printf("saving value: %d:%s\n", exec->inputStackNr, value); 1929#endif 1930 if (exec->inputStackMax == 0) { 1931 exec->inputStackMax = 4; 1932 exec->inputStack = (xmlRegInputTokenPtr) 1933 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 1934 if (exec->inputStack == NULL) { 1935 fprintf(stderr, "push input: allocation failed"); 1936 exec->inputStackMax = 0; 1937 return; 1938 } 1939 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 1940 xmlRegInputTokenPtr tmp; 1941 1942 exec->inputStackMax *= 2; 1943 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 1944 exec->inputStackMax * sizeof(xmlRegInputToken)); 1945 if (tmp == NULL) { 1946 fprintf(stderr, "push input: allocation failed"); 1947 exec->inputStackMax /= 2; 1948 return; 1949 } 1950 exec->inputStack = tmp; 1951 } 1952 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 1953 exec->inputStack[exec->inputStackNr].data = data; 1954 exec->inputStackNr++; 1955 exec->inputStack[exec->inputStackNr].value = NULL; 1956 exec->inputStack[exec->inputStackNr].data = NULL; 1957} 1958 1959 1960/** 1961 * xmlRegExecPushString: 1962 * @exec: a regexp execution context 1963 * @value: a string token input 1964 * @data: data associated to the token to reuse in callbacks 1965 * 1966 * Push one input token in the execution context 1967 * 1968 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 1969 * a negative value in case of error. 1970 */ 1971int 1972xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 1973 void *data) { 1974 xmlRegTransPtr trans; 1975 xmlRegAtomPtr atom; 1976 int ret; 1977 int final = 0; 1978 1979 if (exec == NULL) 1980 return(-1); 1981 if (exec->status != 0) 1982 return(exec->status); 1983 1984 if (value == NULL) { 1985 if (exec->state->type == XML_REGEXP_FINAL_STATE) 1986 return(1); 1987 final = 1; 1988 } 1989 1990#ifdef DEBUG_PUSH 1991 printf("value pushed: %s\n", value); 1992#endif 1993 /* 1994 * If we have an active rollback stack push the new value there 1995 * and get back to where we were left 1996 */ 1997 if ((value != NULL) && (exec->inputStackNr > 0)) { 1998 xmlFARegExecSaveInputString(exec, value, data); 1999 value = exec->inputStack[exec->index].value; 2000 data = exec->inputStack[exec->index].data; 2001#ifdef DEBUG_PUSH 2002 printf("value loaded: %s\n", value); 2003#endif 2004 } 2005 2006 while ((exec->status == 0) && 2007 ((value != NULL) || 2008 ((final == 1) && 2009 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 2010 2011 /* 2012 * End of input on non-terminal state, rollback, however we may 2013 * still have epsilon like transition for counted transitions 2014 * on counters, in that case don't break too early. 2015 */ 2016 if ((value == NULL) && (exec->counts == NULL)) 2017 goto rollback; 2018 2019 exec->transcount = 0; 2020 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2021 trans = &exec->state->trans[exec->transno]; 2022 if (trans->to < 0) 2023 continue; 2024 atom = trans->atom; 2025 ret = 0; 2026 if (trans->count >= 0) { 2027 int count; 2028 xmlRegCounterPtr counter; 2029 2030 /* 2031 * A counted transition. 2032 */ 2033 2034 count = exec->counts[trans->count]; 2035 counter = &exec->comp->counters[trans->count]; 2036#ifdef DEBUG_PUSH 2037 printf("testing count %d: val %d, min %d, max %d\n", 2038 trans->count, count, counter->min, counter->max); 2039#endif 2040 ret = ((count >= counter->min) && (count <= counter->max)); 2041 } else if (atom == NULL) { 2042 fprintf(stderr, "epsilon transition left at runtime\n"); 2043 exec->status = -2; 2044 break; 2045 } else if (value != NULL) { 2046 ret = xmlStrEqual(value, atom->valuep); 2047 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2048 xmlRegStatePtr to = exec->comp->states[trans->to]; 2049 2050 /* 2051 * this is a multiple input sequence 2052 */ 2053 if (exec->state->nbTrans > exec->transno + 1) { 2054 if (exec->inputStackNr <= 0) { 2055 xmlFARegExecSaveInputString(exec, value, data); 2056 } 2057 xmlFARegExecSave(exec); 2058 } 2059 exec->transcount = 1; 2060 do { 2061 /* 2062 * Try to progress as much as possible on the input 2063 */ 2064 if (exec->transcount == atom->max) { 2065 break; 2066 } 2067 exec->index++; 2068 value = exec->inputStack[exec->index].value; 2069 data = exec->inputStack[exec->index].data; 2070#ifdef DEBUG_PUSH 2071 printf("value loaded: %s\n", value); 2072#endif 2073 2074 /* 2075 * End of input: stop here 2076 */ 2077 if (value == NULL) { 2078 exec->index --; 2079 break; 2080 } 2081 if (exec->transcount >= atom->min) { 2082 int transno = exec->transno; 2083 xmlRegStatePtr state = exec->state; 2084 2085 /* 2086 * The transition is acceptable save it 2087 */ 2088 exec->transno = -1; /* trick */ 2089 exec->state = to; 2090 if (exec->inputStackNr <= 0) { 2091 xmlFARegExecSaveInputString(exec, value, data); 2092 } 2093 xmlFARegExecSave(exec); 2094 exec->transno = transno; 2095 exec->state = state; 2096 } 2097 ret = xmlStrEqual(value, atom->valuep); 2098 exec->transcount++; 2099 } while (ret == 1); 2100 if (exec->transcount < atom->min) 2101 ret = 0; 2102 2103 /* 2104 * If the last check failed but one transition was found 2105 * possible, rollback 2106 */ 2107 if (ret < 0) 2108 ret = 0; 2109 if (ret == 0) { 2110 goto rollback; 2111 } 2112 } 2113 } 2114 if (ret == 1) { 2115 if ((exec->callback != NULL) && (atom != NULL)) { 2116 exec->callback(exec->data, atom->valuep, 2117 atom->data, data); 2118 } 2119 if (exec->state->nbTrans > exec->transno + 1) { 2120 if (exec->inputStackNr <= 0) { 2121 xmlFARegExecSaveInputString(exec, value, data); 2122 } 2123 xmlFARegExecSave(exec); 2124 } 2125 if (trans->counter >= 0) { 2126#ifdef DEBUG_PUSH 2127 printf("Increasing count %d\n", trans->counter); 2128#endif 2129 exec->counts[trans->counter]++; 2130 } 2131#ifdef DEBUG_PUSH 2132 printf("entering state %d\n", trans->to); 2133#endif 2134 exec->state = exec->comp->states[trans->to]; 2135 exec->transno = 0; 2136 if (trans->atom != NULL) { 2137 if (exec->inputStack != NULL) { 2138 exec->index++; 2139 if (exec->index < exec->inputStackNr) { 2140 value = exec->inputStack[exec->index].value; 2141 data = exec->inputStack[exec->index].data; 2142#ifdef DEBUG_PUSH 2143 printf("value loaded: %s\n", value); 2144#endif 2145 } else { 2146 value = NULL; 2147 data = NULL; 2148#ifdef DEBUG_PUSH 2149 printf("end of input\n"); 2150#endif 2151 } 2152 } else { 2153 value = NULL; 2154 data = NULL; 2155#ifdef DEBUG_PUSH 2156 printf("end of input\n"); 2157#endif 2158 } 2159 } 2160 goto progress; 2161 } else if (ret < 0) { 2162 exec->status = -4; 2163 break; 2164 } 2165 } 2166 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2167rollback: 2168 /* 2169 * Failed to find a way out 2170 */ 2171 exec->determinist = 0; 2172 xmlFARegExecRollBack(exec); 2173 if (exec->status == 0) { 2174 value = exec->inputStack[exec->index].value; 2175 data = exec->inputStack[exec->index].data; 2176#ifdef DEBUG_PUSH 2177 printf("value loaded: %s\n", value); 2178#endif 2179 } 2180 } 2181progress: 2182 continue; 2183 } 2184 if (exec->status == 0) { 2185 return(exec->state->type == XML_REGEXP_FINAL_STATE); 2186 } 2187 return(exec->status); 2188} 2189 2190#if 0 2191static int 2192xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 2193 xmlRegTransPtr trans; 2194 xmlRegAtomPtr atom; 2195 int ret; 2196 int codepoint, len; 2197 2198 if (exec == NULL) 2199 return(-1); 2200 if (exec->status != 0) 2201 return(exec->status); 2202 2203 while ((exec->status == 0) && 2204 ((exec->inputString[exec->index] != 0) || 2205 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 2206 2207 /* 2208 * End of input on non-terminal state, rollback, however we may 2209 * still have epsilon like transition for counted transitions 2210 * on counters, in that case don't break too early. 2211 */ 2212 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 2213 goto rollback; 2214 2215 exec->transcount = 0; 2216 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 2217 trans = &exec->state->trans[exec->transno]; 2218 if (trans->to < 0) 2219 continue; 2220 atom = trans->atom; 2221 ret = 0; 2222 if (trans->count >= 0) { 2223 int count; 2224 xmlRegCounterPtr counter; 2225 2226 /* 2227 * A counted transition. 2228 */ 2229 2230 count = exec->counts[trans->count]; 2231 counter = &exec->comp->counters[trans->count]; 2232#ifdef DEBUG_REGEXP_EXEC 2233 printf("testing count %d: val %d, min %d, max %d\n", 2234 trans->count, count, counter->min, counter->max); 2235#endif 2236 ret = ((count >= counter->min) && (count <= counter->max)); 2237 } else if (atom == NULL) { 2238 fprintf(stderr, "epsilon transition left at runtime\n"); 2239 exec->status = -2; 2240 break; 2241 } else if (exec->inputString[exec->index] != 0) { 2242 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 2243 ret = xmlRegCheckCharacter(atom, codepoint); 2244 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 2245 xmlRegStatePtr to = exec->comp->states[trans->to]; 2246 2247 /* 2248 * this is a multiple input sequence 2249 */ 2250 if (exec->state->nbTrans > exec->transno + 1) { 2251 xmlFARegExecSave(exec); 2252 } 2253 exec->transcount = 1; 2254 do { 2255 /* 2256 * Try to progress as much as possible on the input 2257 */ 2258 if (exec->transcount == atom->max) { 2259 break; 2260 } 2261 exec->index += len; 2262 /* 2263 * End of input: stop here 2264 */ 2265 if (exec->inputString[exec->index] == 0) { 2266 exec->index -= len; 2267 break; 2268 } 2269 if (exec->transcount >= atom->min) { 2270 int transno = exec->transno; 2271 xmlRegStatePtr state = exec->state; 2272 2273 /* 2274 * The transition is acceptable save it 2275 */ 2276 exec->transno = -1; /* trick */ 2277 exec->state = to; 2278 xmlFARegExecSave(exec); 2279 exec->transno = transno; 2280 exec->state = state; 2281 } 2282 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 2283 len); 2284 ret = xmlRegCheckCharacter(atom, codepoint); 2285 exec->transcount++; 2286 } while (ret == 1); 2287 if (exec->transcount < atom->min) 2288 ret = 0; 2289 2290 /* 2291 * If the last check failed but one transition was found 2292 * possible, rollback 2293 */ 2294 if (ret < 0) 2295 ret = 0; 2296 if (ret == 0) { 2297 goto rollback; 2298 } 2299 } 2300 } 2301 if (ret == 1) { 2302 if (exec->state->nbTrans > exec->transno + 1) { 2303 xmlFARegExecSave(exec); 2304 } 2305 if (trans->counter >= 0) { 2306#ifdef DEBUG_REGEXP_EXEC 2307 printf("Increasing count %d\n", trans->counter); 2308#endif 2309 exec->counts[trans->counter]++; 2310 } 2311#ifdef DEBUG_REGEXP_EXEC 2312 printf("entering state %d\n", trans->to); 2313#endif 2314 exec->state = exec->comp->states[trans->to]; 2315 exec->transno = 0; 2316 if (trans->atom != NULL) { 2317 exec->index += len; 2318 } 2319 goto progress; 2320 } else if (ret < 0) { 2321 exec->status = -4; 2322 break; 2323 } 2324 } 2325 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 2326rollback: 2327 /* 2328 * Failed to find a way out 2329 */ 2330 exec->determinist = 0; 2331 xmlFARegExecRollBack(exec); 2332 } 2333progress: 2334 continue; 2335 } 2336} 2337#endif 2338/************************************************************************ 2339 * * 2340 * Parser for the Shemas Datatype Regular Expressions * 2341 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 2342 * * 2343 ************************************************************************/ 2344 2345/** 2346 * xmlFAIsChar: 2347 * ctxt: a regexp parser context 2348 * 2349 * [10] Char ::= [^.\?*+()|#x5B#x5D] 2350 */ 2351static int 2352xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 2353 int cur; 2354 int len; 2355 2356 cur = CUR_SCHAR(ctxt->cur, len); 2357 if ((cur == '.') || (cur == '\\') || (cur == '?') || 2358 (cur == '*') || (cur == '+') || (cur == '(') || 2359 (cur == ')') || (cur == '|') || (cur == 0x5B) || 2360 (cur == 0x5D) || (cur == 0)) 2361 return(-1); 2362 return(cur); 2363} 2364 2365/** 2366 * xmlFAParseCharProp: 2367 * ctxt: a regexp parser context 2368 * 2369 * [27] charProp ::= IsCategory | IsBlock 2370 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 2371 * Separators | Symbols | Others 2372 * [29] Letters ::= 'L' [ultmo]? 2373 * [30] Marks ::= 'M' [nce]? 2374 * [31] Numbers ::= 'N' [dlo]? 2375 * [32] Punctuation ::= 'P' [cdseifo]? 2376 * [33] Separators ::= 'Z' [slp]? 2377 * [34] Symbols ::= 'S' [mcko]? 2378 * [35] Others ::= 'C' [cfon]? 2379 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 2380 */ 2381static void 2382xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 2383 int cur; 2384 xmlRegAtomType type = 0; 2385 xmlChar *blockName = NULL; 2386 2387 cur = CUR; 2388 if (cur == 'L') { 2389 NEXT; 2390 cur = CUR; 2391 if (cur == 'u') { 2392 NEXT; 2393 type = XML_REGEXP_LETTER_UPPERCASE; 2394 } else if (cur == 'l') { 2395 NEXT; 2396 type = XML_REGEXP_LETTER_LOWERCASE; 2397 } else if (cur == 't') { 2398 NEXT; 2399 type = XML_REGEXP_LETTER_TITLECASE; 2400 } else if (cur == 'm') { 2401 NEXT; 2402 type = XML_REGEXP_LETTER_MODIFIER; 2403 } else if (cur == 'o') { 2404 NEXT; 2405 type = XML_REGEXP_LETTER_OTHERS; 2406 } else { 2407 type = XML_REGEXP_LETTER; 2408 } 2409 } else if (cur == 'M') { 2410 NEXT; 2411 cur = CUR; 2412 if (cur == 'n') { 2413 NEXT; 2414 /* nonspacing */ 2415 type = XML_REGEXP_MARK_NONSPACING; 2416 } else if (cur == 'c') { 2417 NEXT; 2418 /* spacing combining */ 2419 type = XML_REGEXP_MARK_SPACECOMBINING; 2420 } else if (cur == 'e') { 2421 NEXT; 2422 /* enclosing */ 2423 type = XML_REGEXP_MARK_ENCLOSING; 2424 } else { 2425 /* all marks */ 2426 type = XML_REGEXP_MARK; 2427 } 2428 } else if (cur == 'N') { 2429 NEXT; 2430 cur = CUR; 2431 if (cur == 'd') { 2432 NEXT; 2433 /* digital */ 2434 type = XML_REGEXP_NUMBER_DECIMAL; 2435 } else if (cur == 'l') { 2436 NEXT; 2437 /* letter */ 2438 type = XML_REGEXP_NUMBER_LETTER; 2439 } else if (cur == 'o') { 2440 NEXT; 2441 /* other */ 2442 type = XML_REGEXP_NUMBER_OTHERS; 2443 } else { 2444 /* all numbers */ 2445 type = XML_REGEXP_NUMBER; 2446 } 2447 } else if (cur == 'P') { 2448 NEXT; 2449 cur = CUR; 2450 if (cur == 'c') { 2451 NEXT; 2452 /* connector */ 2453 type = XML_REGEXP_PUNCT_CONNECTOR; 2454 } else if (cur == 'd') { 2455 NEXT; 2456 /* dash */ 2457 type = XML_REGEXP_PUNCT_DASH; 2458 } else if (cur == 's') { 2459 NEXT; 2460 /* open */ 2461 type = XML_REGEXP_PUNCT_OPEN; 2462 } else if (cur == 'e') { 2463 NEXT; 2464 /* close */ 2465 type = XML_REGEXP_PUNCT_CLOSE; 2466 } else if (cur == 'i') { 2467 NEXT; 2468 /* initial quote */ 2469 type = XML_REGEXP_PUNCT_INITQUOTE; 2470 } else if (cur == 'f') { 2471 NEXT; 2472 /* final quote */ 2473 type = XML_REGEXP_PUNCT_FINQUOTE; 2474 } else if (cur == 'o') { 2475 NEXT; 2476 /* other */ 2477 type = XML_REGEXP_PUNCT_OTHERS; 2478 } else { 2479 /* all punctuation */ 2480 type = XML_REGEXP_PUNCT; 2481 } 2482 } else if (cur == 'Z') { 2483 NEXT; 2484 cur = CUR; 2485 if (cur == 's') { 2486 NEXT; 2487 /* space */ 2488 type = XML_REGEXP_SEPAR_SPACE; 2489 } else if (cur == 'l') { 2490 NEXT; 2491 /* line */ 2492 type = XML_REGEXP_SEPAR_LINE; 2493 } else if (cur == 'p') { 2494 NEXT; 2495 /* paragraph */ 2496 type = XML_REGEXP_SEPAR_PARA; 2497 } else { 2498 /* all separators */ 2499 type = XML_REGEXP_SEPAR; 2500 } 2501 } else if (cur == 'S') { 2502 NEXT; 2503 cur = CUR; 2504 if (cur == 'm') { 2505 NEXT; 2506 type = XML_REGEXP_SYMBOL_MATH; 2507 /* math */ 2508 } else if (cur == 'c') { 2509 NEXT; 2510 type = XML_REGEXP_SYMBOL_CURRENCY; 2511 /* currency */ 2512 } else if (cur == 'k') { 2513 NEXT; 2514 type = XML_REGEXP_SYMBOL_MODIFIER; 2515 /* modifiers */ 2516 } else if (cur == 'o') { 2517 NEXT; 2518 type = XML_REGEXP_SYMBOL_OTHERS; 2519 /* other */ 2520 } else { 2521 /* all symbols */ 2522 type = XML_REGEXP_SYMBOL; 2523 } 2524 } else if (cur == 'C') { 2525 NEXT; 2526 cur = CUR; 2527 if (cur == 'c') { 2528 NEXT; 2529 /* control */ 2530 type = XML_REGEXP_OTHER_CONTROL; 2531 } else if (cur == 'f') { 2532 NEXT; 2533 /* format */ 2534 type = XML_REGEXP_OTHER_FORMAT; 2535 } else if (cur == 'o') { 2536 NEXT; 2537 /* private use */ 2538 type = XML_REGEXP_OTHER_PRIVATE; 2539 } else if (cur == 'n') { 2540 NEXT; 2541 /* not assigned */ 2542 type = XML_REGEXP_OTHER_NA; 2543 } else { 2544 /* all others */ 2545 type = XML_REGEXP_OTHER; 2546 } 2547 } else if (cur == 'I') { 2548 const xmlChar *start; 2549 NEXT; 2550 cur = CUR; 2551 if (cur != 's') { 2552 ERROR("IsXXXX expected"); 2553 return; 2554 } 2555 NEXT; 2556 start = ctxt->cur; 2557 cur = CUR; 2558 if (((cur >= 'a') && (cur <= 'z')) || 2559 ((cur >= 'A') && (cur <= 'Z')) || 2560 ((cur >= '0') && (cur <= '9')) || 2561 (cur == 0x2D)) { 2562 NEXT; 2563 cur = CUR; 2564 while (((cur >= 'a') && (cur <= 'z')) || 2565 ((cur >= 'A') && (cur <= 'Z')) || 2566 ((cur >= '0') && (cur <= '9')) || 2567 (cur == 0x2D)) { 2568 NEXT; 2569 cur = CUR; 2570 } 2571 } 2572 type = XML_REGEXP_BLOCK_NAME; 2573 blockName = xmlStrndup(start, ctxt->cur - start); 2574 } else { 2575 ERROR("Unknown char property"); 2576 return; 2577 } 2578 if (ctxt->atom == NULL) { 2579 ctxt->atom = xmlRegNewAtom(ctxt, type); 2580 if (ctxt->atom != NULL) 2581 ctxt->atom->valuep = blockName; 2582 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 2583 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2584 type, 0, 0, blockName); 2585 } 2586} 2587 2588/** 2589 * xmlFAParseCharClassEsc: 2590 * ctxt: a regexp parser context 2591 * 2592 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 2593 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 2594 * [25] catEsc ::= '\p{' charProp '}' 2595 * [26] complEsc ::= '\P{' charProp '}' 2596 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 2597 */ 2598static void 2599xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 2600 int cur; 2601 2602 if (CUR == '.') { 2603 if (ctxt->atom == NULL) { 2604 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 2605 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 2606 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2607 XML_REGEXP_ANYCHAR, 0, 0, NULL); 2608 } 2609 NEXT; 2610 return; 2611 } 2612 if (CUR != '\\') { 2613 ERROR("Escaped sequence: expecting \\"); 2614 return; 2615 } 2616 NEXT; 2617 cur = CUR; 2618 if (cur == 'p') { 2619 NEXT; 2620 if (CUR != '{') { 2621 ERROR("Expecting '{'"); 2622 return; 2623 } 2624 NEXT; 2625 xmlFAParseCharProp(ctxt); 2626 if (CUR != '}') { 2627 ERROR("Expecting '}'"); 2628 return; 2629 } 2630 NEXT; 2631 } else if (cur == 'P') { 2632 NEXT; 2633 if (CUR != '{') { 2634 ERROR("Expecting '{'"); 2635 return; 2636 } 2637 NEXT; 2638 xmlFAParseCharProp(ctxt); 2639 ctxt->atom->neg = 1; 2640 if (CUR != '}') { 2641 ERROR("Expecting '}'"); 2642 return; 2643 } 2644 NEXT; 2645 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 2646 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 2647 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 2648 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 2649 (cur == 0x5E)) { 2650 if (ctxt->atom == NULL) { 2651 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 2652 if (ctxt->atom != NULL) 2653 ctxt->atom->codepoint = cur; 2654 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 2655 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2656 XML_REGEXP_CHARVAL, cur, cur, NULL); 2657 } 2658 NEXT; 2659 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 2660 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 2661 (cur == 'w') || (cur == 'W')) { 2662 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 2663 2664 switch (cur) { 2665 case 's': 2666 type = XML_REGEXP_ANYSPACE; 2667 break; 2668 case 'S': 2669 type = XML_REGEXP_NOTSPACE; 2670 break; 2671 case 'i': 2672 type = XML_REGEXP_INITNAME; 2673 break; 2674 case 'I': 2675 type = XML_REGEXP_NOTINITNAME; 2676 break; 2677 case 'c': 2678 type = XML_REGEXP_NAMECHAR; 2679 break; 2680 case 'C': 2681 type = XML_REGEXP_NOTNAMECHAR; 2682 break; 2683 case 'd': 2684 type = XML_REGEXP_DECIMAL; 2685 break; 2686 case 'D': 2687 type = XML_REGEXP_NOTDECIMAL; 2688 break; 2689 case 'w': 2690 type = XML_REGEXP_REALCHAR; 2691 break; 2692 case 'W': 2693 type = XML_REGEXP_NOTREALCHAR; 2694 break; 2695 } 2696 NEXT; 2697 if (ctxt->atom == NULL) { 2698 ctxt->atom = xmlRegNewAtom(ctxt, type); 2699 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 2700 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2701 type, 0, 0, NULL); 2702 } 2703 } 2704} 2705 2706/** 2707 * xmlFAParseCharRef: 2708 * ctxt: a regexp parser context 2709 * 2710 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 2711 */ 2712static int 2713xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 2714 int ret = 0, cur; 2715 2716 if ((CUR != '&') || (NXT(1) != '#')) 2717 return(-1); 2718 NEXT; 2719 NEXT; 2720 cur = CUR; 2721 if (cur == 'x') { 2722 NEXT; 2723 cur = CUR; 2724 if (((cur >= '0') && (cur <= '9')) || 2725 ((cur >= 'a') && (cur <= 'f')) || 2726 ((cur >= 'A') && (cur <= 'F'))) { 2727 while (((cur >= '0') && (cur <= '9')) || 2728 ((cur >= 'A') && (cur <= 'F'))) { 2729 if ((cur >= '0') && (cur <= '9')) 2730 ret = ret * 16 + cur - '0'; 2731 else if ((cur >= 'a') && (cur <= 'f')) 2732 ret = ret * 16 + 10 + (cur - 'a'); 2733 else 2734 ret = ret * 16 + 10 + (cur - 'A'); 2735 NEXT; 2736 cur = CUR; 2737 } 2738 } else { 2739 ERROR("Char ref: expecting [0-9A-F]"); 2740 return(-1); 2741 } 2742 } else { 2743 if ((cur >= '0') && (cur <= '9')) { 2744 while ((cur >= '0') && (cur <= '9')) { 2745 ret = ret * 10 + cur - '0'; 2746 NEXT; 2747 cur = CUR; 2748 } 2749 } else { 2750 ERROR("Char ref: expecting [0-9]"); 2751 return(-1); 2752 } 2753 } 2754 if (cur != ';') { 2755 ERROR("Char ref: expecting ';'"); 2756 return(-1); 2757 } else { 2758 NEXT; 2759 } 2760 return(ret); 2761} 2762 2763/** 2764 * xmlFAParseCharRange: 2765 * ctxt: a regexp parser context 2766 * 2767 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 2768 * [18] seRange ::= charOrEsc '-' charOrEsc 2769 * [20] charOrEsc ::= XmlChar | SingleCharEsc 2770 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 2771 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 2772 */ 2773static void 2774xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 2775 int cur; 2776 int start = -1; 2777 int end = -1; 2778 2779 if ((CUR == '&') && (NXT(1) == '#')) { 2780 end = start = xmlFAParseCharRef(ctxt); 2781 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2782 XML_REGEXP_CHARVAL, start, end, NULL); 2783 return; 2784 } 2785 cur = CUR; 2786 if (cur == '\\') { 2787 NEXT; 2788 cur = CUR; 2789 switch (cur) { 2790 case 'n': start = 0xA; break; 2791 case 'r': start = 0xD; break; 2792 case 't': start = 0x9; break; 2793 case '\\': case '|': case '.': case '-': case '^': case '?': 2794 case '*': case '+': case '{': case '}': case '(': case ')': 2795 case '[': case ']': 2796 start = cur; break; 2797 default: 2798 ERROR("Invalid escape value"); 2799 return; 2800 } 2801 end = start; 2802 } else if ((cur != 0x5B) && (cur != 0x5D)) { 2803 end = start = cur; 2804 } else { 2805 ERROR("Expecting a char range"); 2806 return; 2807 } 2808 NEXT; 2809 if (start == '-') { 2810 return; 2811 } 2812 cur = CUR; 2813 if (cur != '-') { 2814 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2815 XML_REGEXP_CHARVAL, start, end, NULL); 2816 return; 2817 } 2818 NEXT; 2819 cur = CUR; 2820 if (cur == '\\') { 2821 NEXT; 2822 cur = CUR; 2823 switch (cur) { 2824 case 'n': end = 0xA; break; 2825 case 'r': end = 0xD; break; 2826 case 't': end = 0x9; break; 2827 case '\\': case '|': case '.': case '-': case '^': case '?': 2828 case '*': case '+': case '{': case '}': case '(': case ')': 2829 case '[': case ']': 2830 end = cur; break; 2831 default: 2832 ERROR("Invalid escape value"); 2833 return; 2834 } 2835 } else if ((cur != 0x5B) && (cur != 0x5D)) { 2836 end = cur; 2837 } else { 2838 ERROR("Expecting the end of a char range"); 2839 return; 2840 } 2841 NEXT; 2842 /* TODO check that the values are acceptable character ranges for XML */ 2843 if (end < start) { 2844 ERROR("End of range is before start of range"); 2845 } else { 2846 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 2847 XML_REGEXP_CHARVAL, start, end, NULL); 2848 } 2849 return; 2850} 2851 2852/** 2853 * xmlFAParsePosCharGroup: 2854 * ctxt: a regexp parser context 2855 * 2856 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 2857 */ 2858static void 2859xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 2860 do { 2861 if ((CUR == '\\') || (CUR == '.')) { 2862 xmlFAParseCharClassEsc(ctxt); 2863 } else { 2864 xmlFAParseCharRange(ctxt); 2865 } 2866 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 2867 (ctxt->error == 0)); 2868} 2869 2870/** 2871 * xmlFAParseCharGroup: 2872 * ctxt: a regexp parser context 2873 * 2874 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 2875 * [15] negCharGroup ::= '^' posCharGroup 2876 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 2877 * [12] charClassExpr ::= '[' charGroup ']' 2878 */ 2879static void 2880xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 2881 int n = ctxt->neg; 2882 while ((CUR != ']') && (ctxt->error == 0)) { 2883 if (CUR == '^') { 2884 int neg = ctxt->neg; 2885 2886 NEXT; 2887 ctxt->neg = !ctxt->neg; 2888 xmlFAParsePosCharGroup(ctxt); 2889 ctxt->neg = neg; 2890 } else if (CUR == '-') { 2891 NEXT; 2892 ctxt->neg = !ctxt->neg; 2893 if (CUR != '[') { 2894 ERROR("charClassExpr: '[' expected"); 2895 break; 2896 } 2897 NEXT; 2898 xmlFAParseCharGroup(ctxt); 2899 if (CUR == ']') { 2900 NEXT; 2901 } else { 2902 ERROR("charClassExpr: ']' expected"); 2903 break; 2904 } 2905 break; 2906 } else if (CUR != ']') { 2907 xmlFAParsePosCharGroup(ctxt); 2908 } 2909 } 2910 ctxt->neg = n; 2911} 2912 2913/** 2914 * xmlFAParseCharClass: 2915 * ctxt: a regexp parser context 2916 * 2917 * [11] charClass ::= charClassEsc | charClassExpr 2918 * [12] charClassExpr ::= '[' charGroup ']' 2919 */ 2920static void 2921xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 2922 if (CUR == '[') { 2923 NEXT; 2924 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 2925 if (ctxt->atom == NULL) 2926 return; 2927 xmlFAParseCharGroup(ctxt); 2928 if (CUR == ']') { 2929 NEXT; 2930 } else { 2931 ERROR("xmlFAParseCharClass: ']' expected"); 2932 } 2933 } else { 2934 xmlFAParseCharClassEsc(ctxt); 2935 } 2936} 2937 2938/** 2939 * xmlFAParseQuantExact: 2940 * ctxt: a regexp parser context 2941 * 2942 * [8] QuantExact ::= [0-9]+ 2943 */ 2944static int 2945xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 2946 int ret = 0; 2947 int ok = 0; 2948 2949 while ((CUR >= '0') && (CUR <= '9')) { 2950 ret = ret * 10 + (CUR - '0'); 2951 ok = 1; 2952 NEXT; 2953 } 2954 if (ok != 1) { 2955 return(-1); 2956 } 2957 return(ret); 2958} 2959 2960/** 2961 * xmlFAParseQuantifier: 2962 * ctxt: a regexp parser context 2963 * 2964 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 2965 * [5] quantity ::= quantRange | quantMin | QuantExact 2966 * [6] quantRange ::= QuantExact ',' QuantExact 2967 * [7] quantMin ::= QuantExact ',' 2968 * [8] QuantExact ::= [0-9]+ 2969 */ 2970static int 2971xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 2972 int cur; 2973 2974 cur = CUR; 2975 if ((cur == '?') || (cur == '*') || (cur == '+')) { 2976 if (ctxt->atom != NULL) { 2977 if (cur == '?') 2978 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 2979 else if (cur == '*') 2980 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 2981 else if (cur == '+') 2982 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 2983 } 2984 NEXT; 2985 return(1); 2986 } 2987 if (cur == '{') { 2988 int min = 0, max = 0; 2989 2990 NEXT; 2991 cur = xmlFAParseQuantExact(ctxt); 2992 if (cur >= 0) 2993 min = cur; 2994 if (CUR == ',') { 2995 NEXT; 2996 cur = xmlFAParseQuantExact(ctxt); 2997 if (cur >= 0) 2998 max = cur; 2999 } 3000 if (CUR == '}') { 3001 NEXT; 3002 } else { 3003 ERROR("Unterminated quantifier"); 3004 } 3005 if (max == 0) 3006 max = min; 3007 if (ctxt->atom != NULL) { 3008 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 3009 ctxt->atom->min = min; 3010 ctxt->atom->max = max; 3011 } 3012 return(1); 3013 } 3014 return(0); 3015} 3016 3017/** 3018 * xmlFAParseAtom: 3019 * ctxt: a regexp parser context 3020 * 3021 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 3022 */ 3023static int 3024xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 3025 int codepoint, len; 3026 3027 codepoint = xmlFAIsChar(ctxt); 3028 if (codepoint > 0) { 3029 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 3030 if (ctxt->atom == NULL) 3031 return(-1); 3032 codepoint = CUR_SCHAR(ctxt->cur, len); 3033 ctxt->atom->codepoint = codepoint; 3034 NEXTL(len); 3035 return(1); 3036 } else if (CUR == '|') { 3037 return(0); 3038 } else if (CUR == 0) { 3039 return(0); 3040 } else if (CUR == ')') { 3041 return(0); 3042 } else if (CUR == '(') { 3043 xmlRegStatePtr start, oldend; 3044 3045 NEXT; 3046 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 3047 start = ctxt->state; 3048 oldend = ctxt->end; 3049 ctxt->end = NULL; 3050 ctxt->atom = NULL; 3051 xmlFAParseRegExp(ctxt, 0); 3052 if (CUR == ')') { 3053 NEXT; 3054 } else { 3055 ERROR("xmlFAParseAtom: expecting ')'"); 3056 } 3057 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 3058 if (ctxt->atom == NULL) 3059 return(-1); 3060 ctxt->atom->start = start; 3061 ctxt->atom->stop = ctxt->state; 3062 ctxt->end = oldend; 3063 return(1); 3064 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 3065 xmlFAParseCharClass(ctxt); 3066 return(1); 3067 } 3068 return(0); 3069} 3070 3071/** 3072 * xmlFAParsePiece: 3073 * ctxt: a regexp parser context 3074 * 3075 * [3] piece ::= atom quantifier? 3076 */ 3077static int 3078xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 3079 int ret; 3080 3081 ctxt->atom = NULL; 3082 ret = xmlFAParseAtom(ctxt); 3083 if (ret == 0) 3084 return(0); 3085 if (ctxt->atom == NULL) { 3086 ERROR("internal: no atom generated"); 3087 } 3088 xmlFAParseQuantifier(ctxt); 3089 return(1); 3090} 3091 3092/** 3093 * xmlFAParseBranch: 3094 * ctxt: a regexp parser context 3095 * first: is taht the first 3096 * 3097 * [2] branch ::= piece* 3098 */ 3099static void 3100xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) { 3101 xmlRegStatePtr previous; 3102 xmlRegAtomPtr prevatom = NULL; 3103 int ret; 3104 3105 previous = ctxt->state; 3106 ret = xmlFAParsePiece(ctxt); 3107 if (ret != 0) { 3108 if (first) { 3109 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); 3110 previous = ctxt->state; 3111 } else { 3112 prevatom = ctxt->atom; 3113 } 3114 ctxt->atom = NULL; 3115 } 3116 while ((ret != 0) && (ctxt->error == 0)) { 3117 ret = xmlFAParsePiece(ctxt); 3118 if (ret != 0) { 3119 if (first) { 3120 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); 3121 } else { 3122 xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom); 3123 prevatom = ctxt->atom; 3124 } 3125 previous = ctxt->state; 3126 ctxt->atom = NULL; 3127 } 3128 } 3129 if (!first) { 3130 xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom); 3131 } 3132} 3133 3134/** 3135 * xmlFAParseRegExp: 3136 * ctxt: a regexp parser context 3137 * top: is that the top-level expressions ? 3138 * 3139 * [1] regExp ::= branch ( '|' branch )* 3140 */ 3141static void 3142xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 3143 xmlRegStatePtr start, end, oldend; 3144 3145 oldend = ctxt->end; 3146 3147 start = ctxt->state; 3148 xmlFAParseBranch(ctxt, (ctxt->end == NULL)); 3149 if (CUR != '|') { 3150 ctxt->end = ctxt->state; 3151 return; 3152 } 3153 end = ctxt->state; 3154 while ((CUR == '|') && (ctxt->error == 0)) { 3155 NEXT; 3156 ctxt->state = start; 3157 ctxt->end = end; 3158 xmlFAParseBranch(ctxt, 0); 3159 } 3160 if (!top) 3161 ctxt->end = oldend; 3162} 3163 3164/************************************************************************ 3165 * * 3166 * The basic API * 3167 * * 3168 ************************************************************************/ 3169 3170/** 3171 * xmlRegexpPrint: 3172 * @output: the file for the output debug 3173 * @regexp: the compiled regexp 3174 * 3175 * Print the content of the compiled regular expression 3176 */ 3177void 3178xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 3179 int i; 3180 3181 fprintf(output, " regexp: "); 3182 if (regexp == NULL) { 3183 fprintf(output, "NULL\n"); 3184 return; 3185 } 3186 fprintf(output, "'%s' ", regexp->string); 3187 fprintf(output, "\n"); 3188 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 3189 for (i = 0;i < regexp->nbAtoms; i++) { 3190 fprintf(output, " %02d ", i); 3191 xmlRegPrintAtom(output, regexp->atoms[i]); 3192 } 3193 fprintf(output, "%d states:", regexp->nbStates); 3194 fprintf(output, "\n"); 3195 for (i = 0;i < regexp->nbStates; i++) { 3196 xmlRegPrintState(output, regexp->states[i]); 3197 } 3198 fprintf(output, "%d counters:\n", regexp->nbCounters); 3199 for (i = 0;i < regexp->nbCounters; i++) { 3200 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 3201 regexp->counters[i].max); 3202 } 3203} 3204 3205/** 3206 * xmlRegexpCompile: 3207 * @regexp: a regular expression string 3208 * 3209 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 3210 * Appendix F and build an automata suitable for testing strings against 3211 * that regular expression 3212 * 3213 * Returns the compiled expression or NULL in case of error 3214 */ 3215xmlRegexpPtr 3216xmlRegexpCompile(const xmlChar *regexp) { 3217 xmlRegexpPtr ret; 3218 xmlRegParserCtxtPtr ctxt; 3219 3220 ctxt = xmlRegNewParserCtxt(regexp); 3221 if (ctxt == NULL) 3222 return(NULL); 3223 3224 /* initialize the parser */ 3225 ctxt->end = NULL; 3226 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 3227 xmlRegStatePush(ctxt, ctxt->start); 3228 3229 /* parse the expression building an automata */ 3230 xmlFAParseRegExp(ctxt, 1); 3231 if (CUR != 0) { 3232 ERROR("xmlFAParseRegExp: extra characters"); 3233 } 3234 ctxt->end = ctxt->state; 3235 ctxt->start->type = XML_REGEXP_START_STATE; 3236 ctxt->end->type = XML_REGEXP_FINAL_STATE; 3237 3238 /* remove the Epsilon except for counted transitions */ 3239 xmlFAEliminateEpsilonTransitions(ctxt); 3240 3241 3242 if (ctxt->error != 0) { 3243 xmlRegFreeParserCtxt(ctxt); 3244 return(NULL); 3245 } 3246 ret = xmlRegEpxFromParse(ctxt); 3247 xmlRegFreeParserCtxt(ctxt); 3248 return(ret); 3249} 3250 3251/** 3252 * xmlRegexpExec: 3253 * @comp: the compiled regular expression 3254 * @content: the value to check against the regular expression 3255 * 3256 * Check if the regular expression generate the value 3257 * 3258 * Returns 1 if it matches, 0 if not and a negativa value in case of error 3259 */ 3260int 3261xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 3262 if ((comp == NULL) || (content == NULL)) 3263 return(-1); 3264 return(xmlFARegExec(comp, content)); 3265} 3266 3267/** 3268 * xmlRegFreeRegexp: 3269 * @regexp: the regexp 3270 * 3271 * Free a regexp 3272 */ 3273void 3274xmlRegFreeRegexp(xmlRegexpPtr regexp) { 3275 int i; 3276 if (regexp == NULL) 3277 return; 3278 3279 if (regexp->string != NULL) 3280 xmlFree(regexp->string); 3281 if (regexp->states != NULL) { 3282 for (i = 0;i < regexp->nbStates;i++) 3283 xmlRegFreeState(regexp->states[i]); 3284 xmlFree(regexp->states); 3285 } 3286 if (regexp->atoms != NULL) { 3287 for (i = 0;i < regexp->nbAtoms;i++) 3288 xmlRegFreeAtom(regexp->atoms[i]); 3289 xmlFree(regexp->atoms); 3290 } 3291 if (regexp->counters != NULL) 3292 xmlFree(regexp->counters); 3293 xmlFree(regexp); 3294} 3295 3296#ifdef LIBXML_AUTOMATA_ENABLED 3297/************************************************************************ 3298 * * 3299 * The Automata interface * 3300 * * 3301 ************************************************************************/ 3302 3303/** 3304 * xmlNewAutomata: 3305 * 3306 * Create a new automata 3307 * 3308 * Returns the new object or NULL in case of failure 3309 */ 3310xmlAutomataPtr 3311xmlNewAutomata(void) { 3312 xmlAutomataPtr ctxt; 3313 3314 ctxt = xmlRegNewParserCtxt(NULL); 3315 if (ctxt == NULL) 3316 return(NULL); 3317 3318 /* initialize the parser */ 3319 ctxt->end = NULL; 3320 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 3321 xmlRegStatePush(ctxt, ctxt->start); 3322 3323 return(ctxt); 3324} 3325 3326/** 3327 * xmlFreeAutomata: 3328 * @am: an automata 3329 * 3330 * Free an automata 3331 */ 3332void 3333xmlFreeAutomata(xmlAutomataPtr am) { 3334 if (am == NULL) 3335 return; 3336 xmlRegFreeParserCtxt(am); 3337} 3338 3339/** 3340 * xmlAutomataGetInitState: 3341 * @am: an automata 3342 * 3343 * Returns the initial state of the automata 3344 */ 3345xmlAutomataStatePtr 3346xmlAutomataGetInitState(xmlAutomataPtr am) { 3347 if (am == NULL) 3348 return(NULL); 3349 return(am->start); 3350} 3351 3352/** 3353 * xmlAutomataSetFinalState: 3354 * @am: an automata 3355 * @state: a state in this automata 3356 * 3357 * Makes that state a final state 3358 * 3359 * Returns 0 or -1 in case of error 3360 */ 3361int 3362xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 3363 if ((am == NULL) || (state == NULL)) 3364 return(-1); 3365 state->type = XML_REGEXP_FINAL_STATE; 3366 return(0); 3367} 3368 3369/** 3370 * xmlAutomataNewTransition: 3371 * @am: an automata 3372 * @from: the starting point of the transition 3373 * @to: the target point of the transition or NULL 3374 * @token: the input string associated to that transition 3375 * @data: data passed to the callback function if the transition is activated 3376 * 3377 * If @to is NULL, this create first a new target state in the automata 3378 * and then adds a transition from the @from state to the target state 3379 * activated by the value of @token 3380 * 3381 * Returns the target state or NULL in case of error 3382 */ 3383xmlAutomataStatePtr 3384xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 3385 xmlAutomataStatePtr to, const xmlChar *token, 3386 void *data) { 3387 xmlRegAtomPtr atom; 3388 3389 if ((am == NULL) || (from == NULL) || (token == NULL)) 3390 return(NULL); 3391 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 3392 atom->data = data; 3393 if (atom == NULL) 3394 return(NULL); 3395 atom->valuep = xmlStrdup(token); 3396 3397 xmlFAGenerateTransitions(am, from, to, atom); 3398 if (to == NULL) 3399 return(am->state); 3400 return(to); 3401} 3402 3403/** 3404 * xmlAutomataNewCountTrans: 3405 * @am: an automata 3406 * @from: the starting point of the transition 3407 * @to: the target point of the transition or NULL 3408 * @token: the input string associated to that transition 3409 * @min: the minimum successive occurences of token 3410 * @min: the maximum successive occurences of token 3411 * 3412 * If @to is NULL, this create first a new target state in the automata 3413 * and then adds a transition from the @from state to the target state 3414 * activated by a succession of input of value @token and whose number 3415 * is between @min and @max 3416 * 3417 * Returns the target state or NULL in case of error 3418 */ 3419xmlAutomataStatePtr 3420xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 3421 xmlAutomataStatePtr to, const xmlChar *token, 3422 int min, int max, void *data) { 3423 xmlRegAtomPtr atom; 3424 3425 if ((am == NULL) || (from == NULL) || (token == NULL)) 3426 return(NULL); 3427 if (min < 0) 3428 return(NULL); 3429 if ((max < min) || (max < 1)) 3430 return(NULL); 3431 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 3432 if (atom == NULL) 3433 return(NULL); 3434 atom->valuep = xmlStrdup(token); 3435 atom->data = data; 3436 if (min == 0) 3437 atom->min = 1; 3438 else 3439 atom->min = min; 3440 atom->max = max; 3441 3442 xmlFAGenerateTransitions(am, from, to, atom); 3443 if (to == NULL) 3444 to = am->state; 3445 if (to == NULL) 3446 return(NULL); 3447 if (min == 0) 3448 xmlFAGenerateEpsilonTransition(am, from, to); 3449 return(to); 3450} 3451 3452/** 3453 * xmlAutomataNewOnceTrans: 3454 * @am: an automata 3455 * @from: the starting point of the transition 3456 * @to: the target point of the transition or NULL 3457 * @token: the input string associated to that transition 3458 * @min: the minimum successive occurences of token 3459 * @min: the maximum successive occurences of token 3460 * 3461 * If @to is NULL, this create first a new target state in the automata 3462 * and then adds a transition from the @from state to the target state 3463 * activated by a succession of input of value @token and whose number 3464 * is between @min and @max, moreover that transistion can only be crossed 3465 * once. 3466 * 3467 * Returns the target state or NULL in case of error 3468 */ 3469xmlAutomataStatePtr 3470xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 3471 xmlAutomataStatePtr to, const xmlChar *token, 3472 int min, int max, void *data) { 3473 xmlRegAtomPtr atom; 3474 int counter; 3475 3476 if ((am == NULL) || (from == NULL) || (token == NULL)) 3477 return(NULL); 3478 if (min < 1) 3479 return(NULL); 3480 if ((max < min) || (max < 1)) 3481 return(NULL); 3482 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 3483 if (atom == NULL) 3484 return(NULL); 3485 atom->valuep = xmlStrdup(token); 3486 atom->data = data; 3487 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 3488 if (min == 0) 3489 atom->min = 1; 3490 else 3491 atom->min = min; 3492 atom->max = max; 3493 /* 3494 * associate a counter to the transition. 3495 */ 3496 counter = xmlRegGetCounter(am); 3497 am->counters[counter].min = 1; 3498 am->counters[counter].max = 1; 3499 3500 /* xmlFAGenerateTransitions(am, from, to, atom); */ 3501 if (to == NULL) { 3502 to = xmlRegNewState(am); 3503 xmlRegStatePush(am, to); 3504 } 3505 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 3506 xmlRegAtomPush(am, atom); 3507 am->state = to; 3508 if (to == NULL) 3509 to = am->state; 3510 if (to == NULL) 3511 return(NULL); 3512 return(to); 3513} 3514 3515/** 3516 * xmlAutomataNewState: 3517 * @am: an automata 3518 * 3519 * Create a new disconnected state in the automata 3520 * 3521 * Returns the new state or NULL in case of error 3522 */ 3523xmlAutomataStatePtr 3524xmlAutomataNewState(xmlAutomataPtr am) { 3525 xmlAutomataStatePtr to; 3526 3527 if (am == NULL) 3528 return(NULL); 3529 to = xmlRegNewState(am); 3530 xmlRegStatePush(am, to); 3531 return(to); 3532} 3533 3534/** 3535 * xmlAutomataNewTransition: 3536 * @am: an automata 3537 * @from: the starting point of the transition 3538 * @to: the target point of the transition or NULL 3539 * 3540 * If @to is NULL, this create first a new target state in the automata 3541 * and then adds a an epsilon transition from the @from state to the 3542 * target state 3543 * 3544 * Returns the target state or NULL in case of error 3545 */ 3546xmlAutomataStatePtr 3547xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 3548 xmlAutomataStatePtr to) { 3549 if ((am == NULL) || (from == NULL)) 3550 return(NULL); 3551 xmlFAGenerateEpsilonTransition(am, from, to); 3552 if (to == NULL) 3553 return(am->state); 3554 return(to); 3555} 3556 3557/** 3558 * xmlAutomataNewAllTrans: 3559 * @am: an automata 3560 * @from: the starting point of the transition 3561 * @to: the target point of the transition or NULL 3562 * 3563 * If @to is NULL, this create first a new target state in the automata 3564 * and then adds a an ALL transition from the @from state to the 3565 * target state. That transition is an epsilon transition allowed only when 3566 * all transitions from the @from node have been activated. 3567 * 3568 * Returns the target state or NULL in case of error 3569 */ 3570xmlAutomataStatePtr 3571xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 3572 xmlAutomataStatePtr to) { 3573 if ((am == NULL) || (from == NULL)) 3574 return(NULL); 3575 xmlFAGenerateAllTransition(am, from, to); 3576 if (to == NULL) 3577 return(am->state); 3578 return(to); 3579} 3580 3581/** 3582 * xmlAutomataNewCounter: 3583 * @am: an automata 3584 * @min: the minimal value on the counter 3585 * @max: the maximal value on the counter 3586 * 3587 * Create a new counter 3588 * 3589 * Returns the counter number or -1 in case of error 3590 */ 3591int 3592xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 3593 int ret; 3594 3595 if (am == NULL) 3596 return(-1); 3597 3598 ret = xmlRegGetCounter(am); 3599 if (ret < 0) 3600 return(-1); 3601 am->counters[ret].min = min; 3602 am->counters[ret].max = max; 3603 return(ret); 3604} 3605 3606/** 3607 * xmlAutomataNewCountedTrans: 3608 * @am: an automata 3609 * @from: the starting point of the transition 3610 * @to: the target point of the transition or NULL 3611 * @counter: the counter associated to that transition 3612 * 3613 * If @to is NULL, this create first a new target state in the automata 3614 * and then adds an epsilon transition from the @from state to the target state 3615 * which will increment the counter provided 3616 * 3617 * Returns the target state or NULL in case of error 3618 */ 3619xmlAutomataStatePtr 3620xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 3621 xmlAutomataStatePtr to, int counter) { 3622 if ((am == NULL) || (from == NULL) || (counter < 0)) 3623 return(NULL); 3624 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 3625 if (to == NULL) 3626 return(am->state); 3627 return(to); 3628} 3629 3630/** 3631 * xmlAutomataNewCounterTrans: 3632 * @am: an automata 3633 * @from: the starting point of the transition 3634 * @to: the target point of the transition or NULL 3635 * @counter: the counter associated to that transition 3636 * 3637 * If @to is NULL, this create first a new target state in the automata 3638 * and then adds an epsilon transition from the @from state to the target state 3639 * which will be allowed only if the counter is within the right range. 3640 * 3641 * Returns the target state or NULL in case of error 3642 */ 3643xmlAutomataStatePtr 3644xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 3645 xmlAutomataStatePtr to, int counter) { 3646 if ((am == NULL) || (from == NULL) || (counter < 0)) 3647 return(NULL); 3648 xmlFAGenerateCountedTransition(am, from, to, counter); 3649 if (to == NULL) 3650 return(am->state); 3651 return(to); 3652} 3653 3654/** 3655 * xmlAutomataCompile: 3656 * @am: an automata 3657 * 3658 * Compile the automata into a Reg Exp ready for being executed. 3659 * The automata should be free after this point. 3660 * 3661 * Returns the compiled regexp or NULL in case of error 3662 */ 3663xmlRegexpPtr 3664xmlAutomataCompile(xmlAutomataPtr am) { 3665 xmlRegexpPtr ret; 3666 3667 xmlFAEliminateEpsilonTransitions(am); 3668 ret = xmlRegEpxFromParse(am); 3669 3670 return(ret); 3671} 3672#endif /* LIBXML_AUTOMATA_ENABLED */ 3673#endif /* LIBXML_REGEXP_ENABLED */ 3674