1/* 2******************************************************************************* 3* Copyright (C) 2004-2015, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: uregex.cpp 7*/ 8 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13#include "unicode/regex.h" 14#include "unicode/uregex.h" 15#include "unicode/unistr.h" 16#include "unicode/ustring.h" 17#include "unicode/uchar.h" 18#include "unicode/uobject.h" 19#include "unicode/utf16.h" 20#include "cmemory.h" 21#include "uassert.h" 22#include "uhash.h" 23#include "umutex.h" 24#include "uvectr32.h" 25 26#include "regextxt.h" 27 28U_NAMESPACE_BEGIN 29 30#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 31 32struct RegularExpression: public UMemory { 33public: 34 RegularExpression(); 35 ~RegularExpression(); 36 int32_t fMagic; 37 RegexPattern *fPat; 38 u_atomic_int32_t *fPatRefCount; 39 UChar *fPatString; 40 int32_t fPatStringLen; 41 RegexMatcher *fMatcher; 42 const UChar *fText; // Text from setText() 43 int32_t fTextLength; // Length provided by user with setText(), which 44 // may be -1. 45 UBool fOwnsText; 46}; 47 48static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 49 50RegularExpression::RegularExpression() { 51 fMagic = REXP_MAGIC; 52 fPat = NULL; 53 fPatRefCount = NULL; 54 fPatString = NULL; 55 fPatStringLen = 0; 56 fMatcher = NULL; 57 fText = NULL; 58 fTextLength = 0; 59 fOwnsText = FALSE; 60} 61 62RegularExpression::~RegularExpression() { 63 delete fMatcher; 64 fMatcher = NULL; 65 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 66 delete fPat; 67 uprv_free(fPatString); 68 uprv_free((void *)fPatRefCount); 69 } 70 if (fOwnsText && fText!=NULL) { 71 uprv_free((void *)fText); 72 } 73 fMagic = 0; 74} 75 76U_NAMESPACE_END 77 78U_NAMESPACE_USE 79 80//---------------------------------------------------------------------------------------- 81// 82// validateRE Do boilerplate style checks on API function parameters. 83// Return TRUE if they look OK. 84//---------------------------------------------------------------------------------------- 85static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { 86 if (U_FAILURE(*status)) { 87 return FALSE; 88 } 89 if (re == NULL || re->fMagic != REXP_MAGIC) { 90 *status = U_ILLEGAL_ARGUMENT_ERROR; 91 return FALSE; 92 } 93 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 94 if (requiresText && re->fText == NULL && !re->fOwnsText) { 95 *status = U_REGEX_INVALID_STATE; 96 return FALSE; 97 } 98 return TRUE; 99} 100 101//---------------------------------------------------------------------------------------- 102// 103// uregex_open 104// 105//---------------------------------------------------------------------------------------- 106U_CAPI URegularExpression * U_EXPORT2 107uregex_open( const UChar *pattern, 108 int32_t patternLength, 109 uint32_t flags, 110 UParseError *pe, 111 UErrorCode *status) { 112 113 if (U_FAILURE(*status)) { 114 return NULL; 115 } 116 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 117 *status = U_ILLEGAL_ARGUMENT_ERROR; 118 return NULL; 119 } 120 int32_t actualPatLen = patternLength; 121 if (actualPatLen == -1) { 122 actualPatLen = u_strlen(pattern); 123 } 124 125 RegularExpression *re = new RegularExpression; 126 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); 127 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 128 if (re == NULL || refC == NULL || patBuf == NULL) { 129 *status = U_MEMORY_ALLOCATION_ERROR; 130 delete re; 131 uprv_free((void *)refC); 132 uprv_free(patBuf); 133 return NULL; 134 } 135 re->fPatRefCount = refC; 136 *re->fPatRefCount = 1; 137 138 // 139 // Make a copy of the pattern string, so we can return it later if asked. 140 // For compiling the pattern, we will use a UText wrapper around 141 // this local copy, to avoid making even more copies. 142 // 143 re->fPatString = patBuf; 144 re->fPatStringLen = patternLength; 145 u_memcpy(patBuf, pattern, actualPatLen); 146 patBuf[actualPatLen] = 0; 147 148 UText patText = UTEXT_INITIALIZER; 149 utext_openUChars(&patText, patBuf, patternLength, status); 150 151 // 152 // Compile the pattern 153 // 154 if (pe != NULL) { 155 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 156 } else { 157 re->fPat = RegexPattern::compile(&patText, flags, *status); 158 } 159 utext_close(&patText); 160 161 if (U_FAILURE(*status)) { 162 goto ErrorExit; 163 } 164 165 // 166 // Create the matcher object 167 // 168 re->fMatcher = re->fPat->matcher(*status); 169 if (U_SUCCESS(*status)) { 170 return (URegularExpression*)re; 171 } 172 173ErrorExit: 174 delete re; 175 return NULL; 176 177} 178 179//---------------------------------------------------------------------------------------- 180// 181// uregex_openUText 182// 183//---------------------------------------------------------------------------------------- 184U_CAPI URegularExpression * U_EXPORT2 185uregex_openUText(UText *pattern, 186 uint32_t flags, 187 UParseError *pe, 188 UErrorCode *status) { 189 190 if (U_FAILURE(*status)) { 191 return NULL; 192 } 193 if (pattern == NULL) { 194 *status = U_ILLEGAL_ARGUMENT_ERROR; 195 return NULL; 196 } 197 198 int64_t patternNativeLength = utext_nativeLength(pattern); 199 200 if (patternNativeLength == 0) { 201 *status = U_ILLEGAL_ARGUMENT_ERROR; 202 return NULL; 203 } 204 205 RegularExpression *re = new RegularExpression; 206 207 UErrorCode lengthStatus = U_ZERO_ERROR; 208 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 209 210 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); 211 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 212 if (re == NULL || refC == NULL || patBuf == NULL) { 213 *status = U_MEMORY_ALLOCATION_ERROR; 214 delete re; 215 uprv_free((void *)refC); 216 uprv_free(patBuf); 217 return NULL; 218 } 219 re->fPatRefCount = refC; 220 *re->fPatRefCount = 1; 221 222 // 223 // Make a copy of the pattern string, so we can return it later if asked. 224 // For compiling the pattern, we will use a read-only UText wrapper 225 // around this local copy, to avoid making even more copies. 226 // 227 re->fPatString = patBuf; 228 re->fPatStringLen = pattern16Length; 229 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 230 231 UText patText = UTEXT_INITIALIZER; 232 utext_openUChars(&patText, patBuf, pattern16Length, status); 233 234 // 235 // Compile the pattern 236 // 237 if (pe != NULL) { 238 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 239 } else { 240 re->fPat = RegexPattern::compile(&patText, flags, *status); 241 } 242 utext_close(&patText); 243 244 if (U_FAILURE(*status)) { 245 goto ErrorExit; 246 } 247 248 // 249 // Create the matcher object 250 // 251 re->fMatcher = re->fPat->matcher(*status); 252 if (U_SUCCESS(*status)) { 253 return (URegularExpression*)re; 254 } 255 256ErrorExit: 257 delete re; 258 return NULL; 259 260} 261 262//---------------------------------------------------------------------------------------- 263// 264// uregex_close 265// 266//---------------------------------------------------------------------------------------- 267U_CAPI void U_EXPORT2 268uregex_close(URegularExpression *re2) { 269 RegularExpression *re = (RegularExpression*)re2; 270 UErrorCode status = U_ZERO_ERROR; 271 if (validateRE(re, FALSE, &status) == FALSE) { 272 return; 273 } 274 delete re; 275} 276 277 278//---------------------------------------------------------------------------------------- 279// 280// uregex_clone 281// 282//---------------------------------------------------------------------------------------- 283U_CAPI URegularExpression * U_EXPORT2 284uregex_clone(const URegularExpression *source2, UErrorCode *status) { 285 RegularExpression *source = (RegularExpression*)source2; 286 if (validateRE(source, FALSE, status) == FALSE) { 287 return NULL; 288 } 289 290 RegularExpression *clone = new RegularExpression; 291 if (clone == NULL) { 292 *status = U_MEMORY_ALLOCATION_ERROR; 293 return NULL; 294 } 295 296 clone->fMatcher = source->fPat->matcher(*status); 297 if (U_FAILURE(*status)) { 298 delete clone; 299 return NULL; 300 } 301 302 clone->fPat = source->fPat; 303 clone->fPatRefCount = source->fPatRefCount; 304 clone->fPatString = source->fPatString; 305 clone->fPatStringLen = source->fPatStringLen; 306 umtx_atomic_inc(source->fPatRefCount); 307 // Note: fText is not cloned. 308 309 return (URegularExpression*)clone; 310} 311 312 313 314 315//------------------------------------------------------------------------------ 316// 317// uregex_pattern 318// 319//------------------------------------------------------------------------------ 320U_CAPI const UChar * U_EXPORT2 321uregex_pattern(const URegularExpression *regexp2, 322 int32_t *patLength, 323 UErrorCode *status) { 324 RegularExpression *regexp = (RegularExpression*)regexp2; 325 326 if (validateRE(regexp, FALSE, status) == FALSE) { 327 return NULL; 328 } 329 if (patLength != NULL) { 330 *patLength = regexp->fPatStringLen; 331 } 332 return regexp->fPatString; 333} 334 335 336//------------------------------------------------------------------------------ 337// 338// uregex_patternUText 339// 340//------------------------------------------------------------------------------ 341U_CAPI UText * U_EXPORT2 342uregex_patternUText(const URegularExpression *regexp2, 343 UErrorCode *status) { 344 RegularExpression *regexp = (RegularExpression*)regexp2; 345 return regexp->fPat->patternText(*status); 346} 347 348 349//------------------------------------------------------------------------------ 350// 351// uregex_flags 352// 353//------------------------------------------------------------------------------ 354U_CAPI int32_t U_EXPORT2 355uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { 356 RegularExpression *regexp = (RegularExpression*)regexp2; 357 if (validateRE(regexp, FALSE, status) == FALSE) { 358 return 0; 359 } 360 int32_t flags = regexp->fPat->flags(); 361 return flags; 362} 363 364 365//------------------------------------------------------------------------------ 366// 367// uregex_setText 368// 369//------------------------------------------------------------------------------ 370U_CAPI void U_EXPORT2 371uregex_setText(URegularExpression *regexp2, 372 const UChar *text, 373 int32_t textLength, 374 UErrorCode *status) { 375 RegularExpression *regexp = (RegularExpression*)regexp2; 376 if (validateRE(regexp, FALSE, status) == FALSE) { 377 return; 378 } 379 if (text == NULL || textLength < -1) { 380 *status = U_ILLEGAL_ARGUMENT_ERROR; 381 return; 382 } 383 384 if (regexp->fOwnsText && regexp->fText != NULL) { 385 uprv_free((void *)regexp->fText); 386 } 387 388 regexp->fText = text; 389 regexp->fTextLength = textLength; 390 regexp->fOwnsText = FALSE; 391 392 UText input = UTEXT_INITIALIZER; 393 utext_openUChars(&input, text, textLength, status); 394 regexp->fMatcher->reset(&input); 395 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 396} 397 398 399//------------------------------------------------------------------------------ 400// 401// uregex_setUText 402// 403//------------------------------------------------------------------------------ 404U_CAPI void U_EXPORT2 405uregex_setUText(URegularExpression *regexp2, 406 UText *text, 407 UErrorCode *status) { 408 RegularExpression *regexp = (RegularExpression*)regexp2; 409 if (validateRE(regexp, FALSE, status) == FALSE) { 410 return; 411 } 412 if (text == NULL) { 413 *status = U_ILLEGAL_ARGUMENT_ERROR; 414 return; 415 } 416 417 if (regexp->fOwnsText && regexp->fText != NULL) { 418 uprv_free((void *)regexp->fText); 419 } 420 421 regexp->fText = NULL; // only fill it in on request 422 regexp->fTextLength = -1; 423 regexp->fOwnsText = TRUE; 424 regexp->fMatcher->reset(text); 425} 426 427 428 429//------------------------------------------------------------------------------ 430// 431// uregex_getText 432// 433//------------------------------------------------------------------------------ 434U_CAPI const UChar * U_EXPORT2 435uregex_getText(URegularExpression *regexp2, 436 int32_t *textLength, 437 UErrorCode *status) { 438 RegularExpression *regexp = (RegularExpression*)regexp2; 439 if (validateRE(regexp, FALSE, status) == FALSE) { 440 return NULL; 441 } 442 443 if (regexp->fText == NULL) { 444 // need to fill in the text 445 UText *inputText = regexp->fMatcher->inputText(); 446 int64_t inputNativeLength = utext_nativeLength(inputText); 447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 448 regexp->fText = inputText->chunkContents; 449 regexp->fTextLength = (int32_t)inputNativeLength; 450 regexp->fOwnsText = FALSE; // because the UText owns it 451 } else { 452 UErrorCode lengthStatus = U_ZERO_ERROR; 453 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 454 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 455 456 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 457 regexp->fText = inputChars; 458 regexp->fOwnsText = TRUE; // should already be set but just in case 459 } 460 } 461 462 if (textLength != NULL) { 463 *textLength = regexp->fTextLength; 464 } 465 return regexp->fText; 466} 467 468 469//------------------------------------------------------------------------------ 470// 471// uregex_getUText 472// 473//------------------------------------------------------------------------------ 474U_CAPI UText * U_EXPORT2 475uregex_getUText(URegularExpression *regexp2, 476 UText *dest, 477 UErrorCode *status) { 478 RegularExpression *regexp = (RegularExpression*)regexp2; 479 if (validateRE(regexp, FALSE, status) == FALSE) { 480 return dest; 481 } 482 return regexp->fMatcher->getInput(dest, *status); 483} 484 485 486//------------------------------------------------------------------------------ 487// 488// uregex_refreshUText 489// 490//------------------------------------------------------------------------------ 491U_CAPI void U_EXPORT2 492uregex_refreshUText(URegularExpression *regexp2, 493 UText *text, 494 UErrorCode *status) { 495 RegularExpression *regexp = (RegularExpression*)regexp2; 496 if (validateRE(regexp, FALSE, status) == FALSE) { 497 return; 498 } 499 regexp->fMatcher->refreshInputText(text, *status); 500} 501 502 503//------------------------------------------------------------------------------ 504// 505// uregex_matches 506// 507//------------------------------------------------------------------------------ 508U_CAPI UBool U_EXPORT2 509uregex_matches(URegularExpression *regexp2, 510 int32_t startIndex, 511 UErrorCode *status) { 512 return uregex_matches64( regexp2, (int64_t)startIndex, status); 513} 514 515U_CAPI UBool U_EXPORT2 516uregex_matches64(URegularExpression *regexp2, 517 int64_t startIndex, 518 UErrorCode *status) { 519 RegularExpression *regexp = (RegularExpression*)regexp2; 520 UBool result = FALSE; 521 if (validateRE(regexp, TRUE, status) == FALSE) { 522 return result; 523 } 524 if (startIndex == -1) { 525 result = regexp->fMatcher->matches(*status); 526 } else { 527 result = regexp->fMatcher->matches(startIndex, *status); 528 } 529 return result; 530} 531 532 533//------------------------------------------------------------------------------ 534// 535// uregex_lookingAt 536// 537//------------------------------------------------------------------------------ 538U_CAPI UBool U_EXPORT2 539uregex_lookingAt(URegularExpression *regexp2, 540 int32_t startIndex, 541 UErrorCode *status) { 542 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); 543} 544 545U_CAPI UBool U_EXPORT2 546uregex_lookingAt64(URegularExpression *regexp2, 547 int64_t startIndex, 548 UErrorCode *status) { 549 RegularExpression *regexp = (RegularExpression*)regexp2; 550 UBool result = FALSE; 551 if (validateRE(regexp, TRUE, status) == FALSE) { 552 return result; 553 } 554 if (startIndex == -1) { 555 result = regexp->fMatcher->lookingAt(*status); 556 } else { 557 result = regexp->fMatcher->lookingAt(startIndex, *status); 558 } 559 return result; 560} 561 562 563 564//------------------------------------------------------------------------------ 565// 566// uregex_find 567// 568//------------------------------------------------------------------------------ 569U_CAPI UBool U_EXPORT2 570uregex_find(URegularExpression *regexp2, 571 int32_t startIndex, 572 UErrorCode *status) { 573 return uregex_find64( regexp2, (int64_t)startIndex, status); 574} 575 576U_CAPI UBool U_EXPORT2 577uregex_find64(URegularExpression *regexp2, 578 int64_t startIndex, 579 UErrorCode *status) { 580 RegularExpression *regexp = (RegularExpression*)regexp2; 581 UBool result = FALSE; 582 if (validateRE(regexp, TRUE, status) == FALSE) { 583 return result; 584 } 585 if (startIndex == -1) { 586 regexp->fMatcher->resetPreserveRegion(); 587 result = regexp->fMatcher->find(*status); 588 } else { 589 result = regexp->fMatcher->find(startIndex, *status); 590 } 591 return result; 592} 593 594 595//------------------------------------------------------------------------------ 596// 597// uregex_findNext 598// 599//------------------------------------------------------------------------------ 600U_CAPI UBool U_EXPORT2 601uregex_findNext(URegularExpression *regexp2, 602 UErrorCode *status) { 603 RegularExpression *regexp = (RegularExpression*)regexp2; 604 if (validateRE(regexp, TRUE, status) == FALSE) { 605 return FALSE; 606 } 607 UBool result = regexp->fMatcher->find(*status); 608 return result; 609} 610 611//------------------------------------------------------------------------------ 612// 613// uregex_groupCount 614// 615//------------------------------------------------------------------------------ 616U_CAPI int32_t U_EXPORT2 617uregex_groupCount(URegularExpression *regexp2, 618 UErrorCode *status) { 619 RegularExpression *regexp = (RegularExpression*)regexp2; 620 if (validateRE(regexp, FALSE, status) == FALSE) { 621 return 0; 622 } 623 int32_t result = regexp->fMatcher->groupCount(); 624 return result; 625} 626 627 628//------------------------------------------------------------------------------ 629// 630// uregex_groupNumberFromName 631// 632//------------------------------------------------------------------------------ 633int32_t 634uregex_groupNumberFromName(URegularExpression *regexp2, 635 const UChar *groupName, 636 int32_t nameLength, 637 UErrorCode *status) { 638 RegularExpression *regexp = (RegularExpression*)regexp2; 639 if (validateRE(regexp, FALSE, status) == FALSE) { 640 return 0; 641 } 642 int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); 643 return result; 644} 645 646int32_t 647uregex_groupNumberFromCName(URegularExpression *regexp2, 648 const char *groupName, 649 int32_t nameLength, 650 UErrorCode *status) { 651 RegularExpression *regexp = (RegularExpression*)regexp2; 652 if (validateRE(regexp, FALSE, status) == FALSE) { 653 return 0; 654 } 655 return regexp->fPat->groupNumberFromName(groupName, nameLength, *status); 656} 657 658//------------------------------------------------------------------------------ 659// 660// uregex_group 661// 662//------------------------------------------------------------------------------ 663U_CAPI int32_t U_EXPORT2 664uregex_group(URegularExpression *regexp2, 665 int32_t groupNum, 666 UChar *dest, 667 int32_t destCapacity, 668 UErrorCode *status) { 669 RegularExpression *regexp = (RegularExpression*)regexp2; 670 if (validateRE(regexp, TRUE, status) == FALSE) { 671 return 0; 672 } 673 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 674 *status = U_ILLEGAL_ARGUMENT_ERROR; 675 return 0; 676 } 677 678 if (destCapacity == 0 || regexp->fText != NULL) { 679 // If preflighting or if we already have the text as UChars, 680 // this is a little cheaper than extracting from the UText 681 682 // 683 // Pick up the range of characters from the matcher 684 // 685 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 686 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 687 if (U_FAILURE(*status)) { 688 return 0; 689 } 690 691 // 692 // Trim length based on buffer capacity 693 // 694 int32_t fullLength = endIx - startIx; 695 int32_t copyLength = fullLength; 696 if (copyLength < destCapacity) { 697 dest[copyLength] = 0; 698 } else if (copyLength == destCapacity) { 699 *status = U_STRING_NOT_TERMINATED_WARNING; 700 } else { 701 copyLength = destCapacity; 702 *status = U_BUFFER_OVERFLOW_ERROR; 703 } 704 705 // 706 // Copy capture group to user's buffer 707 // 708 if (copyLength > 0) { 709 u_memcpy(dest, ®exp->fText[startIx], copyLength); 710 } 711 return fullLength; 712 } else { 713 int64_t start = regexp->fMatcher->start64(groupNum, *status); 714 int64_t limit = regexp->fMatcher->end64(groupNum, *status); 715 if (U_FAILURE(*status)) { 716 return 0; 717 } 718 // Note edge cases: 719 // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. 720 // Zero Length Match: start == end. 721 int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); 722 return length; 723 } 724 725} 726 727 728//------------------------------------------------------------------------------ 729// 730// uregex_groupUText 731// 732//------------------------------------------------------------------------------ 733U_CAPI UText * U_EXPORT2 734uregex_groupUText(URegularExpression *regexp2, 735 int32_t groupNum, 736 UText *dest, 737 int64_t *groupLength, 738 UErrorCode *status) { 739 RegularExpression *regexp = (RegularExpression*)regexp2; 740 if (validateRE(regexp, TRUE, status) == FALSE) { 741 UErrorCode emptyTextStatus = U_ZERO_ERROR; 742 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 743 } 744 745 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); 746} 747 748//------------------------------------------------------------------------------ 749// 750// uregex_start 751// 752//------------------------------------------------------------------------------ 753U_CAPI int32_t U_EXPORT2 754uregex_start(URegularExpression *regexp2, 755 int32_t groupNum, 756 UErrorCode *status) { 757 return (int32_t)uregex_start64( regexp2, groupNum, status); 758} 759 760U_CAPI int64_t U_EXPORT2 761uregex_start64(URegularExpression *regexp2, 762 int32_t groupNum, 763 UErrorCode *status) { 764 RegularExpression *regexp = (RegularExpression*)regexp2; 765 if (validateRE(regexp, TRUE, status) == FALSE) { 766 return 0; 767 } 768 int32_t result = regexp->fMatcher->start(groupNum, *status); 769 return result; 770} 771 772//------------------------------------------------------------------------------ 773// 774// uregex_end 775// 776//------------------------------------------------------------------------------ 777U_CAPI int32_t U_EXPORT2 778uregex_end(URegularExpression *regexp2, 779 int32_t groupNum, 780 UErrorCode *status) { 781 return (int32_t)uregex_end64( regexp2, groupNum, status); 782} 783 784U_CAPI int64_t U_EXPORT2 785uregex_end64(URegularExpression *regexp2, 786 int32_t groupNum, 787 UErrorCode *status) { 788 RegularExpression *regexp = (RegularExpression*)regexp2; 789 if (validateRE(regexp, TRUE, status) == FALSE) { 790 return 0; 791 } 792 int32_t result = regexp->fMatcher->end(groupNum, *status); 793 return result; 794} 795 796//------------------------------------------------------------------------------ 797// 798// uregex_reset 799// 800//------------------------------------------------------------------------------ 801U_CAPI void U_EXPORT2 802uregex_reset(URegularExpression *regexp2, 803 int32_t index, 804 UErrorCode *status) { 805 uregex_reset64( regexp2, (int64_t)index, status); 806} 807 808U_CAPI void U_EXPORT2 809uregex_reset64(URegularExpression *regexp2, 810 int64_t index, 811 UErrorCode *status) { 812 RegularExpression *regexp = (RegularExpression*)regexp2; 813 if (validateRE(regexp, TRUE, status) == FALSE) { 814 return; 815 } 816 regexp->fMatcher->reset(index, *status); 817} 818 819 820//------------------------------------------------------------------------------ 821// 822// uregex_setRegion 823// 824//------------------------------------------------------------------------------ 825U_CAPI void U_EXPORT2 826uregex_setRegion(URegularExpression *regexp2, 827 int32_t regionStart, 828 int32_t regionLimit, 829 UErrorCode *status) { 830 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); 831} 832 833U_CAPI void U_EXPORT2 834uregex_setRegion64(URegularExpression *regexp2, 835 int64_t regionStart, 836 int64_t regionLimit, 837 UErrorCode *status) { 838 RegularExpression *regexp = (RegularExpression*)regexp2; 839 if (validateRE(regexp, TRUE, status) == FALSE) { 840 return; 841 } 842 regexp->fMatcher->region(regionStart, regionLimit, *status); 843} 844 845 846//------------------------------------------------------------------------------ 847// 848// uregex_setRegionAndStart 849// 850//------------------------------------------------------------------------------ 851U_CAPI void U_EXPORT2 852uregex_setRegionAndStart(URegularExpression *regexp2, 853 int64_t regionStart, 854 int64_t regionLimit, 855 int64_t startIndex, 856 UErrorCode *status) { 857 RegularExpression *regexp = (RegularExpression*)regexp2; 858 if (validateRE(regexp, TRUE, status) == FALSE) { 859 return; 860 } 861 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); 862} 863 864//------------------------------------------------------------------------------ 865// 866// uregex_regionStart 867// 868//------------------------------------------------------------------------------ 869U_CAPI int32_t U_EXPORT2 870uregex_regionStart(const URegularExpression *regexp2, 871 UErrorCode *status) { 872 return (int32_t)uregex_regionStart64(regexp2, status); 873} 874 875U_CAPI int64_t U_EXPORT2 876uregex_regionStart64(const URegularExpression *regexp2, 877 UErrorCode *status) { 878 RegularExpression *regexp = (RegularExpression*)regexp2; 879 if (validateRE(regexp, TRUE, status) == FALSE) { 880 return 0; 881 } 882 return regexp->fMatcher->regionStart(); 883} 884 885 886//------------------------------------------------------------------------------ 887// 888// uregex_regionEnd 889// 890//------------------------------------------------------------------------------ 891U_CAPI int32_t U_EXPORT2 892uregex_regionEnd(const URegularExpression *regexp2, 893 UErrorCode *status) { 894 return (int32_t)uregex_regionEnd64(regexp2, status); 895} 896 897U_CAPI int64_t U_EXPORT2 898uregex_regionEnd64(const URegularExpression *regexp2, 899 UErrorCode *status) { 900 RegularExpression *regexp = (RegularExpression*)regexp2; 901 if (validateRE(regexp, TRUE, status) == FALSE) { 902 return 0; 903 } 904 return regexp->fMatcher->regionEnd(); 905} 906 907 908//------------------------------------------------------------------------------ 909// 910// uregex_hasTransparentBounds 911// 912//------------------------------------------------------------------------------ 913U_CAPI UBool U_EXPORT2 914uregex_hasTransparentBounds(const URegularExpression *regexp2, 915 UErrorCode *status) { 916 RegularExpression *regexp = (RegularExpression*)regexp2; 917 if (validateRE(regexp, FALSE, status) == FALSE) { 918 return FALSE; 919 } 920 return regexp->fMatcher->hasTransparentBounds(); 921} 922 923 924//------------------------------------------------------------------------------ 925// 926// uregex_useTransparentBounds 927// 928//------------------------------------------------------------------------------ 929U_CAPI void U_EXPORT2 930uregex_useTransparentBounds(URegularExpression *regexp2, 931 UBool b, 932 UErrorCode *status) { 933 RegularExpression *regexp = (RegularExpression*)regexp2; 934 if (validateRE(regexp, FALSE, status) == FALSE) { 935 return; 936 } 937 regexp->fMatcher->useTransparentBounds(b); 938} 939 940 941//------------------------------------------------------------------------------ 942// 943// uregex_hasAnchoringBounds 944// 945//------------------------------------------------------------------------------ 946U_CAPI UBool U_EXPORT2 947uregex_hasAnchoringBounds(const URegularExpression *regexp2, 948 UErrorCode *status) { 949 RegularExpression *regexp = (RegularExpression*)regexp2; 950 if (validateRE(regexp, FALSE, status) == FALSE) { 951 return FALSE; 952 } 953 return regexp->fMatcher->hasAnchoringBounds(); 954} 955 956 957//------------------------------------------------------------------------------ 958// 959// uregex_useAnchoringBounds 960// 961//------------------------------------------------------------------------------ 962U_CAPI void U_EXPORT2 963uregex_useAnchoringBounds(URegularExpression *regexp2, 964 UBool b, 965 UErrorCode *status) { 966 RegularExpression *regexp = (RegularExpression*)regexp2; 967 if (validateRE(regexp, FALSE, status) == FALSE) { 968 return; 969 } 970 regexp->fMatcher->useAnchoringBounds(b); 971} 972 973 974//------------------------------------------------------------------------------ 975// 976// uregex_hitEnd 977// 978//------------------------------------------------------------------------------ 979U_CAPI UBool U_EXPORT2 980uregex_hitEnd(const URegularExpression *regexp2, 981 UErrorCode *status) { 982 RegularExpression *regexp = (RegularExpression*)regexp2; 983 if (validateRE(regexp, TRUE, status) == FALSE) { 984 return FALSE; 985 } 986 return regexp->fMatcher->hitEnd(); 987} 988 989 990//------------------------------------------------------------------------------ 991// 992// uregex_requireEnd 993// 994//------------------------------------------------------------------------------ 995U_CAPI UBool U_EXPORT2 996uregex_requireEnd(const URegularExpression *regexp2, 997 UErrorCode *status) { 998 RegularExpression *regexp = (RegularExpression*)regexp2; 999 if (validateRE(regexp, TRUE, status) == FALSE) { 1000 return FALSE; 1001 } 1002 return regexp->fMatcher->requireEnd(); 1003} 1004 1005 1006//------------------------------------------------------------------------------ 1007// 1008// uregex_setTimeLimit 1009// 1010//------------------------------------------------------------------------------ 1011U_CAPI void U_EXPORT2 1012uregex_setTimeLimit(URegularExpression *regexp2, 1013 int32_t limit, 1014 UErrorCode *status) { 1015 RegularExpression *regexp = (RegularExpression*)regexp2; 1016 if (validateRE(regexp, FALSE, status)) { 1017 regexp->fMatcher->setTimeLimit(limit, *status); 1018 } 1019} 1020 1021 1022 1023//------------------------------------------------------------------------------ 1024// 1025// uregex_getTimeLimit 1026// 1027//------------------------------------------------------------------------------ 1028U_CAPI int32_t U_EXPORT2 1029uregex_getTimeLimit(const URegularExpression *regexp2, 1030 UErrorCode *status) { 1031 int32_t retVal = 0; 1032 RegularExpression *regexp = (RegularExpression*)regexp2; 1033 if (validateRE(regexp, FALSE, status)) { 1034 retVal = regexp->fMatcher->getTimeLimit(); 1035 } 1036 return retVal; 1037} 1038 1039 1040 1041//------------------------------------------------------------------------------ 1042// 1043// uregex_setStackLimit 1044// 1045//------------------------------------------------------------------------------ 1046U_CAPI void U_EXPORT2 1047uregex_setStackLimit(URegularExpression *regexp2, 1048 int32_t limit, 1049 UErrorCode *status) { 1050 RegularExpression *regexp = (RegularExpression*)regexp2; 1051 if (validateRE(regexp, FALSE, status)) { 1052 regexp->fMatcher->setStackLimit(limit, *status); 1053 } 1054} 1055 1056 1057 1058//------------------------------------------------------------------------------ 1059// 1060// uregex_getStackLimit 1061// 1062//------------------------------------------------------------------------------ 1063U_CAPI int32_t U_EXPORT2 1064uregex_getStackLimit(const URegularExpression *regexp2, 1065 UErrorCode *status) { 1066 int32_t retVal = 0; 1067 RegularExpression *regexp = (RegularExpression*)regexp2; 1068 if (validateRE(regexp, FALSE, status)) { 1069 retVal = regexp->fMatcher->getStackLimit(); 1070 } 1071 return retVal; 1072} 1073 1074 1075//------------------------------------------------------------------------------ 1076// 1077// uregex_setMatchCallback 1078// 1079//------------------------------------------------------------------------------ 1080U_CAPI void U_EXPORT2 1081uregex_setMatchCallback(URegularExpression *regexp2, 1082 URegexMatchCallback *callback, 1083 const void *context, 1084 UErrorCode *status) { 1085 RegularExpression *regexp = (RegularExpression*)regexp2; 1086 if (validateRE(regexp, FALSE, status)) { 1087 regexp->fMatcher->setMatchCallback(callback, context, *status); 1088 } 1089} 1090 1091 1092//------------------------------------------------------------------------------ 1093// 1094// uregex_getMatchCallback 1095// 1096//------------------------------------------------------------------------------ 1097U_CAPI void U_EXPORT2 1098uregex_getMatchCallback(const URegularExpression *regexp2, 1099 URegexMatchCallback **callback, 1100 const void **context, 1101 UErrorCode *status) { 1102 RegularExpression *regexp = (RegularExpression*)regexp2; 1103 if (validateRE(regexp, FALSE, status)) { 1104 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1105 } 1106} 1107 1108 1109//------------------------------------------------------------------------------ 1110// 1111// uregex_setMatchProgressCallback 1112// 1113//------------------------------------------------------------------------------ 1114U_CAPI void U_EXPORT2 1115uregex_setFindProgressCallback(URegularExpression *regexp2, 1116 URegexFindProgressCallback *callback, 1117 const void *context, 1118 UErrorCode *status) { 1119 RegularExpression *regexp = (RegularExpression*)regexp2; 1120 if (validateRE(regexp, FALSE, status)) { 1121 regexp->fMatcher->setFindProgressCallback(callback, context, *status); 1122 } 1123} 1124 1125 1126//------------------------------------------------------------------------------ 1127// 1128// uregex_getMatchCallback 1129// 1130//------------------------------------------------------------------------------ 1131U_CAPI void U_EXPORT2 1132uregex_getFindProgressCallback(const URegularExpression *regexp2, 1133 URegexFindProgressCallback **callback, 1134 const void **context, 1135 UErrorCode *status) { 1136 RegularExpression *regexp = (RegularExpression*)regexp2; 1137 if (validateRE(regexp, FALSE, status)) { 1138 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); 1139 } 1140} 1141 1142 1143//------------------------------------------------------------------------------ 1144// 1145// uregex_replaceAll 1146// 1147//------------------------------------------------------------------------------ 1148U_CAPI int32_t U_EXPORT2 1149uregex_replaceAll(URegularExpression *regexp2, 1150 const UChar *replacementText, 1151 int32_t replacementLength, 1152 UChar *destBuf, 1153 int32_t destCapacity, 1154 UErrorCode *status) { 1155 RegularExpression *regexp = (RegularExpression*)regexp2; 1156 if (validateRE(regexp, TRUE, status) == FALSE) { 1157 return 0; 1158 } 1159 if (replacementText == NULL || replacementLength < -1 || 1160 (destBuf == NULL && destCapacity > 0) || 1161 destCapacity < 0) { 1162 *status = U_ILLEGAL_ARGUMENT_ERROR; 1163 return 0; 1164 } 1165 1166 int32_t len = 0; 1167 1168 uregex_reset(regexp2, 0, status); 1169 1170 // Note: Seperate error code variables for findNext() and appendReplacement() 1171 // are used so that destination buffer overflow errors 1172 // in appendReplacement won't stop findNext() from working. 1173 // appendReplacement() and appendTail() special case incoming buffer 1174 // overflow errors, continuing to return the correct length. 1175 UErrorCode findStatus = *status; 1176 while (uregex_findNext(regexp2, &findStatus)) { 1177 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1178 &destBuf, &destCapacity, status); 1179 } 1180 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1181 1182 if (U_FAILURE(findStatus)) { 1183 // If anything went wrong with the findNext(), make that error trump 1184 // whatever may have happened with the append() operations. 1185 // Errors in findNext() are not expected. 1186 *status = findStatus; 1187 } 1188 1189 return len; 1190} 1191 1192 1193//------------------------------------------------------------------------------ 1194// 1195// uregex_replaceAllUText 1196// 1197//------------------------------------------------------------------------------ 1198U_CAPI UText * U_EXPORT2 1199uregex_replaceAllUText(URegularExpression *regexp2, 1200 UText *replacementText, 1201 UText *dest, 1202 UErrorCode *status) { 1203 RegularExpression *regexp = (RegularExpression*)regexp2; 1204 if (validateRE(regexp, TRUE, status) == FALSE) { 1205 return 0; 1206 } 1207 if (replacementText == NULL) { 1208 *status = U_ILLEGAL_ARGUMENT_ERROR; 1209 return 0; 1210 } 1211 1212 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1213 return dest; 1214} 1215 1216 1217//------------------------------------------------------------------------------ 1218// 1219// uregex_replaceFirst 1220// 1221//------------------------------------------------------------------------------ 1222U_CAPI int32_t U_EXPORT2 1223uregex_replaceFirst(URegularExpression *regexp2, 1224 const UChar *replacementText, 1225 int32_t replacementLength, 1226 UChar *destBuf, 1227 int32_t destCapacity, 1228 UErrorCode *status) { 1229 RegularExpression *regexp = (RegularExpression*)regexp2; 1230 if (validateRE(regexp, TRUE, status) == FALSE) { 1231 return 0; 1232 } 1233 if (replacementText == NULL || replacementLength < -1 || 1234 (destBuf == NULL && destCapacity > 0) || 1235 destCapacity < 0) { 1236 *status = U_ILLEGAL_ARGUMENT_ERROR; 1237 return 0; 1238 } 1239 1240 int32_t len = 0; 1241 UBool findSucceeded; 1242 uregex_reset(regexp2, 0, status); 1243 findSucceeded = uregex_find(regexp2, 0, status); 1244 if (findSucceeded) { 1245 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1246 &destBuf, &destCapacity, status); 1247 } 1248 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1249 1250 return len; 1251} 1252 1253 1254//------------------------------------------------------------------------------ 1255// 1256// uregex_replaceFirstUText 1257// 1258//------------------------------------------------------------------------------ 1259U_CAPI UText * U_EXPORT2 1260uregex_replaceFirstUText(URegularExpression *regexp2, 1261 UText *replacementText, 1262 UText *dest, 1263 UErrorCode *status) { 1264 RegularExpression *regexp = (RegularExpression*)regexp2; 1265 if (validateRE(regexp, TRUE, status) == FALSE) { 1266 return 0; 1267 } 1268 if (replacementText == NULL) { 1269 *status = U_ILLEGAL_ARGUMENT_ERROR; 1270 return 0; 1271 } 1272 1273 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); 1274 return dest; 1275} 1276 1277 1278//------------------------------------------------------------------------------ 1279// 1280// uregex_appendReplacement 1281// 1282//------------------------------------------------------------------------------ 1283 1284U_NAMESPACE_BEGIN 1285// 1286// Dummy class, because these functions need to be friends of class RegexMatcher, 1287// and stand-alone C functions don't work as friends 1288// 1289class RegexCImpl { 1290 public: 1291 inline static int32_t appendReplacement(RegularExpression *regexp, 1292 const UChar *replacementText, 1293 int32_t replacementLength, 1294 UChar **destBuf, 1295 int32_t *destCapacity, 1296 UErrorCode *status); 1297 1298 inline static int32_t appendTail(RegularExpression *regexp, 1299 UChar **destBuf, 1300 int32_t *destCapacity, 1301 UErrorCode *status); 1302 1303 inline static int32_t split(RegularExpression *regexp, 1304 UChar *destBuf, 1305 int32_t destCapacity, 1306 int32_t *requiredCapacity, 1307 UChar *destFields[], 1308 int32_t destFieldsCapacity, 1309 UErrorCode *status); 1310}; 1311 1312U_NAMESPACE_END 1313 1314 1315 1316static const UChar BACKSLASH = 0x5c; 1317static const UChar DOLLARSIGN = 0x24; 1318static const UChar LEFTBRACKET = 0x7b; 1319static const UChar RIGHTBRACKET = 0x7d; 1320 1321// 1322// Move a character to an output buffer, with bounds checking on the index. 1323// Index advances even if capacity is exceeded, for preflight size computations. 1324// This little sequence is used a LOT. 1325// 1326static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1327 if (*idx < bufCapacity) { 1328 buf[*idx] = c; 1329 } 1330 (*idx)++; 1331} 1332 1333 1334// 1335// appendReplacement, the actual implementation. 1336// 1337int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1338 const UChar *replacementText, 1339 int32_t replacementLength, 1340 UChar **destBuf, 1341 int32_t *destCapacity, 1342 UErrorCode *status) { 1343 1344 // If we come in with a buffer overflow error, don't suppress the operation. 1345 // A series of appendReplacements, appendTail need to correctly preflight 1346 // the buffer size when an overflow happens somewhere in the middle. 1347 UBool pendingBufferOverflow = FALSE; 1348 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1349 pendingBufferOverflow = TRUE; 1350 *status = U_ZERO_ERROR; 1351 } 1352 1353 // 1354 // Validate all paramters 1355 // 1356 if (validateRE(regexp, TRUE, status) == FALSE) { 1357 return 0; 1358 } 1359 if (replacementText == NULL || replacementLength < -1 || 1360 destCapacity == NULL || destBuf == NULL || 1361 (*destBuf == NULL && *destCapacity > 0) || 1362 *destCapacity < 0) { 1363 *status = U_ILLEGAL_ARGUMENT_ERROR; 1364 return 0; 1365 } 1366 1367 RegexMatcher *m = regexp->fMatcher; 1368 if (m->fMatch == FALSE) { 1369 *status = U_REGEX_INVALID_STATE; 1370 return 0; 1371 } 1372 1373 UChar *dest = *destBuf; 1374 int32_t capacity = *destCapacity; 1375 int32_t destIdx = 0; 1376 int32_t i; 1377 1378 // If it wasn't supplied by the caller, get the length of the replacement text. 1379 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1380 // the fly and avoid this step. 1381 if (replacementLength == -1) { 1382 replacementLength = u_strlen(replacementText); 1383 } 1384 1385 // Copy input string from the end of previous match to start of current match 1386 if (regexp->fText != NULL) { 1387 int32_t matchStart; 1388 int32_t lastMatchEnd; 1389 if (UTEXT_USES_U16(m->fInputText)) { 1390 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1391 matchStart = (int32_t)m->fMatchStart; 1392 } else { 1393 // !!!: Would like a better way to do this! 1394 UErrorCode tempStatus = U_ZERO_ERROR; 1395 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus); 1396 tempStatus = U_ZERO_ERROR; 1397 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus); 1398 } 1399 for (i=lastMatchEnd; i<matchStart; i++) { 1400 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1401 } 1402 } else { 1403 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1404 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1405 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), 1406 &possibleOverflowError); 1407 } 1408 U_ASSERT(destIdx >= 0); 1409 1410 // scan the replacement text, looking for substitutions ($n) and \escapes. 1411 int32_t replIdx = 0; 1412 while (replIdx < replacementLength && U_SUCCESS(*status)) { 1413 UChar c = replacementText[replIdx]; 1414 replIdx++; 1415 if (c != DOLLARSIGN && c != BACKSLASH) { 1416 // Common case, no substitution, no escaping, 1417 // just copy the char to the dest buf. 1418 appendToBuf(c, &destIdx, dest, capacity); 1419 continue; 1420 } 1421 1422 if (c == BACKSLASH) { 1423 // Backslash Escape. Copy the following char out without further checks. 1424 // Note: Surrogate pairs don't need any special handling 1425 // The second half wont be a '$' or a '\', and 1426 // will move to the dest normally on the next 1427 // loop iteration. 1428 if (replIdx >= replacementLength) { 1429 break; 1430 } 1431 c = replacementText[replIdx]; 1432 1433 if (c==0x55/*U*/ || c==0x75/*u*/) { 1434 // We have a \udddd or \Udddddddd escape sequence. 1435 UChar32 escapedChar = 1436 u_unescapeAt(uregex_ucstr_unescape_charAt, 1437 &replIdx, // Index is updated by unescapeAt 1438 replacementLength, // Length of replacement text 1439 (void *)replacementText); 1440 1441 if (escapedChar != (UChar32)0xFFFFFFFF) { 1442 if (escapedChar <= 0xffff) { 1443 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1444 } else { 1445 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1446 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1447 } 1448 continue; 1449 } 1450 // Note: if the \u escape was invalid, just fall through and 1451 // treat it as a plain \<anything> escape. 1452 } 1453 1454 // Plain backslash escape. Just put out the escaped character. 1455 appendToBuf(c, &destIdx, dest, capacity); 1456 1457 replIdx++; 1458 continue; 1459 } 1460 1461 // We've got a $. Pick up the following capture group name or number. 1462 // For numbers, consume only digits that produce a valid capture group for the pattern. 1463 1464 int32_t groupNum = 0; 1465 U_ASSERT(c == DOLLARSIGN); 1466 UChar32 c32; 1467 U16_GET(replacementText, 0, replIdx, replacementLength, c32); 1468 if (u_isdigit(c32)) { 1469 int32_t numDigits = 0; 1470 int32_t numCaptureGroups = m->fPattern->fGroupMap->size(); 1471 for (;;) { 1472 if (replIdx >= replacementLength) { 1473 break; 1474 } 1475 U16_GET(replacementText, 0, replIdx, replacementLength, c32); 1476 if (u_isdigit(c32) == FALSE) { 1477 break; 1478 } 1479 1480 int32_t digitVal = u_charDigitValue(c32); 1481 if (groupNum * 10 + digitVal <= numCaptureGroups) { 1482 groupNum = groupNum * 10 + digitVal; 1483 U16_FWD_1(replacementText, replIdx, replacementLength); 1484 numDigits++; 1485 } else { 1486 if (numDigits == 0) { 1487 *status = U_INDEX_OUTOFBOUNDS_ERROR; 1488 } 1489 break; 1490 } 1491 } 1492 } else if (c32 == LEFTBRACKET) { 1493 // Scan for Named Capture Group, ${name}. 1494 UnicodeString groupName; 1495 U16_FWD_1(replacementText, replIdx, replacementLength); 1496 while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { 1497 if (replIdx >= replacementLength) { 1498 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1499 break; 1500 } 1501 U16_NEXT(replacementText, replIdx, replacementLength, c32); 1502 if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z 1503 (c32 >= 0x61 && c32 <= 0x7a) || // a..z 1504 (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 1505 groupName.append(c32); 1506 } else if (c32 == RIGHTBRACKET) { 1507 groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName); 1508 if (groupNum == 0) { 1509 // Name not defined by pattern. 1510 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1511 } 1512 } else { 1513 // Character was something other than a name char or a closing '}' 1514 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1515 } 1516 } 1517 } else { 1518 // $ not followed by {name} or digits. 1519 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1520 } 1521 1522 1523 // Finally, append the capture group data to the destination. 1524 if (U_SUCCESS(*status)) { 1525 destIdx += uregex_group((URegularExpression*)regexp, groupNum, 1526 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1527 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1528 // Ignore buffer overflow when extracting the group. We need to 1529 // continue on to get full size of the untruncated result. We will 1530 // raise our own buffer overflow error at the end. 1531 *status = U_ZERO_ERROR; 1532 } 1533 } 1534 1535 if (U_FAILURE(*status)) { 1536 // bad group number or name. 1537 break; 1538 } 1539 } 1540 1541 // 1542 // Nul Terminate the dest buffer if possible. 1543 // Set the appropriate buffer overflow or not terminated error, if needed. 1544 // 1545 if (destIdx < capacity) { 1546 dest[destIdx] = 0; 1547 } else if (U_SUCCESS(*status)) { 1548 if (destIdx == *destCapacity) { 1549 *status = U_STRING_NOT_TERMINATED_WARNING; 1550 } else { 1551 *status = U_BUFFER_OVERFLOW_ERROR; 1552 } 1553 } 1554 1555 // 1556 // Return an updated dest buffer and capacity to the caller. 1557 // 1558 if (destIdx > 0 && *destCapacity > 0) { 1559 if (destIdx < capacity) { 1560 *destBuf += destIdx; 1561 *destCapacity -= destIdx; 1562 } else { 1563 *destBuf += capacity; 1564 *destCapacity = 0; 1565 } 1566 } 1567 1568 // If we came in with a buffer overflow, make sure we go out with one also. 1569 // (A zero length match right at the end of the previous match could 1570 // make this function succeed even though a previous call had overflowed the buf) 1571 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1572 *status = U_BUFFER_OVERFLOW_ERROR; 1573 } 1574 1575 return destIdx; 1576} 1577 1578// 1579// appendReplacement the actual API function, 1580// 1581U_CAPI int32_t U_EXPORT2 1582uregex_appendReplacement(URegularExpression *regexp2, 1583 const UChar *replacementText, 1584 int32_t replacementLength, 1585 UChar **destBuf, 1586 int32_t *destCapacity, 1587 UErrorCode *status) { 1588 1589 RegularExpression *regexp = (RegularExpression*)regexp2; 1590 return RegexCImpl::appendReplacement( 1591 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1592} 1593 1594// 1595// uregex_appendReplacementUText...can just use the normal C++ method 1596// 1597U_CAPI void U_EXPORT2 1598uregex_appendReplacementUText(URegularExpression *regexp2, 1599 UText *replText, 1600 UText *dest, 1601 UErrorCode *status) { 1602 RegularExpression *regexp = (RegularExpression*)regexp2; 1603 regexp->fMatcher->appendReplacement(dest, replText, *status); 1604} 1605 1606 1607//------------------------------------------------------------------------------ 1608// 1609// uregex_appendTail 1610// 1611//------------------------------------------------------------------------------ 1612int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1613 UChar **destBuf, 1614 int32_t *destCapacity, 1615 UErrorCode *status) 1616{ 1617 1618 // If we come in with a buffer overflow error, don't suppress the operation. 1619 // A series of appendReplacements, appendTail need to correctly preflight 1620 // the buffer size when an overflow happens somewhere in the middle. 1621 UBool pendingBufferOverflow = FALSE; 1622 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1623 pendingBufferOverflow = TRUE; 1624 *status = U_ZERO_ERROR; 1625 } 1626 1627 if (validateRE(regexp, TRUE, status) == FALSE) { 1628 return 0; 1629 } 1630 1631 if (destCapacity == NULL || destBuf == NULL || 1632 (*destBuf == NULL && *destCapacity > 0) || 1633 *destCapacity < 0) 1634 { 1635 *status = U_ILLEGAL_ARGUMENT_ERROR; 1636 return 0; 1637 } 1638 1639 RegexMatcher *m = regexp->fMatcher; 1640 1641 int32_t destIdx = 0; 1642 int32_t destCap = *destCapacity; 1643 UChar *dest = *destBuf; 1644 1645 if (regexp->fText != NULL) { 1646 int32_t srcIdx; 1647 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1648 if (nativeIdx == -1) { 1649 srcIdx = 0; 1650 } else if (UTEXT_USES_U16(m->fInputText)) { 1651 srcIdx = (int32_t)nativeIdx; 1652 } else { 1653 UErrorCode status = U_ZERO_ERROR; 1654 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1655 } 1656 1657 for (;;) { 1658 U_ASSERT(destIdx >= 0); 1659 1660 if (srcIdx == regexp->fTextLength) { 1661 break; 1662 } 1663 UChar c = regexp->fText[srcIdx]; 1664 if (c == 0 && regexp->fTextLength == -1) { 1665 regexp->fTextLength = srcIdx; 1666 break; 1667 } 1668 1669 if (destIdx < destCap) { 1670 dest[destIdx] = c; 1671 } else { 1672 // We've overflowed the dest buffer. 1673 // If the total input string length is known, we can 1674 // compute the total buffer size needed without scanning through the string. 1675 if (regexp->fTextLength > 0) { 1676 destIdx += (regexp->fTextLength - srcIdx); 1677 break; 1678 } 1679 } 1680 srcIdx++; 1681 destIdx++; 1682 } 1683 } else { 1684 int64_t srcIdx; 1685 if (m->fMatch) { 1686 // The most recent call to find() succeeded. 1687 srcIdx = m->fMatchEnd; 1688 } else { 1689 // The last call to find() on this matcher failed(). 1690 // Look back to the end of the last find() that succeeded for src index. 1691 srcIdx = m->fLastMatchEnd; 1692 if (srcIdx == -1) { 1693 // There has been no successful match with this matcher. 1694 // We want to copy the whole string. 1695 srcIdx = 0; 1696 } 1697 } 1698 1699 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1700 } 1701 1702 // 1703 // NUL terminate the output string, if possible, otherwise issue the 1704 // appropriate error or warning. 1705 // 1706 if (destIdx < destCap) { 1707 dest[destIdx] = 0; 1708 } else if (destIdx == destCap) { 1709 *status = U_STRING_NOT_TERMINATED_WARNING; 1710 } else { 1711 *status = U_BUFFER_OVERFLOW_ERROR; 1712 } 1713 1714 // 1715 // Update the user's buffer ptr and capacity vars to reflect the 1716 // amount used. 1717 // 1718 if (destIdx < destCap) { 1719 *destBuf += destIdx; 1720 *destCapacity -= destIdx; 1721 } else if (*destBuf != NULL) { 1722 *destBuf += destCap; 1723 *destCapacity = 0; 1724 } 1725 1726 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1727 *status = U_BUFFER_OVERFLOW_ERROR; 1728 } 1729 1730 return destIdx; 1731} 1732 1733 1734// 1735// appendTail the actual API function 1736// 1737U_CAPI int32_t U_EXPORT2 1738uregex_appendTail(URegularExpression *regexp2, 1739 UChar **destBuf, 1740 int32_t *destCapacity, 1741 UErrorCode *status) { 1742 RegularExpression *regexp = (RegularExpression*)regexp2; 1743 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1744} 1745 1746 1747// 1748// uregex_appendTailUText...can just use the normal C++ method 1749// 1750U_CAPI UText * U_EXPORT2 1751uregex_appendTailUText(URegularExpression *regexp2, 1752 UText *dest, 1753 UErrorCode *status) { 1754 RegularExpression *regexp = (RegularExpression*)regexp2; 1755 return regexp->fMatcher->appendTail(dest, *status); 1756} 1757 1758 1759//------------------------------------------------------------------------------ 1760// 1761// copyString Internal utility to copy a string to an output buffer, 1762// while managing buffer overflow and preflight size 1763// computation. NUL termination is added to destination, 1764// and the NUL is counted in the output size. 1765// 1766//------------------------------------------------------------------------------ 1767#if 0 1768static void copyString(UChar *destBuffer, // Destination buffer. 1769 int32_t destCapacity, // Total capacity of dest buffer 1770 int32_t *destIndex, // Index into dest buffer. Updated on return. 1771 // Update not clipped to destCapacity. 1772 const UChar *srcPtr, // Pointer to source string 1773 int32_t srcLen) // Source string len. 1774{ 1775 int32_t si; 1776 int32_t di = *destIndex; 1777 UChar c; 1778 1779 for (si=0; si<srcLen; si++) { 1780 c = srcPtr[si]; 1781 if (di < destCapacity) { 1782 destBuffer[di] = c; 1783 di++; 1784 } else { 1785 di += srcLen - si; 1786 break; 1787 } 1788 } 1789 if (di<destCapacity) { 1790 destBuffer[di] = 0; 1791 } 1792 di++; 1793 *destIndex = di; 1794} 1795#endif 1796 1797//------------------------------------------------------------------------------ 1798// 1799// uregex_split 1800// 1801//------------------------------------------------------------------------------ 1802int32_t RegexCImpl::split(RegularExpression *regexp, 1803 UChar *destBuf, 1804 int32_t destCapacity, 1805 int32_t *requiredCapacity, 1806 UChar *destFields[], 1807 int32_t destFieldsCapacity, 1808 UErrorCode *status) { 1809 // 1810 // Reset for the input text 1811 // 1812 regexp->fMatcher->reset(); 1813 UText *inputText = regexp->fMatcher->fInputText; 1814 int64_t nextOutputStringStart = 0; 1815 int64_t inputLen = regexp->fMatcher->fInputLength; 1816 if (inputLen == 0) { 1817 return 0; 1818 } 1819 1820 // 1821 // Loop through the input text, searching for the delimiter pattern 1822 // 1823 int32_t i; // Index of the field being processed. 1824 int32_t destIdx = 0; // Next available position in destBuf; 1825 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1826 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1827 for (i=0; ; i++) { 1828 if (i>=destFieldsCapacity-1) { 1829 // There are one or zero output strings left. 1830 // Fill the last output string with whatever is left from the input, then exit the loop. 1831 // ( i will be == destFieldsCapacity if we filled the output array while processing 1832 // capture groups of the delimiter expression, in which case we will discard the 1833 // last capture group saved in favor of the unprocessed remainder of the 1834 // input string.) 1835 if (inputLen > nextOutputStringStart) { 1836 if (i != destFieldsCapacity-1) { 1837 // No fields are left. Recycle the last one for holding the trailing part of 1838 // the input string. 1839 i = destFieldsCapacity-1; 1840 destIdx = (int32_t)(destFields[i] - destFields[0]); 1841 } 1842 1843 destFields[i] = &destBuf[destIdx]; 1844 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1845 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1846 } 1847 break; 1848 } 1849 1850 if (regexp->fMatcher->find()) { 1851 // We found another delimiter. Move everything from where we started looking 1852 // up until the start of the delimiter into the next output string. 1853 destFields[i] = &destBuf[destIdx]; 1854 1855 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1856 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1857 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1858 tStatus = U_ZERO_ERROR; 1859 } else { 1860 *status = tStatus; 1861 } 1862 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1863 1864 // If the delimiter pattern has capturing parentheses, the captured 1865 // text goes out into the next n destination strings. 1866 int32_t groupNum; 1867 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1868 // If we've run out of output string slots, bail out. 1869 if (i==destFieldsCapacity-1) { 1870 break; 1871 } 1872 i++; 1873 1874 // Set up to extract the capture group contents into the dest buffer. 1875 destFields[i] = &destBuf[destIdx]; 1876 tStatus = U_ZERO_ERROR; 1877 int32_t t = uregex_group((URegularExpression*)regexp, 1878 groupNum, 1879 destFields[i], 1880 REMAINING_CAPACITY(destIdx, destCapacity), 1881 &tStatus); 1882 destIdx += t + 1; // Record the space used in the output string buffer. 1883 // +1 for the NUL that terminates the string. 1884 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1885 tStatus = U_ZERO_ERROR; 1886 } else { 1887 *status = tStatus; 1888 } 1889 } 1890 1891 if (nextOutputStringStart == inputLen) { 1892 // The delimiter was at the end of the string. 1893 // Output an empty string, and then we are done. 1894 if (destIdx < destCapacity) { 1895 destBuf[destIdx] = 0; 1896 } 1897 if (i < destFieldsCapacity-1) { 1898 ++i; 1899 } 1900 if (destIdx < destCapacity) { 1901 destFields[i] = destBuf + destIdx; 1902 } 1903 ++destIdx; 1904 break; 1905 } 1906 1907 } 1908 else 1909 { 1910 // We ran off the end of the input while looking for the next delimiter. 1911 // All the remaining text goes into the current output string. 1912 destFields[i] = &destBuf[destIdx]; 1913 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1914 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1915 break; 1916 } 1917 } 1918 1919 // Zero out any unused portion of the destFields array 1920 int j; 1921 for (j=i+1; j<destFieldsCapacity; j++) { 1922 destFields[j] = NULL; 1923 } 1924 1925 if (requiredCapacity != NULL) { 1926 *requiredCapacity = destIdx; 1927 } 1928 if (destIdx > destCapacity) { 1929 *status = U_BUFFER_OVERFLOW_ERROR; 1930 } 1931 return i+1; 1932} 1933 1934// 1935// uregex_split The actual API function 1936// 1937U_CAPI int32_t U_EXPORT2 1938uregex_split(URegularExpression *regexp2, 1939 UChar *destBuf, 1940 int32_t destCapacity, 1941 int32_t *requiredCapacity, 1942 UChar *destFields[], 1943 int32_t destFieldsCapacity, 1944 UErrorCode *status) { 1945 RegularExpression *regexp = (RegularExpression*)regexp2; 1946 if (validateRE(regexp, TRUE, status) == FALSE) { 1947 return 0; 1948 } 1949 if ((destBuf == NULL && destCapacity > 0) || 1950 destCapacity < 0 || 1951 destFields == NULL || 1952 destFieldsCapacity < 1 ) { 1953 *status = U_ILLEGAL_ARGUMENT_ERROR; 1954 return 0; 1955 } 1956 1957 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1958} 1959 1960 1961// 1962// uregex_splitUText...can just use the normal C++ method 1963// 1964U_CAPI int32_t U_EXPORT2 1965uregex_splitUText(URegularExpression *regexp2, 1966 UText *destFields[], 1967 int32_t destFieldsCapacity, 1968 UErrorCode *status) { 1969 RegularExpression *regexp = (RegularExpression*)regexp2; 1970 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1971} 1972 1973 1974#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1975 1976