1/* 2******************************************************************************* 3* Copyright (C) 2004-2013, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: uregex.cpp 7*/ 8 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13#include "unicode/regex.h" 14#include "unicode/uregex.h" 15#include "unicode/unistr.h" 16#include "unicode/ustring.h" 17#include "unicode/uchar.h" 18#include "unicode/uobject.h" 19#include "unicode/utf16.h" 20#include "umutex.h" 21#include "uassert.h" 22#include "cmemory.h" 23 24#include "regextxt.h" 25 26#include <stdio.h> 27 28U_NAMESPACE_BEGIN 29 30#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 31 32struct RegularExpression: public UMemory { 33public: 34 RegularExpression(); 35 ~RegularExpression(); 36 int32_t fMagic; 37 RegexPattern *fPat; 38 u_atomic_int32_t *fPatRefCount; 39 UChar *fPatString; 40 int32_t fPatStringLen; 41 RegexMatcher *fMatcher; 42 const UChar *fText; // Text from setText() 43 int32_t fTextLength; // Length provided by user with setText(), which 44 // may be -1. 
45 UBool fOwnsText; 46}; 47 48static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 49 50RegularExpression::RegularExpression() { 51 fMagic = REXP_MAGIC; 52 fPat = NULL; 53 fPatRefCount = NULL; 54 fPatString = NULL; 55 fPatStringLen = 0; 56 fMatcher = NULL; 57 fText = NULL; 58 fTextLength = 0; 59 fOwnsText = FALSE; 60} 61 62RegularExpression::~RegularExpression() { 63 delete fMatcher; 64 fMatcher = NULL; 65 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 66 delete fPat; 67 uprv_free(fPatString); 68 uprv_free((void *)fPatRefCount); 69 } 70 if (fOwnsText && fText!=NULL) { 71 uprv_free((void *)fText); 72 } 73 fMagic = 0; 74} 75 76U_NAMESPACE_END 77 78U_NAMESPACE_USE 79 80//---------------------------------------------------------------------------------------- 81// 82// validateRE Do boilerplate style checks on API function parameters. 83// Return TRUE if they look OK. 84//---------------------------------------------------------------------------------------- 85static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { 86 if (U_FAILURE(*status)) { 87 return FALSE; 88 } 89 if (re == NULL || re->fMagic != REXP_MAGIC) { 90 *status = U_ILLEGAL_ARGUMENT_ERROR; 91 return FALSE; 92 } 93 // !!! 
Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 94 if (requiresText && re->fText == NULL && !re->fOwnsText) { 95 *status = U_REGEX_INVALID_STATE; 96 return FALSE; 97 } 98 return TRUE; 99} 100 101//---------------------------------------------------------------------------------------- 102// 103// uregex_open 104// 105//---------------------------------------------------------------------------------------- 106U_CAPI URegularExpression * U_EXPORT2 107uregex_open( const UChar *pattern, 108 int32_t patternLength, 109 uint32_t flags, 110 UParseError *pe, 111 UErrorCode *status) { 112 113 if (U_FAILURE(*status)) { 114 return NULL; 115 } 116 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 117 *status = U_ILLEGAL_ARGUMENT_ERROR; 118 return NULL; 119 } 120 int32_t actualPatLen = patternLength; 121 if (actualPatLen == -1) { 122 actualPatLen = u_strlen(pattern); 123 } 124 125 RegularExpression *re = new RegularExpression; 126 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); 127 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 128 if (re == NULL || refC == NULL || patBuf == NULL) { 129 *status = U_MEMORY_ALLOCATION_ERROR; 130 delete re; 131 uprv_free((void *)refC); 132 uprv_free(patBuf); 133 return NULL; 134 } 135 re->fPatRefCount = refC; 136 *re->fPatRefCount = 1; 137 138 // 139 // Make a copy of the pattern string, so we can return it later if asked. 140 // For compiling the pattern, we will use a UText wrapper around 141 // this local copy, to avoid making even more copies. 
142 // 143 re->fPatString = patBuf; 144 re->fPatStringLen = patternLength; 145 u_memcpy(patBuf, pattern, actualPatLen); 146 patBuf[actualPatLen] = 0; 147 148 UText patText = UTEXT_INITIALIZER; 149 utext_openUChars(&patText, patBuf, patternLength, status); 150 151 // 152 // Compile the pattern 153 // 154 if (pe != NULL) { 155 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 156 } else { 157 re->fPat = RegexPattern::compile(&patText, flags, *status); 158 } 159 utext_close(&patText); 160 161 if (U_FAILURE(*status)) { 162 goto ErrorExit; 163 } 164 165 // 166 // Create the matcher object 167 // 168 re->fMatcher = re->fPat->matcher(*status); 169 if (U_SUCCESS(*status)) { 170 return (URegularExpression*)re; 171 } 172 173ErrorExit: 174 delete re; 175 return NULL; 176 177} 178 179//---------------------------------------------------------------------------------------- 180// 181// uregex_openUText 182// 183//---------------------------------------------------------------------------------------- 184U_CAPI URegularExpression * U_EXPORT2 185uregex_openUText(UText *pattern, 186 uint32_t flags, 187 UParseError *pe, 188 UErrorCode *status) { 189 190 if (U_FAILURE(*status)) { 191 return NULL; 192 } 193 if (pattern == NULL) { 194 *status = U_ILLEGAL_ARGUMENT_ERROR; 195 return NULL; 196 } 197 198 int64_t patternNativeLength = utext_nativeLength(pattern); 199 200 if (patternNativeLength == 0) { 201 *status = U_ILLEGAL_ARGUMENT_ERROR; 202 return NULL; 203 } 204 205 RegularExpression *re = new RegularExpression; 206 207 UErrorCode lengthStatus = U_ZERO_ERROR; 208 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 209 210 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); 211 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 212 if (re == NULL || refC == NULL || patBuf == NULL) { 213 *status = U_MEMORY_ALLOCATION_ERROR; 214 delete re; 215 uprv_free((void *)refC); 216 
uprv_free(patBuf); 217 return NULL; 218 } 219 re->fPatRefCount = refC; 220 *re->fPatRefCount = 1; 221 222 // 223 // Make a copy of the pattern string, so we can return it later if asked. 224 // For compiling the pattern, we will use a read-only UText wrapper 225 // around this local copy, to avoid making even more copies. 226 // 227 re->fPatString = patBuf; 228 re->fPatStringLen = pattern16Length; 229 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 230 231 UText patText = UTEXT_INITIALIZER; 232 utext_openUChars(&patText, patBuf, pattern16Length, status); 233 234 // 235 // Compile the pattern 236 // 237 if (pe != NULL) { 238 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 239 } else { 240 re->fPat = RegexPattern::compile(&patText, flags, *status); 241 } 242 utext_close(&patText); 243 244 if (U_FAILURE(*status)) { 245 goto ErrorExit; 246 } 247 248 // 249 // Create the matcher object 250 // 251 re->fMatcher = re->fPat->matcher(*status); 252 if (U_SUCCESS(*status)) { 253 return (URegularExpression*)re; 254 } 255 256ErrorExit: 257 delete re; 258 return NULL; 259 260} 261 262//---------------------------------------------------------------------------------------- 263// 264// uregex_close 265// 266//---------------------------------------------------------------------------------------- 267U_CAPI void U_EXPORT2 268uregex_close(URegularExpression *re2) { 269 RegularExpression *re = (RegularExpression*)re2; 270 UErrorCode status = U_ZERO_ERROR; 271 if (validateRE(re, FALSE, &status) == FALSE) { 272 return; 273 } 274 delete re; 275} 276 277 278//---------------------------------------------------------------------------------------- 279// 280// uregex_clone 281// 282//---------------------------------------------------------------------------------------- 283U_CAPI URegularExpression * U_EXPORT2 284uregex_clone(const URegularExpression *source2, UErrorCode *status) { 285 RegularExpression *source = 
(RegularExpression*)source2; 286 if (validateRE(source, FALSE, status) == FALSE) { 287 return NULL; 288 } 289 290 RegularExpression *clone = new RegularExpression; 291 if (clone == NULL) { 292 *status = U_MEMORY_ALLOCATION_ERROR; 293 return NULL; 294 } 295 296 clone->fMatcher = source->fPat->matcher(*status); 297 if (U_FAILURE(*status)) { 298 delete clone; 299 return NULL; 300 } 301 302 clone->fPat = source->fPat; 303 clone->fPatRefCount = source->fPatRefCount; 304 clone->fPatString = source->fPatString; 305 clone->fPatStringLen = source->fPatStringLen; 306 umtx_atomic_inc(source->fPatRefCount); 307 // Note: fText is not cloned. 308 309 return (URegularExpression*)clone; 310} 311 312 313 314 315//------------------------------------------------------------------------------ 316// 317// uregex_pattern 318// 319//------------------------------------------------------------------------------ 320U_CAPI const UChar * U_EXPORT2 321uregex_pattern(const URegularExpression *regexp2, 322 int32_t *patLength, 323 UErrorCode *status) { 324 RegularExpression *regexp = (RegularExpression*)regexp2; 325 326 if (validateRE(regexp, FALSE, status) == FALSE) { 327 return NULL; 328 } 329 if (patLength != NULL) { 330 *patLength = regexp->fPatStringLen; 331 } 332 return regexp->fPatString; 333} 334 335 336//------------------------------------------------------------------------------ 337// 338// uregex_patternUText 339// 340//------------------------------------------------------------------------------ 341U_CAPI UText * U_EXPORT2 342uregex_patternUText(const URegularExpression *regexp2, 343 UErrorCode *status) { 344 RegularExpression *regexp = (RegularExpression*)regexp2; 345 return regexp->fPat->patternText(*status); 346} 347 348 349//------------------------------------------------------------------------------ 350// 351// uregex_flags 352// 353//------------------------------------------------------------------------------ 354U_CAPI int32_t U_EXPORT2 355uregex_flags(const 
URegularExpression *regexp2, UErrorCode *status) { 356 RegularExpression *regexp = (RegularExpression*)regexp2; 357 if (validateRE(regexp, FALSE, status) == FALSE) { 358 return 0; 359 } 360 int32_t flags = regexp->fPat->flags(); 361 return flags; 362} 363 364 365//------------------------------------------------------------------------------ 366// 367// uregex_setText 368// 369//------------------------------------------------------------------------------ 370U_CAPI void U_EXPORT2 371uregex_setText(URegularExpression *regexp2, 372 const UChar *text, 373 int32_t textLength, 374 UErrorCode *status) { 375 RegularExpression *regexp = (RegularExpression*)regexp2; 376 if (validateRE(regexp, FALSE, status) == FALSE) { 377 return; 378 } 379 if (text == NULL || textLength < -1) { 380 *status = U_ILLEGAL_ARGUMENT_ERROR; 381 return; 382 } 383 384 if (regexp->fOwnsText && regexp->fText != NULL) { 385 uprv_free((void *)regexp->fText); 386 } 387 388 regexp->fText = text; 389 regexp->fTextLength = textLength; 390 regexp->fOwnsText = FALSE; 391 392 UText input = UTEXT_INITIALIZER; 393 utext_openUChars(&input, text, textLength, status); 394 regexp->fMatcher->reset(&input); 395 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 396} 397 398 399//------------------------------------------------------------------------------ 400// 401// uregex_setUText 402// 403//------------------------------------------------------------------------------ 404U_CAPI void U_EXPORT2 405uregex_setUText(URegularExpression *regexp2, 406 UText *text, 407 UErrorCode *status) { 408 RegularExpression *regexp = (RegularExpression*)regexp2; 409 if (validateRE(regexp, FALSE, status) == FALSE) { 410 return; 411 } 412 if (text == NULL) { 413 *status = U_ILLEGAL_ARGUMENT_ERROR; 414 return; 415 } 416 417 if (regexp->fOwnsText && regexp->fText != NULL) { 418 uprv_free((void *)regexp->fText); 419 } 420 421 regexp->fText = NULL; // only fill it in on request 422 regexp->fTextLength = -1; 
423 regexp->fOwnsText = TRUE; 424 regexp->fMatcher->reset(text); 425} 426 427 428 429//------------------------------------------------------------------------------ 430// 431// uregex_getText 432// 433//------------------------------------------------------------------------------ 434U_CAPI const UChar * U_EXPORT2 435uregex_getText(URegularExpression *regexp2, 436 int32_t *textLength, 437 UErrorCode *status) { 438 RegularExpression *regexp = (RegularExpression*)regexp2; 439 if (validateRE(regexp, FALSE, status) == FALSE) { 440 return NULL; 441 } 442 443 if (regexp->fText == NULL) { 444 // need to fill in the text 445 UText *inputText = regexp->fMatcher->inputText(); 446 int64_t inputNativeLength = utext_nativeLength(inputText); 447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 448 regexp->fText = inputText->chunkContents; 449 regexp->fTextLength = (int32_t)inputNativeLength; 450 regexp->fOwnsText = FALSE; // because the UText owns it 451 } else { 452 UErrorCode lengthStatus = U_ZERO_ERROR; 453 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 454 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 455 456 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 457 regexp->fText = inputChars; 458 regexp->fOwnsText = TRUE; // should already be set but just in case 459 } 460 } 461 462 if (textLength != NULL) { 463 *textLength = regexp->fTextLength; 464 } 465 return regexp->fText; 466} 467 468 469//------------------------------------------------------------------------------ 470// 471// uregex_getUText 472// 473//------------------------------------------------------------------------------ 474U_CAPI UText * U_EXPORT2 475uregex_getUText(URegularExpression *regexp2, 476 UText *dest, 477 UErrorCode *status) { 478 RegularExpression *regexp = (RegularExpression*)regexp2; 479 if (validateRE(regexp, FALSE, status) == FALSE) { 
480 return dest; 481 } 482 return regexp->fMatcher->getInput(dest, *status); 483} 484 485 486//------------------------------------------------------------------------------ 487// 488// uregex_refreshUText 489// 490//------------------------------------------------------------------------------ 491U_CAPI void U_EXPORT2 492uregex_refreshUText(URegularExpression *regexp2, 493 UText *text, 494 UErrorCode *status) { 495 RegularExpression *regexp = (RegularExpression*)regexp2; 496 if (validateRE(regexp, FALSE, status) == FALSE) { 497 return; 498 } 499 regexp->fMatcher->refreshInputText(text, *status); 500} 501 502 503//------------------------------------------------------------------------------ 504// 505// uregex_matches 506// 507//------------------------------------------------------------------------------ 508U_CAPI UBool U_EXPORT2 509uregex_matches(URegularExpression *regexp2, 510 int32_t startIndex, 511 UErrorCode *status) { 512 return uregex_matches64( regexp2, (int64_t)startIndex, status); 513} 514 515U_CAPI UBool U_EXPORT2 516uregex_matches64(URegularExpression *regexp2, 517 int64_t startIndex, 518 UErrorCode *status) { 519 RegularExpression *regexp = (RegularExpression*)regexp2; 520 UBool result = FALSE; 521 if (validateRE(regexp, TRUE, status) == FALSE) { 522 return result; 523 } 524 if (startIndex == -1) { 525 result = regexp->fMatcher->matches(*status); 526 } else { 527 result = regexp->fMatcher->matches(startIndex, *status); 528 } 529 return result; 530} 531 532 533//------------------------------------------------------------------------------ 534// 535// uregex_lookingAt 536// 537//------------------------------------------------------------------------------ 538U_CAPI UBool U_EXPORT2 539uregex_lookingAt(URegularExpression *regexp2, 540 int32_t startIndex, 541 UErrorCode *status) { 542 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); 543} 544 545U_CAPI UBool U_EXPORT2 546uregex_lookingAt64(URegularExpression *regexp2, 547 int64_t 
startIndex, 548 UErrorCode *status) { 549 RegularExpression *regexp = (RegularExpression*)regexp2; 550 UBool result = FALSE; 551 if (validateRE(regexp, TRUE, status) == FALSE) { 552 return result; 553 } 554 if (startIndex == -1) { 555 result = regexp->fMatcher->lookingAt(*status); 556 } else { 557 result = regexp->fMatcher->lookingAt(startIndex, *status); 558 } 559 return result; 560} 561 562 563 564//------------------------------------------------------------------------------ 565// 566// uregex_find 567// 568//------------------------------------------------------------------------------ 569U_CAPI UBool U_EXPORT2 570uregex_find(URegularExpression *regexp2, 571 int32_t startIndex, 572 UErrorCode *status) { 573 return uregex_find64( regexp2, (int64_t)startIndex, status); 574} 575 576U_CAPI UBool U_EXPORT2 577uregex_find64(URegularExpression *regexp2, 578 int64_t startIndex, 579 UErrorCode *status) { 580 RegularExpression *regexp = (RegularExpression*)regexp2; 581 UBool result = FALSE; 582 if (validateRE(regexp, TRUE, status) == FALSE) { 583 return result; 584 } 585 if (startIndex == -1) { 586 regexp->fMatcher->resetPreserveRegion(); 587 result = regexp->fMatcher->find(); 588 } else { 589 result = regexp->fMatcher->find(startIndex, *status); 590 } 591 return result; 592} 593 594 595//------------------------------------------------------------------------------ 596// 597// uregex_findNext 598// 599//------------------------------------------------------------------------------ 600U_CAPI UBool U_EXPORT2 601uregex_findNext(URegularExpression *regexp2, 602 UErrorCode *status) { 603 RegularExpression *regexp = (RegularExpression*)regexp2; 604 if (validateRE(regexp, TRUE, status) == FALSE) { 605 return FALSE; 606 } 607 UBool result = regexp->fMatcher->find(); 608 return result; 609} 610 611//------------------------------------------------------------------------------ 612// 613// uregex_groupCount 614// 
615//------------------------------------------------------------------------------ 616U_CAPI int32_t U_EXPORT2 617uregex_groupCount(URegularExpression *regexp2, 618 UErrorCode *status) { 619 RegularExpression *regexp = (RegularExpression*)regexp2; 620 if (validateRE(regexp, FALSE, status) == FALSE) { 621 return 0; 622 } 623 int32_t result = regexp->fMatcher->groupCount(); 624 return result; 625} 626 627 628//------------------------------------------------------------------------------ 629// 630// uregex_group 631// 632//------------------------------------------------------------------------------ 633U_CAPI int32_t U_EXPORT2 634uregex_group(URegularExpression *regexp2, 635 int32_t groupNum, 636 UChar *dest, 637 int32_t destCapacity, 638 UErrorCode *status) { 639 RegularExpression *regexp = (RegularExpression*)regexp2; 640 if (validateRE(regexp, TRUE, status) == FALSE) { 641 return 0; 642 } 643 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 644 *status = U_ILLEGAL_ARGUMENT_ERROR; 645 return 0; 646 } 647 648 if (destCapacity == 0 || regexp->fText != NULL) { 649 // If preflighting or if we already have the text as UChars, 650 // this is a little cheaper than going through uregex_groupUTextDeep() 651 652 // 653 // Pick up the range of characters from the matcher 654 // 655 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 656 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 657 if (U_FAILURE(*status)) { 658 return 0; 659 } 660 661 // 662 // Trim length based on buffer capacity 663 // 664 int32_t fullLength = endIx - startIx; 665 int32_t copyLength = fullLength; 666 if (copyLength < destCapacity) { 667 dest[copyLength] = 0; 668 } else if (copyLength == destCapacity) { 669 *status = U_STRING_NOT_TERMINATED_WARNING; 670 } else { 671 copyLength = destCapacity; 672 *status = U_BUFFER_OVERFLOW_ERROR; 673 } 674 675 // 676 // Copy capture group to user's buffer 677 // 678 if (copyLength > 0) { 679 u_memcpy(dest, ®exp->fText[startIx], 
copyLength); 680 } 681 return fullLength; 682 } else { 683 int32_t result = 0; 684 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); 685 if (U_SUCCESS(*status)) { 686 result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); 687 } 688 utext_close(groupText); 689 return result; 690 } 691} 692 693 694//------------------------------------------------------------------------------ 695// 696// uregex_groupUText 697// 698//------------------------------------------------------------------------------ 699U_CAPI UText * U_EXPORT2 700uregex_groupUText(URegularExpression *regexp2, 701 int32_t groupNum, 702 UText *dest, 703 int64_t *groupLength, 704 UErrorCode *status) { 705 RegularExpression *regexp = (RegularExpression*)regexp2; 706 if (validateRE(regexp, TRUE, status) == FALSE) { 707 UErrorCode emptyTextStatus = U_ZERO_ERROR; 708 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 709 } 710 711 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); 712} 713 714//------------------------------------------------------------------------------ 715// 716// uregex_groupUTextDeep 717// 718//------------------------------------------------------------------------------ 719U_CAPI UText * U_EXPORT2 720uregex_groupUTextDeep(URegularExpression *regexp2, 721 int32_t groupNum, 722 UText *dest, 723 UErrorCode *status) { 724 RegularExpression *regexp = (RegularExpression*)regexp2; 725 if (validateRE(regexp, TRUE, status) == FALSE) { 726 UErrorCode emptyTextStatus = U_ZERO_ERROR; 727 return (dest ? 
dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 728 } 729 730 if (regexp->fText != NULL) { 731 // 732 // Pick up the range of characters from the matcher 733 // and use our already-extracted characters 734 // 735 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 736 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 737 if (U_FAILURE(*status)) { 738 UErrorCode emptyTextStatus = U_ZERO_ERROR; 739 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 740 } 741 742 if (dest) { 743 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); 744 } else { 745 UText groupText = UTEXT_INITIALIZER; 746 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); 747 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); 748 utext_close(&groupText); 749 } 750 751 return dest; 752 } else { 753 return regexp->fMatcher->group(groupNum, dest, *status); 754 } 755} 756 757//------------------------------------------------------------------------------ 758// 759// uregex_start 760// 761//------------------------------------------------------------------------------ 762U_CAPI int32_t U_EXPORT2 763uregex_start(URegularExpression *regexp2, 764 int32_t groupNum, 765 UErrorCode *status) { 766 return (int32_t)uregex_start64( regexp2, groupNum, status); 767} 768 769U_CAPI int64_t U_EXPORT2 770uregex_start64(URegularExpression *regexp2, 771 int32_t groupNum, 772 UErrorCode *status) { 773 RegularExpression *regexp = (RegularExpression*)regexp2; 774 if (validateRE(regexp, TRUE, status) == FALSE) { 775 return 0; 776 } 777 int32_t result = regexp->fMatcher->start(groupNum, *status); 778 return result; 779} 780 781//------------------------------------------------------------------------------ 782// 783// uregex_end 784// 785//------------------------------------------------------------------------------ 786U_CAPI int32_t U_EXPORT2 787uregex_end(URegularExpression *regexp2, 788 int32_t 
groupNum, 789 UErrorCode *status) { 790 return (int32_t)uregex_end64( regexp2, groupNum, status); 791} 792 793U_CAPI int64_t U_EXPORT2 794uregex_end64(URegularExpression *regexp2, 795 int32_t groupNum, 796 UErrorCode *status) { 797 RegularExpression *regexp = (RegularExpression*)regexp2; 798 if (validateRE(regexp, TRUE, status) == FALSE) { 799 return 0; 800 } 801 int32_t result = regexp->fMatcher->end(groupNum, *status); 802 return result; 803} 804 805//------------------------------------------------------------------------------ 806// 807// uregex_reset 808// 809//------------------------------------------------------------------------------ 810U_CAPI void U_EXPORT2 811uregex_reset(URegularExpression *regexp2, 812 int32_t index, 813 UErrorCode *status) { 814 uregex_reset64( regexp2, (int64_t)index, status); 815} 816 817U_CAPI void U_EXPORT2 818uregex_reset64(URegularExpression *regexp2, 819 int64_t index, 820 UErrorCode *status) { 821 RegularExpression *regexp = (RegularExpression*)regexp2; 822 if (validateRE(regexp, TRUE, status) == FALSE) { 823 return; 824 } 825 regexp->fMatcher->reset(index, *status); 826} 827 828 829//------------------------------------------------------------------------------ 830// 831// uregex_setRegion 832// 833//------------------------------------------------------------------------------ 834U_CAPI void U_EXPORT2 835uregex_setRegion(URegularExpression *regexp2, 836 int32_t regionStart, 837 int32_t regionLimit, 838 UErrorCode *status) { 839 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); 840} 841 842U_CAPI void U_EXPORT2 843uregex_setRegion64(URegularExpression *regexp2, 844 int64_t regionStart, 845 int64_t regionLimit, 846 UErrorCode *status) { 847 RegularExpression *regexp = (RegularExpression*)regexp2; 848 if (validateRE(regexp, TRUE, status) == FALSE) { 849 return; 850 } 851 regexp->fMatcher->region(regionStart, regionLimit, *status); 852} 853 854 
855//------------------------------------------------------------------------------ 856// 857// uregex_setRegionAndStart 858// 859//------------------------------------------------------------------------------ 860U_CAPI void U_EXPORT2 861uregex_setRegionAndStart(URegularExpression *regexp2, 862 int64_t regionStart, 863 int64_t regionLimit, 864 int64_t startIndex, 865 UErrorCode *status) { 866 RegularExpression *regexp = (RegularExpression*)regexp2; 867 if (validateRE(regexp, TRUE, status) == FALSE) { 868 return; 869 } 870 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); 871} 872 873//------------------------------------------------------------------------------ 874// 875// uregex_regionStart 876// 877//------------------------------------------------------------------------------ 878U_CAPI int32_t U_EXPORT2 879uregex_regionStart(const URegularExpression *regexp2, 880 UErrorCode *status) { 881 return (int32_t)uregex_regionStart64(regexp2, status); 882} 883 884U_CAPI int64_t U_EXPORT2 885uregex_regionStart64(const URegularExpression *regexp2, 886 UErrorCode *status) { 887 RegularExpression *regexp = (RegularExpression*)regexp2; 888 if (validateRE(regexp, TRUE, status) == FALSE) { 889 return 0; 890 } 891 return regexp->fMatcher->regionStart(); 892} 893 894 895//------------------------------------------------------------------------------ 896// 897// uregex_regionEnd 898// 899//------------------------------------------------------------------------------ 900U_CAPI int32_t U_EXPORT2 901uregex_regionEnd(const URegularExpression *regexp2, 902 UErrorCode *status) { 903 return (int32_t)uregex_regionEnd64(regexp2, status); 904} 905 906U_CAPI int64_t U_EXPORT2 907uregex_regionEnd64(const URegularExpression *regexp2, 908 UErrorCode *status) { 909 RegularExpression *regexp = (RegularExpression*)regexp2; 910 if (validateRE(regexp, TRUE, status) == FALSE) { 911 return 0; 912 } 913 return regexp->fMatcher->regionEnd(); 914} 915 916 
917//------------------------------------------------------------------------------ 918// 919// uregex_hasTransparentBounds 920// 921//------------------------------------------------------------------------------ 922U_CAPI UBool U_EXPORT2 923uregex_hasTransparentBounds(const URegularExpression *regexp2, 924 UErrorCode *status) { 925 RegularExpression *regexp = (RegularExpression*)regexp2; 926 if (validateRE(regexp, FALSE, status) == FALSE) { 927 return FALSE; 928 } 929 return regexp->fMatcher->hasTransparentBounds(); 930} 931 932 933//------------------------------------------------------------------------------ 934// 935// uregex_useTransparentBounds 936// 937//------------------------------------------------------------------------------ 938U_CAPI void U_EXPORT2 939uregex_useTransparentBounds(URegularExpression *regexp2, 940 UBool b, 941 UErrorCode *status) { 942 RegularExpression *regexp = (RegularExpression*)regexp2; 943 if (validateRE(regexp, FALSE, status) == FALSE) { 944 return; 945 } 946 regexp->fMatcher->useTransparentBounds(b); 947} 948 949 950//------------------------------------------------------------------------------ 951// 952// uregex_hasAnchoringBounds 953// 954//------------------------------------------------------------------------------ 955U_CAPI UBool U_EXPORT2 956uregex_hasAnchoringBounds(const URegularExpression *regexp2, 957 UErrorCode *status) { 958 RegularExpression *regexp = (RegularExpression*)regexp2; 959 if (validateRE(regexp, FALSE, status) == FALSE) { 960 return FALSE; 961 } 962 return regexp->fMatcher->hasAnchoringBounds(); 963} 964 965 966//------------------------------------------------------------------------------ 967// 968// uregex_useAnchoringBounds 969// 970//------------------------------------------------------------------------------ 971U_CAPI void U_EXPORT2 972uregex_useAnchoringBounds(URegularExpression *regexp2, 973 UBool b, 974 UErrorCode *status) { 975 RegularExpression *regexp = (RegularExpression*)regexp2; 976 
if (validateRE(regexp, FALSE, status) == FALSE) { 977 return; 978 } 979 regexp->fMatcher->useAnchoringBounds(b); 980} 981 982 983//------------------------------------------------------------------------------ 984// 985// uregex_hitEnd 986// 987//------------------------------------------------------------------------------ 988U_CAPI UBool U_EXPORT2 989uregex_hitEnd(const URegularExpression *regexp2, 990 UErrorCode *status) { 991 RegularExpression *regexp = (RegularExpression*)regexp2; 992 if (validateRE(regexp, TRUE, status) == FALSE) { 993 return FALSE; 994 } 995 return regexp->fMatcher->hitEnd(); 996} 997 998 999//------------------------------------------------------------------------------ 1000// 1001// uregex_requireEnd 1002// 1003//------------------------------------------------------------------------------ 1004U_CAPI UBool U_EXPORT2 1005uregex_requireEnd(const URegularExpression *regexp2, 1006 UErrorCode *status) { 1007 RegularExpression *regexp = (RegularExpression*)regexp2; 1008 if (validateRE(regexp, TRUE, status) == FALSE) { 1009 return FALSE; 1010 } 1011 return regexp->fMatcher->requireEnd(); 1012} 1013 1014 1015//------------------------------------------------------------------------------ 1016// 1017// uregex_setTimeLimit 1018// 1019//------------------------------------------------------------------------------ 1020U_CAPI void U_EXPORT2 1021uregex_setTimeLimit(URegularExpression *regexp2, 1022 int32_t limit, 1023 UErrorCode *status) { 1024 RegularExpression *regexp = (RegularExpression*)regexp2; 1025 if (validateRE(regexp, FALSE, status)) { 1026 regexp->fMatcher->setTimeLimit(limit, *status); 1027 } 1028} 1029 1030 1031 1032//------------------------------------------------------------------------------ 1033// 1034// uregex_getTimeLimit 1035// 1036//------------------------------------------------------------------------------ 1037U_CAPI int32_t U_EXPORT2 1038uregex_getTimeLimit(const URegularExpression *regexp2, 1039 UErrorCode *status) { 1040 
int32_t retVal = 0; 1041 RegularExpression *regexp = (RegularExpression*)regexp2; 1042 if (validateRE(regexp, FALSE, status)) { 1043 retVal = regexp->fMatcher->getTimeLimit(); 1044 } 1045 return retVal; 1046} 1047 1048 1049 1050//------------------------------------------------------------------------------ 1051// 1052// uregex_setStackLimit 1053// 1054//------------------------------------------------------------------------------ 1055U_CAPI void U_EXPORT2 1056uregex_setStackLimit(URegularExpression *regexp2, 1057 int32_t limit, 1058 UErrorCode *status) { 1059 RegularExpression *regexp = (RegularExpression*)regexp2; 1060 if (validateRE(regexp, FALSE, status)) { 1061 regexp->fMatcher->setStackLimit(limit, *status); 1062 } 1063} 1064 1065 1066 1067//------------------------------------------------------------------------------ 1068// 1069// uregex_getStackLimit 1070// 1071//------------------------------------------------------------------------------ 1072U_CAPI int32_t U_EXPORT2 1073uregex_getStackLimit(const URegularExpression *regexp2, 1074 UErrorCode *status) { 1075 int32_t retVal = 0; 1076 RegularExpression *regexp = (RegularExpression*)regexp2; 1077 if (validateRE(regexp, FALSE, status)) { 1078 retVal = regexp->fMatcher->getStackLimit(); 1079 } 1080 return retVal; 1081} 1082 1083 1084//------------------------------------------------------------------------------ 1085// 1086// uregex_setMatchCallback 1087// 1088//------------------------------------------------------------------------------ 1089U_CAPI void U_EXPORT2 1090uregex_setMatchCallback(URegularExpression *regexp2, 1091 URegexMatchCallback *callback, 1092 const void *context, 1093 UErrorCode *status) { 1094 RegularExpression *regexp = (RegularExpression*)regexp2; 1095 if (validateRE(regexp, FALSE, status)) { 1096 regexp->fMatcher->setMatchCallback(callback, context, *status); 1097 } 1098} 1099 1100 1101//------------------------------------------------------------------------------ 1102// 1103// 
uregex_getMatchCallback 1104// 1105//------------------------------------------------------------------------------ 1106U_CAPI void U_EXPORT2 1107uregex_getMatchCallback(const URegularExpression *regexp2, 1108 URegexMatchCallback **callback, 1109 const void **context, 1110 UErrorCode *status) { 1111 RegularExpression *regexp = (RegularExpression*)regexp2; 1112 if (validateRE(regexp, FALSE, status)) { 1113 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1114 } 1115} 1116 1117 1118//------------------------------------------------------------------------------ 1119// 1120// uregex_setMatchProgressCallback 1121// 1122//------------------------------------------------------------------------------ 1123U_CAPI void U_EXPORT2 1124uregex_setFindProgressCallback(URegularExpression *regexp2, 1125 URegexFindProgressCallback *callback, 1126 const void *context, 1127 UErrorCode *status) { 1128 RegularExpression *regexp = (RegularExpression*)regexp2; 1129 if (validateRE(regexp, FALSE, status)) { 1130 regexp->fMatcher->setFindProgressCallback(callback, context, *status); 1131 } 1132} 1133 1134 1135//------------------------------------------------------------------------------ 1136// 1137// uregex_getMatchCallback 1138// 1139//------------------------------------------------------------------------------ 1140U_CAPI void U_EXPORT2 1141uregex_getFindProgressCallback(const URegularExpression *regexp2, 1142 URegexFindProgressCallback **callback, 1143 const void **context, 1144 UErrorCode *status) { 1145 RegularExpression *regexp = (RegularExpression*)regexp2; 1146 if (validateRE(regexp, FALSE, status)) { 1147 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); 1148 } 1149} 1150 1151 1152//------------------------------------------------------------------------------ 1153// 1154// uregex_replaceAll 1155// 1156//------------------------------------------------------------------------------ 1157U_CAPI int32_t U_EXPORT2 
1158uregex_replaceAll(URegularExpression *regexp2, 1159 const UChar *replacementText, 1160 int32_t replacementLength, 1161 UChar *destBuf, 1162 int32_t destCapacity, 1163 UErrorCode *status) { 1164 RegularExpression *regexp = (RegularExpression*)regexp2; 1165 if (validateRE(regexp, TRUE, status) == FALSE) { 1166 return 0; 1167 } 1168 if (replacementText == NULL || replacementLength < -1 || 1169 (destBuf == NULL && destCapacity > 0) || 1170 destCapacity < 0) { 1171 *status = U_ILLEGAL_ARGUMENT_ERROR; 1172 return 0; 1173 } 1174 1175 int32_t len = 0; 1176 1177 uregex_reset(regexp2, 0, status); 1178 1179 // Note: Seperate error code variables for findNext() and appendReplacement() 1180 // are used so that destination buffer overflow errors 1181 // in appendReplacement won't stop findNext() from working. 1182 // appendReplacement() and appendTail() special case incoming buffer 1183 // overflow errors, continuing to return the correct length. 1184 UErrorCode findStatus = *status; 1185 while (uregex_findNext(regexp2, &findStatus)) { 1186 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1187 &destBuf, &destCapacity, status); 1188 } 1189 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1190 1191 if (U_FAILURE(findStatus)) { 1192 // If anything went wrong with the findNext(), make that error trump 1193 // whatever may have happened with the append() operations. 1194 // Errors in findNext() are not expected. 
1195 *status = findStatus; 1196 } 1197 1198 return len; 1199} 1200 1201 1202//------------------------------------------------------------------------------ 1203// 1204// uregex_replaceAllUText 1205// 1206//------------------------------------------------------------------------------ 1207U_CAPI UText * U_EXPORT2 1208uregex_replaceAllUText(URegularExpression *regexp2, 1209 UText *replacementText, 1210 UText *dest, 1211 UErrorCode *status) { 1212 RegularExpression *regexp = (RegularExpression*)regexp2; 1213 if (validateRE(regexp, TRUE, status) == FALSE) { 1214 return 0; 1215 } 1216 if (replacementText == NULL) { 1217 *status = U_ILLEGAL_ARGUMENT_ERROR; 1218 return 0; 1219 } 1220 1221 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1222 return dest; 1223} 1224 1225 1226//------------------------------------------------------------------------------ 1227// 1228// uregex_replaceFirst 1229// 1230//------------------------------------------------------------------------------ 1231U_CAPI int32_t U_EXPORT2 1232uregex_replaceFirst(URegularExpression *regexp2, 1233 const UChar *replacementText, 1234 int32_t replacementLength, 1235 UChar *destBuf, 1236 int32_t destCapacity, 1237 UErrorCode *status) { 1238 RegularExpression *regexp = (RegularExpression*)regexp2; 1239 if (validateRE(regexp, TRUE, status) == FALSE) { 1240 return 0; 1241 } 1242 if (replacementText == NULL || replacementLength < -1 || 1243 (destBuf == NULL && destCapacity > 0) || 1244 destCapacity < 0) { 1245 *status = U_ILLEGAL_ARGUMENT_ERROR; 1246 return 0; 1247 } 1248 1249 int32_t len = 0; 1250 UBool findSucceeded; 1251 uregex_reset(regexp2, 0, status); 1252 findSucceeded = uregex_find(regexp2, 0, status); 1253 if (findSucceeded) { 1254 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1255 &destBuf, &destCapacity, status); 1256 } 1257 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1258 1259 return len; 1260} 1261 1262 
//------------------------------------------------------------------------------
//
//    uregex_replaceFirstUText
//
//        UText flavor of replaceFirst.  After validateRE() (text required)
//        succeeds and replacementText is non-NULL, the work is delegated to
//        RegexMatcher::replaceFirst() and its result is returned.
//        Returns a null pointer on a validation error; a NULL replacementText
//        additionally sets U_ILLEGAL_ARGUMENT_ERROR.
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression  *regexp2,
                         UText               *replacementText,
                         UText               *dest,
                         UErrorCode          *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, TRUE, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // The matcher's return value replaces the caller-supplied dest.
    dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
    return dest;
}


//------------------------------------------------------------------------------
//
//    uregex_appendReplacement
//
//------------------------------------------------------------------------------

U_NAMESPACE_BEGIN
//
//  Dummy class, because these functions need to be friends of class RegexMatcher,
//               and stand-alone C functions don't work as friends
//
//  All three members are static; the uregex_appendReplacement(),
//  uregex_appendTail() and uregex_split() API functions forward to them.
//
class RegexCImpl {
 public:
   // Implementation behind uregex_appendReplacement().
   inline static  int32_t appendReplacement(RegularExpression    *regexp,
                      const UChar           *replacementText,
                      int32_t                replacementLength,
                      UChar                **destBuf,
                      int32_t               *destCapacity,
                      UErrorCode            *status);

   // Implementation behind uregex_appendTail().
   inline static int32_t appendTail(RegularExpression    *regexp,
        UChar                **destBuf,
        int32_t               *destCapacity,
        UErrorCode            *status);

    // Implementation behind uregex_split().
    inline static int32_t split(RegularExpression    *regexp,
        UChar                 *destBuf,
        int32_t                destCapacity,
        int32_t               *requiredCapacity,
        UChar                 *destFields[],
        int32_t                destFieldsCapacity,
        UErrorCode            *status);
};

U_NAMESPACE_END



// Characters given special treatment when scanning replacement text.
static const UChar BACKSLASH  = 0x5c;   // '\'
static const UChar DOLLARSIGN = 0x24;   // '$'

//
//  Move a character to an output buffer, with bounds checking on the index.
1330// Index advances even if capacity is exceeded, for preflight size computations. 1331// This little sequence is used a LOT. 1332// 1333static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1334 if (*idx < bufCapacity) { 1335 buf[*idx] = c; 1336 } 1337 (*idx)++; 1338} 1339 1340 1341// 1342// appendReplacement, the actual implementation. 1343// 1344int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1345 const UChar *replacementText, 1346 int32_t replacementLength, 1347 UChar **destBuf, 1348 int32_t *destCapacity, 1349 UErrorCode *status) { 1350 1351 // If we come in with a buffer overflow error, don't suppress the operation. 1352 // A series of appendReplacements, appendTail need to correctly preflight 1353 // the buffer size when an overflow happens somewhere in the middle. 1354 UBool pendingBufferOverflow = FALSE; 1355 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1356 pendingBufferOverflow = TRUE; 1357 *status = U_ZERO_ERROR; 1358 } 1359 1360 // 1361 // Validate all paramters 1362 // 1363 if (validateRE(regexp, TRUE, status) == FALSE) { 1364 return 0; 1365 } 1366 if (replacementText == NULL || replacementLength < -1 || 1367 destCapacity == NULL || destBuf == NULL || 1368 (*destBuf == NULL && *destCapacity > 0) || 1369 *destCapacity < 0) { 1370 *status = U_ILLEGAL_ARGUMENT_ERROR; 1371 return 0; 1372 } 1373 1374 RegexMatcher *m = regexp->fMatcher; 1375 if (m->fMatch == FALSE) { 1376 *status = U_REGEX_INVALID_STATE; 1377 return 0; 1378 } 1379 1380 UChar *dest = *destBuf; 1381 int32_t capacity = *destCapacity; 1382 int32_t destIdx = 0; 1383 int32_t i; 1384 1385 // If it wasn't supplied by the caller, get the length of the replacement text. 1386 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1387 // the fly and avoid this step. 
1388 if (replacementLength == -1) { 1389 replacementLength = u_strlen(replacementText); 1390 } 1391 1392 // Copy input string from the end of previous match to start of current match 1393 if (regexp->fText != NULL) { 1394 int32_t matchStart; 1395 int32_t lastMatchEnd; 1396 if (UTEXT_USES_U16(m->fInputText)) { 1397 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1398 matchStart = (int32_t)m->fMatchStart; 1399 } else { 1400 // !!!: Would like a better way to do this! 1401 UErrorCode status = U_ZERO_ERROR; 1402 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); 1403 status = U_ZERO_ERROR; 1404 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); 1405 } 1406 for (i=lastMatchEnd; i<matchStart; i++) { 1407 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1408 } 1409 } else { 1410 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1411 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1412 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), 1413 &possibleOverflowError); 1414 } 1415 U_ASSERT(destIdx >= 0); 1416 1417 // scan the replacement text, looking for substitutions ($n) and \escapes. 1418 int32_t replIdx = 0; 1419 while (replIdx < replacementLength) { 1420 UChar c = replacementText[replIdx]; 1421 replIdx++; 1422 if (c != DOLLARSIGN && c != BACKSLASH) { 1423 // Common case, no substitution, no escaping, 1424 // just copy the char to the dest buf. 1425 appendToBuf(c, &destIdx, dest, capacity); 1426 continue; 1427 } 1428 1429 if (c == BACKSLASH) { 1430 // Backslash Escape. Copy the following char out without further checks. 1431 // Note: Surrogate pairs don't need any special handling 1432 // The second half wont be a '$' or a '\', and 1433 // will move to the dest normally on the next 1434 // loop iteration. 
1435 if (replIdx >= replacementLength) { 1436 break; 1437 } 1438 c = replacementText[replIdx]; 1439 1440 if (c==0x55/*U*/ || c==0x75/*u*/) { 1441 // We have a \udddd or \Udddddddd escape sequence. 1442 UChar32 escapedChar = 1443 u_unescapeAt(uregex_ucstr_unescape_charAt, 1444 &replIdx, // Index is updated by unescapeAt 1445 replacementLength, // Length of replacement text 1446 (void *)replacementText); 1447 1448 if (escapedChar != (UChar32)0xFFFFFFFF) { 1449 if (escapedChar <= 0xffff) { 1450 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1451 } else { 1452 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1453 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1454 } 1455 continue; 1456 } 1457 // Note: if the \u escape was invalid, just fall through and 1458 // treat it as a plain \<anything> escape. 1459 } 1460 1461 // Plain backslash escape. Just put out the escaped character. 1462 appendToBuf(c, &destIdx, dest, capacity); 1463 1464 replIdx++; 1465 continue; 1466 } 1467 1468 1469 1470 // We've got a $. Pick up a capture group number if one follows. 1471 // Consume at most the number of digits necessary for the largest capture 1472 // number that is valid for this pattern. 1473 1474 int32_t numDigits = 0; 1475 int32_t groupNum = 0; 1476 UChar32 digitC; 1477 for (;;) { 1478 if (replIdx >= replacementLength) { 1479 break; 1480 } 1481 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); 1482 if (u_isdigit(digitC) == FALSE) { 1483 break; 1484 } 1485 1486 U16_FWD_1(replacementText, replIdx, replacementLength); 1487 groupNum=groupNum*10 + u_charDigitValue(digitC); 1488 numDigits++; 1489 if (numDigits >= m->fPattern->fMaxCaptureDigits) { 1490 break; 1491 } 1492 } 1493 1494 1495 if (numDigits == 0) { 1496 // The $ didn't introduce a group number at all. 1497 // Treat it as just part of the substitution text. 
1498 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); 1499 continue; 1500 } 1501 1502 // Finally, append the capture group data to the destination. 1503 destIdx += uregex_group((URegularExpression*)regexp, groupNum, 1504 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1505 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1506 // Ignore buffer overflow when extracting the group. We need to 1507 // continue on to get full size of the untruncated result. We will 1508 // raise our own buffer overflow error at the end. 1509 *status = U_ZERO_ERROR; 1510 } 1511 1512 if (U_FAILURE(*status)) { 1513 // Can fail if group number is out of range. 1514 break; 1515 } 1516 1517 } 1518 1519 // 1520 // Nul Terminate the dest buffer if possible. 1521 // Set the appropriate buffer overflow or not terminated error, if needed. 1522 // 1523 if (destIdx < capacity) { 1524 dest[destIdx] = 0; 1525 } else if (destIdx == *destCapacity) { 1526 *status = U_STRING_NOT_TERMINATED_WARNING; 1527 } else { 1528 *status = U_BUFFER_OVERFLOW_ERROR; 1529 } 1530 1531 // 1532 // Return an updated dest buffer and capacity to the caller. 1533 // 1534 if (destIdx > 0 && *destCapacity > 0) { 1535 if (destIdx < capacity) { 1536 *destBuf += destIdx; 1537 *destCapacity -= destIdx; 1538 } else { 1539 *destBuf += capacity; 1540 *destCapacity = 0; 1541 } 1542 } 1543 1544 // If we came in with a buffer overflow, make sure we go out with one also. 
1545 // (A zero length match right at the end of the previous match could 1546 // make this function succeed even though a previous call had overflowed the buf) 1547 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1548 *status = U_BUFFER_OVERFLOW_ERROR; 1549 } 1550 1551 return destIdx; 1552} 1553 1554// 1555// appendReplacement the actual API function, 1556// 1557U_CAPI int32_t U_EXPORT2 1558uregex_appendReplacement(URegularExpression *regexp2, 1559 const UChar *replacementText, 1560 int32_t replacementLength, 1561 UChar **destBuf, 1562 int32_t *destCapacity, 1563 UErrorCode *status) { 1564 1565 RegularExpression *regexp = (RegularExpression*)regexp2; 1566 return RegexCImpl::appendReplacement( 1567 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1568} 1569 1570// 1571// uregex_appendReplacementUText...can just use the normal C++ method 1572// 1573U_CAPI void U_EXPORT2 1574uregex_appendReplacementUText(URegularExpression *regexp2, 1575 UText *replText, 1576 UText *dest, 1577 UErrorCode *status) { 1578 RegularExpression *regexp = (RegularExpression*)regexp2; 1579 regexp->fMatcher->appendReplacement(dest, replText, *status); 1580} 1581 1582 1583//------------------------------------------------------------------------------ 1584// 1585// uregex_appendTail 1586// 1587//------------------------------------------------------------------------------ 1588int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1589 UChar **destBuf, 1590 int32_t *destCapacity, 1591 UErrorCode *status) 1592{ 1593 1594 // If we come in with a buffer overflow error, don't suppress the operation. 1595 // A series of appendReplacements, appendTail need to correctly preflight 1596 // the buffer size when an overflow happens somewhere in the middle. 
1597 UBool pendingBufferOverflow = FALSE; 1598 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1599 pendingBufferOverflow = TRUE; 1600 *status = U_ZERO_ERROR; 1601 } 1602 1603 if (validateRE(regexp, TRUE, status) == FALSE) { 1604 return 0; 1605 } 1606 1607 if (destCapacity == NULL || destBuf == NULL || 1608 (*destBuf == NULL && *destCapacity > 0) || 1609 *destCapacity < 0) 1610 { 1611 *status = U_ILLEGAL_ARGUMENT_ERROR; 1612 return 0; 1613 } 1614 1615 RegexMatcher *m = regexp->fMatcher; 1616 1617 int32_t destIdx = 0; 1618 int32_t destCap = *destCapacity; 1619 UChar *dest = *destBuf; 1620 1621 if (regexp->fText != NULL) { 1622 int32_t srcIdx; 1623 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1624 if (nativeIdx == -1) { 1625 srcIdx = 0; 1626 } else if (UTEXT_USES_U16(m->fInputText)) { 1627 srcIdx = (int32_t)nativeIdx; 1628 } else { 1629 UErrorCode status = U_ZERO_ERROR; 1630 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1631 } 1632 1633 for (;;) { 1634 U_ASSERT(destIdx >= 0); 1635 1636 if (srcIdx == regexp->fTextLength) { 1637 break; 1638 } 1639 UChar c = regexp->fText[srcIdx]; 1640 if (c == 0 && regexp->fTextLength == -1) { 1641 regexp->fTextLength = srcIdx; 1642 break; 1643 } 1644 1645 if (destIdx < destCap) { 1646 dest[destIdx] = c; 1647 } else { 1648 // We've overflowed the dest buffer. 1649 // If the total input string length is known, we can 1650 // compute the total buffer size needed without scanning through the string. 1651 if (regexp->fTextLength > 0) { 1652 destIdx += (regexp->fTextLength - srcIdx); 1653 break; 1654 } 1655 } 1656 srcIdx++; 1657 destIdx++; 1658 } 1659 } else { 1660 int64_t srcIdx; 1661 if (m->fMatch) { 1662 // The most recent call to find() succeeded. 1663 srcIdx = m->fMatchEnd; 1664 } else { 1665 // The last call to find() on this matcher failed(). 1666 // Look back to the end of the last find() that succeeded for src index. 
1667 srcIdx = m->fLastMatchEnd; 1668 if (srcIdx == -1) { 1669 // There has been no successful match with this matcher. 1670 // We want to copy the whole string. 1671 srcIdx = 0; 1672 } 1673 } 1674 1675 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1676 } 1677 1678 // 1679 // NUL terminate the output string, if possible, otherwise issue the 1680 // appropriate error or warning. 1681 // 1682 if (destIdx < destCap) { 1683 dest[destIdx] = 0; 1684 } else if (destIdx == destCap) { 1685 *status = U_STRING_NOT_TERMINATED_WARNING; 1686 } else { 1687 *status = U_BUFFER_OVERFLOW_ERROR; 1688 } 1689 1690 // 1691 // Update the user's buffer ptr and capacity vars to reflect the 1692 // amount used. 1693 // 1694 if (destIdx < destCap) { 1695 *destBuf += destIdx; 1696 *destCapacity -= destIdx; 1697 } else if (*destBuf != NULL) { 1698 *destBuf += destCap; 1699 *destCapacity = 0; 1700 } 1701 1702 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1703 *status = U_BUFFER_OVERFLOW_ERROR; 1704 } 1705 1706 return destIdx; 1707} 1708 1709 1710// 1711// appendTail the actual API function 1712// 1713U_CAPI int32_t U_EXPORT2 1714uregex_appendTail(URegularExpression *regexp2, 1715 UChar **destBuf, 1716 int32_t *destCapacity, 1717 UErrorCode *status) { 1718 RegularExpression *regexp = (RegularExpression*)regexp2; 1719 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1720} 1721 1722 1723// 1724// uregex_appendTailUText...can just use the normal C++ method 1725// 1726U_CAPI UText * U_EXPORT2 1727uregex_appendTailUText(URegularExpression *regexp2, 1728 UText *dest, 1729 UErrorCode *status) { 1730 RegularExpression *regexp = (RegularExpression*)regexp2; 1731 return regexp->fMatcher->appendTail(dest, *status); 1732} 1733 1734 1735//------------------------------------------------------------------------------ 1736// 1737// copyString Internal utility to copy a string to an output buffer, 1738// while managing buffer overflow and 
preflight size 1739// computation. NUL termination is added to destination, 1740// and the NUL is counted in the output size. 1741// 1742//------------------------------------------------------------------------------ 1743#if 0 1744static void copyString(UChar *destBuffer, // Destination buffer. 1745 int32_t destCapacity, // Total capacity of dest buffer 1746 int32_t *destIndex, // Index into dest buffer. Updated on return. 1747 // Update not clipped to destCapacity. 1748 const UChar *srcPtr, // Pointer to source string 1749 int32_t srcLen) // Source string len. 1750{ 1751 int32_t si; 1752 int32_t di = *destIndex; 1753 UChar c; 1754 1755 for (si=0; si<srcLen; si++) { 1756 c = srcPtr[si]; 1757 if (di < destCapacity) { 1758 destBuffer[di] = c; 1759 di++; 1760 } else { 1761 di += srcLen - si; 1762 break; 1763 } 1764 } 1765 if (di<destCapacity) { 1766 destBuffer[di] = 0; 1767 } 1768 di++; 1769 *destIndex = di; 1770} 1771#endif 1772 1773//------------------------------------------------------------------------------ 1774// 1775// uregex_split 1776// 1777//------------------------------------------------------------------------------ 1778int32_t RegexCImpl::split(RegularExpression *regexp, 1779 UChar *destBuf, 1780 int32_t destCapacity, 1781 int32_t *requiredCapacity, 1782 UChar *destFields[], 1783 int32_t destFieldsCapacity, 1784 UErrorCode *status) { 1785 // 1786 // Reset for the input text 1787 // 1788 regexp->fMatcher->reset(); 1789 UText *inputText = regexp->fMatcher->fInputText; 1790 int64_t nextOutputStringStart = 0; 1791 int64_t inputLen = regexp->fMatcher->fInputLength; 1792 if (inputLen == 0) { 1793 return 0; 1794 } 1795 1796 // 1797 // Loop through the input text, searching for the delimiter pattern 1798 // 1799 int32_t i; // Index of the field being processed. 
1800 int32_t destIdx = 0; // Next available position in destBuf; 1801 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1802 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1803 for (i=0; ; i++) { 1804 if (i>=destFieldsCapacity-1) { 1805 // There are one or zero output strings left. 1806 // Fill the last output string with whatever is left from the input, then exit the loop. 1807 // ( i will be == destFieldsCapacity if we filled the output array while processing 1808 // capture groups of the delimiter expression, in which case we will discard the 1809 // last capture group saved in favor of the unprocessed remainder of the 1810 // input string.) 1811 if (inputLen > nextOutputStringStart) { 1812 if (i != destFieldsCapacity-1) { 1813 // No fields are left. Recycle the last one for holding the trailing part of 1814 // the input string. 1815 i = destFieldsCapacity-1; 1816 destIdx = (int32_t)(destFields[i] - destFields[0]); 1817 } 1818 1819 destFields[i] = &destBuf[destIdx]; 1820 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1821 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1822 } 1823 break; 1824 } 1825 1826 if (regexp->fMatcher->find()) { 1827 // We found another delimiter. Move everything from where we started looking 1828 // up until the start of the delimiter into the next output string. 1829 destFields[i] = &destBuf[destIdx]; 1830 1831 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1832 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1833 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1834 tStatus = U_ZERO_ERROR; 1835 } else { 1836 *status = tStatus; 1837 } 1838 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1839 1840 // If the delimiter pattern has capturing parentheses, the captured 1841 // text goes out into the next n destination strings. 
1842 int32_t groupNum; 1843 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1844 // If we've run out of output string slots, bail out. 1845 if (i==destFieldsCapacity-1) { 1846 break; 1847 } 1848 i++; 1849 1850 // Set up to extract the capture group contents into the dest buffer. 1851 destFields[i] = &destBuf[destIdx]; 1852 tStatus = U_ZERO_ERROR; 1853 int32_t t = uregex_group((URegularExpression*)regexp, 1854 groupNum, 1855 destFields[i], 1856 REMAINING_CAPACITY(destIdx, destCapacity), 1857 &tStatus); 1858 destIdx += t + 1; // Record the space used in the output string buffer. 1859 // +1 for the NUL that terminates the string. 1860 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1861 tStatus = U_ZERO_ERROR; 1862 } else { 1863 *status = tStatus; 1864 } 1865 } 1866 1867 if (nextOutputStringStart == inputLen) { 1868 // The delimiter was at the end of the string. 1869 // Output an empty string, and then we are done. 1870 if (destIdx < destCapacity) { 1871 destBuf[destIdx] = 0; 1872 } 1873 if (i < destFieldsCapacity-1) { 1874 ++i; 1875 } 1876 if (destIdx < destCapacity) { 1877 destFields[i] = destBuf + destIdx; 1878 } 1879 ++destIdx; 1880 break; 1881 } 1882 1883 } 1884 else 1885 { 1886 // We ran off the end of the input while looking for the next delimiter. 1887 // All the remaining text goes into the current output string. 
1888 destFields[i] = &destBuf[destIdx]; 1889 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1890 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1891 break; 1892 } 1893 } 1894 1895 // Zero out any unused portion of the destFields array 1896 int j; 1897 for (j=i+1; j<destFieldsCapacity; j++) { 1898 destFields[j] = NULL; 1899 } 1900 1901 if (requiredCapacity != NULL) { 1902 *requiredCapacity = destIdx; 1903 } 1904 if (destIdx > destCapacity) { 1905 *status = U_BUFFER_OVERFLOW_ERROR; 1906 } 1907 return i+1; 1908} 1909 1910// 1911// uregex_split The actual API function 1912// 1913U_CAPI int32_t U_EXPORT2 1914uregex_split(URegularExpression *regexp2, 1915 UChar *destBuf, 1916 int32_t destCapacity, 1917 int32_t *requiredCapacity, 1918 UChar *destFields[], 1919 int32_t destFieldsCapacity, 1920 UErrorCode *status) { 1921 RegularExpression *regexp = (RegularExpression*)regexp2; 1922 if (validateRE(regexp, TRUE, status) == FALSE) { 1923 return 0; 1924 } 1925 if ((destBuf == NULL && destCapacity > 0) || 1926 destCapacity < 0 || 1927 destFields == NULL || 1928 destFieldsCapacity < 1 ) { 1929 *status = U_ILLEGAL_ARGUMENT_ERROR; 1930 return 0; 1931 } 1932 1933 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1934} 1935 1936 1937// 1938// uregex_splitUText...can just use the normal C++ method 1939// 1940U_CAPI int32_t U_EXPORT2 1941uregex_splitUText(URegularExpression *regexp2, 1942 UText *destFields[], 1943 int32_t destFieldsCapacity, 1944 UErrorCode *status) { 1945 RegularExpression *regexp = (RegularExpression*)regexp2; 1946 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1947} 1948 1949 1950#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1951 1952