1/* 2******************************************************************************* 3* Copyright (C) 2004-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: regex.cpp 7*/ 8 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13#include "unicode/regex.h" 14#include "unicode/uregex.h" 15#include "unicode/unistr.h" 16#include "unicode/ustring.h" 17#include "unicode/uchar.h" 18#include "unicode/uobject.h" 19#include "umutex.h" 20#include "uassert.h" 21#include "cmemory.h" 22 23#include "regextxt.h" 24 25#include <stdio.h> 26 27U_NAMESPACE_BEGIN 28 29#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 30 31struct RegularExpression: public UMemory { 32public: 33 RegularExpression(); 34 ~RegularExpression(); 35 int32_t fMagic; 36 RegexPattern *fPat; 37 int32_t *fPatRefCount; 38 UChar *fPatString; 39 int32_t fPatStringLen; 40 RegexMatcher *fMatcher; 41 const UChar *fText; // Text from setText() 42 int32_t fTextLength; // Length provided by user with setText(), which 43 // may be -1. 44 UBool fOwnsText; 45}; 46 47static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 48 49RegularExpression::RegularExpression() { 50 fMagic = REXP_MAGIC; 51 fPat = NULL; 52 fPatRefCount = NULL; 53 fPatString = NULL; 54 fPatStringLen = 0; 55 fMatcher = NULL; 56 fText = NULL; 57 fTextLength = 0; 58 fOwnsText = FALSE; 59} 60 61RegularExpression::~RegularExpression() { 62 delete fMatcher; 63 fMatcher = NULL; 64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 65 delete fPat; 66 uprv_free(fPatString); 67 uprv_free(fPatRefCount); 68 } 69 if (fOwnsText && fText!=NULL) { 70 uprv_free((void *)fText); 71 } 72 fMagic = 0; 73} 74 75U_NAMESPACE_END 76 77U_NAMESPACE_USE 78 79//---------------------------------------------------------------------------------------- 80// 81// validateRE Do boilerplate style checks on API function parameters. 82// Return TRUE if they look OK. 83//---------------------------------------------------------------------------------------- 84static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { 85 if (U_FAILURE(*status)) { 86 return FALSE; 87 } 88 if (re == NULL || re->fMagic != REXP_MAGIC) { 89 *status = U_ILLEGAL_ARGUMENT_ERROR; 90 return FALSE; 91 } 92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 93 if (requiresText && re->fText == NULL && !re->fOwnsText) { 94 *status = U_REGEX_INVALID_STATE; 95 return FALSE; 96 } 97 return TRUE; 98} 99 100//---------------------------------------------------------------------------------------- 101// 102// uregex_open 103// 104//---------------------------------------------------------------------------------------- 105U_CAPI URegularExpression * U_EXPORT2 106uregex_open( const UChar *pattern, 107 int32_t patternLength, 108 uint32_t flags, 109 UParseError *pe, 110 UErrorCode *status) { 111 112 if (U_FAILURE(*status)) { 113 return NULL; 114 } 115 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 116 *status = U_ILLEGAL_ARGUMENT_ERROR; 117 return NULL; 118 } 119 int32_t actualPatLen = patternLength; 120 if (actualPatLen == -1) { 121 actualPatLen = u_strlen(pattern); 122 } 123 124 RegularExpression *re = new RegularExpression; 125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 127 if (re == NULL || refC == NULL || patBuf == NULL) { 128 *status = U_MEMORY_ALLOCATION_ERROR; 129 delete re; 130 uprv_free(refC); 131 uprv_free(patBuf); 132 return NULL; 133 } 134 re->fPatRefCount = refC; 135 *re->fPatRefCount = 1; 136 137 // 138 // Make a copy of the pattern string, so we can return it later if asked. 139 // For compiling the pattern, we will use a UText wrapper around 140 // this local copy, to avoid making even more copies. 141 // 142 re->fPatString = patBuf; 143 re->fPatStringLen = patternLength; 144 u_memcpy(patBuf, pattern, actualPatLen); 145 patBuf[actualPatLen] = 0; 146 147 UText patText = UTEXT_INITIALIZER; 148 utext_openUChars(&patText, patBuf, patternLength, status); 149 150 // 151 // Compile the pattern 152 // 153 if (pe != NULL) { 154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 155 } else { 156 re->fPat = RegexPattern::compile(&patText, flags, *status); 157 } 158 utext_close(&patText); 159 160 if (U_FAILURE(*status)) { 161 goto ErrorExit; 162 } 163 164 // 165 // Create the matcher object 166 // 167 re->fMatcher = re->fPat->matcher(*status); 168 if (U_SUCCESS(*status)) { 169 return (URegularExpression*)re; 170 } 171 172ErrorExit: 173 delete re; 174 return NULL; 175 176} 177 178//---------------------------------------------------------------------------------------- 179// 180// uregex_openUText 181// 182//---------------------------------------------------------------------------------------- 183U_CAPI URegularExpression * U_EXPORT2 184uregex_openUText(UText *pattern, 185 uint32_t flags, 186 UParseError *pe, 187 UErrorCode *status) { 188 189 if (U_FAILURE(*status)) { 190 return NULL; 191 } 192 if (pattern == NULL) { 193 *status = U_ILLEGAL_ARGUMENT_ERROR; 194 return NULL; 195 } 196 197 int64_t patternNativeLength = utext_nativeLength(pattern); 198 199 if (patternNativeLength == 0) { 200 *status = U_ILLEGAL_ARGUMENT_ERROR; 201 return NULL; 202 } 203 204 RegularExpression *re = new RegularExpression; 205 206 UErrorCode lengthStatus = U_ZERO_ERROR; 207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 208 209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 211 if (re == NULL || refC == NULL || patBuf == NULL) { 212 *status = U_MEMORY_ALLOCATION_ERROR; 213 delete re; 214 uprv_free(refC); 215 uprv_free(patBuf); 216 return NULL; 217 } 218 re->fPatRefCount = refC; 219 *re->fPatRefCount = 1; 220 221 // 222 // Make a copy of the pattern string, so we can return it later if asked. 223 // For compiling the pattern, we will use a read-only UText wrapper 224 // around this local copy, to avoid making even more copies. 225 // 226 re->fPatString = patBuf; 227 re->fPatStringLen = pattern16Length; 228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 229 230 UText patText = UTEXT_INITIALIZER; 231 utext_openUChars(&patText, patBuf, pattern16Length, status); 232 233 // 234 // Compile the pattern 235 // 236 if (pe != NULL) { 237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 238 } else { 239 re->fPat = RegexPattern::compile(&patText, flags, *status); 240 } 241 utext_close(&patText); 242 243 if (U_FAILURE(*status)) { 244 goto ErrorExit; 245 } 246 247 // 248 // Create the matcher object 249 // 250 re->fMatcher = re->fPat->matcher(*status); 251 if (U_SUCCESS(*status)) { 252 return (URegularExpression*)re; 253 } 254 255ErrorExit: 256 delete re; 257 return NULL; 258 259} 260 261//---------------------------------------------------------------------------------------- 262// 263// uregex_close 264// 265//---------------------------------------------------------------------------------------- 266U_CAPI void U_EXPORT2 267uregex_close(URegularExpression *re2) { 268 RegularExpression *re = (RegularExpression*)re2; 269 UErrorCode status = U_ZERO_ERROR; 270 if (validateRE(re, FALSE, &status) == FALSE) { 271 return; 272 } 273 delete re; 274} 275 276 277//---------------------------------------------------------------------------------------- 278// 279// uregex_clone 280// 281//---------------------------------------------------------------------------------------- 282U_CAPI URegularExpression * U_EXPORT2 283uregex_clone(const URegularExpression *source2, UErrorCode *status) { 284 RegularExpression *source = (RegularExpression*)source2; 285 if (validateRE(source, FALSE, status) == FALSE) { 286 return NULL; 287 } 288 289 RegularExpression *clone = new RegularExpression; 290 if (clone == NULL) { 291 *status = U_MEMORY_ALLOCATION_ERROR; 292 return NULL; 293 } 294 295 clone->fMatcher = source->fPat->matcher(*status); 296 if (U_FAILURE(*status)) { 297 delete clone; 298 return NULL; 299 } 300 301 clone->fPat = source->fPat; 302 clone->fPatRefCount = source->fPatRefCount; 303 clone->fPatString = source->fPatString; 304 clone->fPatStringLen = source->fPatStringLen; 305 umtx_atomic_inc(source->fPatRefCount); 306 // Note: fText is not cloned. 307 308 return (URegularExpression*)clone; 309} 310 311 312 313 314//------------------------------------------------------------------------------ 315// 316// uregex_pattern 317// 318//------------------------------------------------------------------------------ 319U_CAPI const UChar * U_EXPORT2 320uregex_pattern(const URegularExpression *regexp2, 321 int32_t *patLength, 322 UErrorCode *status) { 323 RegularExpression *regexp = (RegularExpression*)regexp2; 324 325 if (validateRE(regexp, FALSE, status) == FALSE) { 326 return NULL; 327 } 328 if (patLength != NULL) { 329 *patLength = regexp->fPatStringLen; 330 } 331 return regexp->fPatString; 332} 333 334 335//------------------------------------------------------------------------------ 336// 337// uregex_patternUText 338// 339//------------------------------------------------------------------------------ 340U_CAPI UText * U_EXPORT2 341uregex_patternUText(const URegularExpression *regexp2, 342 UErrorCode *status) { 343 RegularExpression *regexp = (RegularExpression*)regexp2; 344 return regexp->fPat->patternText(*status); 345} 346 347 348//------------------------------------------------------------------------------ 349// 350// uregex_flags 351// 352//------------------------------------------------------------------------------ 353U_CAPI int32_t U_EXPORT2 354uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { 355 RegularExpression *regexp = (RegularExpression*)regexp2; 356 if (validateRE(regexp, FALSE, status) == FALSE) { 357 return 0; 358 } 359 int32_t flags = regexp->fPat->flags(); 360 return flags; 361} 362 363 364//------------------------------------------------------------------------------ 365// 366// uregex_setText 367// 368//------------------------------------------------------------------------------ 369U_CAPI void U_EXPORT2 370uregex_setText(URegularExpression *regexp2, 371 const UChar *text, 372 int32_t textLength, 373 UErrorCode *status) { 374 RegularExpression *regexp = (RegularExpression*)regexp2; 375 if (validateRE(regexp, FALSE, status) == FALSE) { 376 return; 377 } 378 if (text == NULL || textLength < -1) { 379 *status = U_ILLEGAL_ARGUMENT_ERROR; 380 return; 381 } 382 383 if (regexp->fOwnsText && regexp->fText != NULL) { 384 uprv_free((void *)regexp->fText); 385 } 386 387 regexp->fText = text; 388 regexp->fTextLength = textLength; 389 regexp->fOwnsText = FALSE; 390 391 UText input = UTEXT_INITIALIZER; 392 utext_openUChars(&input, text, textLength, status); 393 regexp->fMatcher->reset(&input); 394 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 395} 396 397 398//------------------------------------------------------------------------------ 399// 400// uregex_setUText 401// 402//------------------------------------------------------------------------------ 403U_CAPI void U_EXPORT2 404uregex_setUText(URegularExpression *regexp2, 405 UText *text, 406 UErrorCode *status) { 407 RegularExpression *regexp = (RegularExpression*)regexp2; 408 if (validateRE(regexp, FALSE, status) == FALSE) { 409 return; 410 } 411 if (text == NULL) { 412 *status = U_ILLEGAL_ARGUMENT_ERROR; 413 return; 414 } 415 416 if (regexp->fOwnsText && regexp->fText != NULL) { 417 uprv_free((void *)regexp->fText); 418 } 419 420 regexp->fText = NULL; // only fill it in on request 421 regexp->fTextLength = -1; 422 regexp->fOwnsText = TRUE; 423 regexp->fMatcher->reset(text); 424} 425 426 427 428//------------------------------------------------------------------------------ 429// 430// uregex_getText 431// 432//------------------------------------------------------------------------------ 433U_CAPI const UChar * U_EXPORT2 434uregex_getText(URegularExpression *regexp2, 435 int32_t *textLength, 436 UErrorCode *status) { 437 RegularExpression *regexp = (RegularExpression*)regexp2; 438 if (validateRE(regexp, FALSE, status) == FALSE) { 439 return NULL; 440 } 441 442 if (regexp->fText == NULL) { 443 // need to fill in the text 444 UText *inputText = regexp->fMatcher->inputText(); 445 int64_t inputNativeLength = utext_nativeLength(inputText); 446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 447 regexp->fText = inputText->chunkContents; 448 regexp->fTextLength = (int32_t)inputNativeLength; 449 regexp->fOwnsText = FALSE; // because the UText owns it 450 } else { 451 UErrorCode lengthStatus = U_ZERO_ERROR; 452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 454 455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 456 regexp->fText = inputChars; 457 regexp->fOwnsText = TRUE; // should already be set but just in case 458 } 459 } 460 461 if (textLength != NULL) { 462 *textLength = regexp->fTextLength; 463 } 464 return regexp->fText; 465} 466 467 468//------------------------------------------------------------------------------ 469// 470// uregex_getUText 471// 472//------------------------------------------------------------------------------ 473U_CAPI UText * U_EXPORT2 474uregex_getUText(URegularExpression *regexp2, 475 UText *dest, 476 UErrorCode *status) { 477 RegularExpression *regexp = (RegularExpression*)regexp2; 478 if (validateRE(regexp, FALSE, status) == FALSE) { 479 return dest; 480 } 481 return regexp->fMatcher->getInput(dest, *status); 482} 483 484 485//------------------------------------------------------------------------------ 486// 487// uregex_refreshUText 488// 489//------------------------------------------------------------------------------ 490U_CAPI void U_EXPORT2 491uregex_refreshUText(URegularExpression *regexp2, 492 UText *text, 493 UErrorCode *status) { 494 RegularExpression *regexp = (RegularExpression*)regexp2; 495 if (validateRE(regexp, FALSE, status) == FALSE) { 496 return; 497 } 498 regexp->fMatcher->refreshInputText(text, *status); 499} 500 501 502//------------------------------------------------------------------------------ 503// 504// uregex_matches 505// 506//------------------------------------------------------------------------------ 507U_CAPI UBool U_EXPORT2 508uregex_matches(URegularExpression *regexp2, 509 int32_t startIndex, 510 UErrorCode *status) { 511 return uregex_matches64( regexp2, (int64_t)startIndex, status); 512} 513 514U_CAPI UBool U_EXPORT2 515uregex_matches64(URegularExpression *regexp2, 516 int64_t startIndex, 517 UErrorCode *status) { 518 RegularExpression *regexp = (RegularExpression*)regexp2; 519 UBool result = FALSE; 520 if (validateRE(regexp, TRUE, status) == FALSE) { 521 return result; 522 } 523 if (startIndex == -1) { 524 result = regexp->fMatcher->matches(*status); 525 } else { 526 result = regexp->fMatcher->matches(startIndex, *status); 527 } 528 return result; 529} 530 531 532//------------------------------------------------------------------------------ 533// 534// uregex_lookingAt 535// 536//------------------------------------------------------------------------------ 537U_CAPI UBool U_EXPORT2 538uregex_lookingAt(URegularExpression *regexp2, 539 int32_t startIndex, 540 UErrorCode *status) { 541 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); 542} 543 544U_CAPI UBool U_EXPORT2 545uregex_lookingAt64(URegularExpression *regexp2, 546 int64_t startIndex, 547 UErrorCode *status) { 548 RegularExpression *regexp = (RegularExpression*)regexp2; 549 UBool result = FALSE; 550 if (validateRE(regexp, TRUE, status) == FALSE) { 551 return result; 552 } 553 if (startIndex == -1) { 554 result = regexp->fMatcher->lookingAt(*status); 555 } else { 556 result = regexp->fMatcher->lookingAt(startIndex, *status); 557 } 558 return result; 559} 560 561 562 563//------------------------------------------------------------------------------ 564// 565// uregex_find 566// 567//------------------------------------------------------------------------------ 568U_CAPI UBool U_EXPORT2 569uregex_find(URegularExpression *regexp2, 570 int32_t startIndex, 571 UErrorCode *status) { 572 return uregex_find64( regexp2, (int64_t)startIndex, status); 573} 574 575U_CAPI UBool U_EXPORT2 576uregex_find64(URegularExpression *regexp2, 577 int64_t startIndex, 578 UErrorCode *status) { 579 RegularExpression *regexp = (RegularExpression*)regexp2; 580 UBool result = FALSE; 581 if (validateRE(regexp, TRUE, status) == FALSE) { 582 return result; 583 } 584 if (startIndex == -1) { 585 regexp->fMatcher->resetPreserveRegion(); 586 result = regexp->fMatcher->find(); 587 } else { 588 result = regexp->fMatcher->find(startIndex, *status); 589 } 590 return result; 591} 592 593 594//------------------------------------------------------------------------------ 595// 596// uregex_findNext 597// 598//------------------------------------------------------------------------------ 599U_CAPI UBool U_EXPORT2 600uregex_findNext(URegularExpression *regexp2, 601 UErrorCode *status) { 602 RegularExpression *regexp = (RegularExpression*)regexp2; 603 if (validateRE(regexp, TRUE, status) == FALSE) { 604 return FALSE; 605 } 606 UBool result = regexp->fMatcher->find(); 607 return result; 608} 609 610//------------------------------------------------------------------------------ 611// 612// uregex_groupCount 613// 614//------------------------------------------------------------------------------ 615U_CAPI int32_t U_EXPORT2 616uregex_groupCount(URegularExpression *regexp2, 617 UErrorCode *status) { 618 RegularExpression *regexp = (RegularExpression*)regexp2; 619 if (validateRE(regexp, FALSE, status) == FALSE) { 620 return 0; 621 } 622 int32_t result = regexp->fMatcher->groupCount(); 623 return result; 624} 625 626 627//------------------------------------------------------------------------------ 628// 629// uregex_group 630// 631//------------------------------------------------------------------------------ 632U_CAPI int32_t U_EXPORT2 633uregex_group(URegularExpression *regexp2, 634 int32_t groupNum, 635 UChar *dest, 636 int32_t destCapacity, 637 UErrorCode *status) { 638 RegularExpression *regexp = (RegularExpression*)regexp2; 639 if (validateRE(regexp, TRUE, status) == FALSE) { 640 return 0; 641 } 642 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 643 *status = U_ILLEGAL_ARGUMENT_ERROR; 644 return 0; 645 } 646 647 if (destCapacity == 0 || regexp->fText != NULL) { 648 // If preflighting or if we already have the text as UChars, 649 // this is a little cheaper than going through uregex_groupUTextDeep() 650 651 // 652 // Pick up the range of characters from the matcher 653 // 654 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 655 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 656 if (U_FAILURE(*status)) { 657 return 0; 658 } 659 660 // 661 // Trim length based on buffer capacity 662 // 663 int32_t fullLength = endIx - startIx; 664 int32_t copyLength = fullLength; 665 if (copyLength < destCapacity) { 666 dest[copyLength] = 0; 667 } else if (copyLength == destCapacity) { 668 *status = U_STRING_NOT_TERMINATED_WARNING; 669 } else { 670 copyLength = destCapacity; 671 *status = U_BUFFER_OVERFLOW_ERROR; 672 } 673 674 // 675 // Copy capture group to user's buffer 676 // 677 if (copyLength > 0) { 678 u_memcpy(dest, ®exp->fText[startIx], copyLength); 679 } 680 return fullLength; 681 } else { 682 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); 683 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); 684 utext_close(groupText); 685 return result; 686 } 687} 688 689 690//------------------------------------------------------------------------------ 691// 692// uregex_groupUText 693// 694//------------------------------------------------------------------------------ 695U_CAPI UText * U_EXPORT2 696uregex_groupUText(URegularExpression *regexp2, 697 int32_t groupNum, 698 UText *dest, 699 int64_t *groupLength, 700 UErrorCode *status) { 701 RegularExpression *regexp = (RegularExpression*)regexp2; 702 if (validateRE(regexp, TRUE, status) == FALSE) { 703 UErrorCode emptyTextStatus = U_ZERO_ERROR; 704 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 705 } 706 707 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); 708} 709 710//------------------------------------------------------------------------------ 711// 712// uregex_groupUTextDeep 713// 714//------------------------------------------------------------------------------ 715U_CAPI UText * U_EXPORT2 716uregex_groupUTextDeep(URegularExpression *regexp2, 717 int32_t groupNum, 718 UText *dest, 719 UErrorCode *status) { 720 RegularExpression *regexp = (RegularExpression*)regexp2; 721 if (validateRE(regexp, TRUE, status) == FALSE) { 722 UErrorCode emptyTextStatus = U_ZERO_ERROR; 723 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 724 } 725 726 if (regexp->fText != NULL) { 727 // 728 // Pick up the range of characters from the matcher 729 // and use our already-extracted characters 730 // 731 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 732 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 733 if (U_FAILURE(*status)) { 734 UErrorCode emptyTextStatus = U_ZERO_ERROR; 735 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 736 } 737 738 if (dest) { 739 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); 740 } else { 741 UText groupText = UTEXT_INITIALIZER; 742 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); 743 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); 744 utext_close(&groupText); 745 } 746 747 return dest; 748 } else { 749 return regexp->fMatcher->group(groupNum, dest, *status); 750 } 751} 752 753//------------------------------------------------------------------------------ 754// 755// uregex_start 756// 757//------------------------------------------------------------------------------ 758U_CAPI int32_t U_EXPORT2 759uregex_start(URegularExpression *regexp2, 760 int32_t groupNum, 761 UErrorCode *status) { 762 return (int32_t)uregex_start64( regexp2, groupNum, status); 763} 764 765U_CAPI int64_t U_EXPORT2 766uregex_start64(URegularExpression *regexp2, 767 int32_t groupNum, 768 UErrorCode *status) { 769 RegularExpression *regexp = (RegularExpression*)regexp2; 770 if (validateRE(regexp, TRUE, status) == FALSE) { 771 return 0; 772 } 773 int32_t result = regexp->fMatcher->start(groupNum, *status); 774 return result; 775} 776 777//------------------------------------------------------------------------------ 778// 779// uregex_end 780// 781//------------------------------------------------------------------------------ 782U_CAPI int32_t U_EXPORT2 783uregex_end(URegularExpression *regexp2, 784 int32_t groupNum, 785 UErrorCode *status) { 786 return (int32_t)uregex_end64( regexp2, groupNum, status); 787} 788 789U_CAPI int64_t U_EXPORT2 790uregex_end64(URegularExpression *regexp2, 791 int32_t groupNum, 792 UErrorCode *status) { 793 RegularExpression *regexp = (RegularExpression*)regexp2; 794 if (validateRE(regexp, TRUE, status) == FALSE) { 795 return 0; 796 } 797 int32_t result = regexp->fMatcher->end(groupNum, *status); 798 return result; 799} 800 801//------------------------------------------------------------------------------ 802// 803// uregex_reset 804// 805//------------------------------------------------------------------------------ 806U_CAPI void U_EXPORT2 807uregex_reset(URegularExpression *regexp2, 808 int32_t index, 809 UErrorCode *status) { 810 uregex_reset64( regexp2, (int64_t)index, status); 811} 812 813U_CAPI void U_EXPORT2 814uregex_reset64(URegularExpression *regexp2, 815 int64_t index, 816 UErrorCode *status) { 817 RegularExpression *regexp = (RegularExpression*)regexp2; 818 if (validateRE(regexp, TRUE, status) == FALSE) { 819 return; 820 } 821 regexp->fMatcher->reset(index, *status); 822} 823 824 825//------------------------------------------------------------------------------ 826// 827// uregex_setRegion 828// 829//------------------------------------------------------------------------------ 830U_CAPI void U_EXPORT2 831uregex_setRegion(URegularExpression *regexp2, 832 int32_t regionStart, 833 int32_t regionLimit, 834 UErrorCode *status) { 835 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); 836} 837 838U_CAPI void U_EXPORT2 839uregex_setRegion64(URegularExpression *regexp2, 840 int64_t regionStart, 841 int64_t regionLimit, 842 UErrorCode *status) { 843 RegularExpression *regexp = (RegularExpression*)regexp2; 844 if (validateRE(regexp, TRUE, status) == FALSE) { 845 return; 846 } 847 regexp->fMatcher->region(regionStart, regionLimit, *status); 848} 849 850 851//------------------------------------------------------------------------------ 852// 853// uregex_setRegionAndStart 854// 855//------------------------------------------------------------------------------ 856U_DRAFT void U_EXPORT2 857uregex_setRegionAndStart(URegularExpression *regexp2, 858 int64_t regionStart, 859 int64_t regionLimit, 860 int64_t startIndex, 861 UErrorCode *status) { 862 RegularExpression *regexp = (RegularExpression*)regexp2; 863 if (validateRE(regexp, TRUE, status) == FALSE) { 864 return; 865 } 866 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); 867} 868 869//------------------------------------------------------------------------------ 870// 871// uregex_regionStart 872// 873//------------------------------------------------------------------------------ 874U_CAPI int32_t U_EXPORT2 875uregex_regionStart(const URegularExpression *regexp2, 876 UErrorCode *status) { 877 return (int32_t)uregex_regionStart64(regexp2, status); 878} 879 880U_CAPI int64_t U_EXPORT2 881uregex_regionStart64(const URegularExpression *regexp2, 882 UErrorCode *status) { 883 RegularExpression *regexp = (RegularExpression*)regexp2; 884 if (validateRE(regexp, TRUE, status) == FALSE) { 885 return 0; 886 } 887 return regexp->fMatcher->regionStart(); 888} 889 890 891//------------------------------------------------------------------------------ 892// 893// uregex_regionEnd 894// 895//------------------------------------------------------------------------------ 896U_CAPI int32_t U_EXPORT2 897uregex_regionEnd(const URegularExpression *regexp2, 898 UErrorCode *status) { 899 return (int32_t)uregex_regionEnd64(regexp2, status); 900} 901 902U_CAPI int64_t U_EXPORT2 903uregex_regionEnd64(const URegularExpression *regexp2, 904 UErrorCode *status) { 905 RegularExpression *regexp = (RegularExpression*)regexp2; 906 if (validateRE(regexp, TRUE, status) == FALSE) { 907 return 0; 908 } 909 return regexp->fMatcher->regionEnd(); 910} 911 912 913//------------------------------------------------------------------------------ 914// 915// uregex_hasTransparentBounds 916// 917//------------------------------------------------------------------------------ 918U_CAPI UBool U_EXPORT2 919uregex_hasTransparentBounds(const URegularExpression *regexp2, 920 UErrorCode *status) { 921 RegularExpression *regexp = (RegularExpression*)regexp2; 922 if (validateRE(regexp, FALSE, status) == FALSE) { 923 return FALSE; 924 } 925 return regexp->fMatcher->hasTransparentBounds(); 926} 927 928 929//------------------------------------------------------------------------------ 930// 931// uregex_useTransparentBounds 932// 933//------------------------------------------------------------------------------ 934U_CAPI void U_EXPORT2 935uregex_useTransparentBounds(URegularExpression *regexp2, 936 UBool b, 937 UErrorCode *status) { 938 RegularExpression *regexp = (RegularExpression*)regexp2; 939 if (validateRE(regexp, FALSE, status) == FALSE) { 940 return; 941 } 942 regexp->fMatcher->useTransparentBounds(b); 943} 944 945 946//------------------------------------------------------------------------------ 947// 948// uregex_hasAnchoringBounds 949// 950//------------------------------------------------------------------------------ 951U_CAPI UBool U_EXPORT2 952uregex_hasAnchoringBounds(const URegularExpression *regexp2, 953 UErrorCode *status) { 954 RegularExpression *regexp = (RegularExpression*)regexp2; 955 if (validateRE(regexp, FALSE, status) == FALSE) { 956 return FALSE; 957 } 958 return regexp->fMatcher->hasAnchoringBounds(); 959} 960 961 962//------------------------------------------------------------------------------ 963// 964// uregex_useAnchoringBounds 965// 966//------------------------------------------------------------------------------ 967U_CAPI void U_EXPORT2 968uregex_useAnchoringBounds(URegularExpression *regexp2, 969 UBool b, 970 UErrorCode *status) { 971 RegularExpression *regexp = (RegularExpression*)regexp2; 972 if (validateRE(regexp, FALSE, status) == FALSE) { 973 return; 974 } 975 regexp->fMatcher->useAnchoringBounds(b); 976} 977 978 979//------------------------------------------------------------------------------ 980// 981// uregex_hitEnd 982// 983//------------------------------------------------------------------------------ 984U_CAPI UBool U_EXPORT2 985uregex_hitEnd(const URegularExpression *regexp2, 986 UErrorCode *status) { 987 RegularExpression *regexp = (RegularExpression*)regexp2; 988 if (validateRE(regexp, TRUE, status) == FALSE) { 989 return FALSE; 990 } 991 return regexp->fMatcher->hitEnd(); 992} 993 994 995//------------------------------------------------------------------------------ 996// 997// uregex_requireEnd 998// 999//------------------------------------------------------------------------------ 1000U_CAPI UBool U_EXPORT2 1001uregex_requireEnd(const URegularExpression *regexp2, 1002 UErrorCode *status) { 1003 RegularExpression *regexp = (RegularExpression*)regexp2; 1004 if (validateRE(regexp, TRUE, status) == FALSE) { 1005 return FALSE; 1006 } 1007 return regexp->fMatcher->requireEnd(); 1008} 1009 1010 1011//------------------------------------------------------------------------------ 1012// 1013// uregex_setTimeLimit 1014// 1015//------------------------------------------------------------------------------ 1016U_CAPI void U_EXPORT2 1017uregex_setTimeLimit(URegularExpression *regexp2, 1018 int32_t limit, 1019 UErrorCode *status) { 1020 RegularExpression *regexp = (RegularExpression*)regexp2; 1021 if (validateRE(regexp, FALSE, status)) { 1022 regexp->fMatcher->setTimeLimit(limit, *status); 1023 } 1024} 1025 1026 1027 1028//------------------------------------------------------------------------------ 1029// 1030// uregex_getTimeLimit 1031// 1032//------------------------------------------------------------------------------ 1033U_CAPI int32_t U_EXPORT2 1034uregex_getTimeLimit(const URegularExpression *regexp2, 1035 UErrorCode *status) { 1036 int32_t retVal = 0; 1037 RegularExpression *regexp = (RegularExpression*)regexp2; 1038 if (validateRE(regexp, FALSE, status)) { 1039 retVal = regexp->fMatcher->getTimeLimit(); 1040 } 1041 return retVal; 1042} 1043 1044 1045 1046//------------------------------------------------------------------------------ 1047// 1048// uregex_setStackLimit 1049// 1050//------------------------------------------------------------------------------ 1051U_CAPI void U_EXPORT2 1052uregex_setStackLimit(URegularExpression *regexp2, 1053 int32_t limit, 1054 UErrorCode *status) { 1055 RegularExpression *regexp = (RegularExpression*)regexp2; 1056 if (validateRE(regexp, FALSE, status)) { 1057 regexp->fMatcher->setStackLimit(limit, *status); 1058 } 1059} 1060 1061 1062 1063//------------------------------------------------------------------------------ 1064// 1065// uregex_getStackLimit 1066// 1067//------------------------------------------------------------------------------ 1068U_CAPI int32_t U_EXPORT2 1069uregex_getStackLimit(const URegularExpression *regexp2, 1070 UErrorCode *status) { 1071 int32_t retVal = 0; 1072 RegularExpression *regexp = (RegularExpression*)regexp2; 1073 if (validateRE(regexp, FALSE, status)) { 1074 retVal = regexp->fMatcher->getStackLimit(); 1075 } 1076 return retVal; 1077} 1078 1079 1080//------------------------------------------------------------------------------ 1081// 1082// uregex_setMatchCallback 1083// 1084//------------------------------------------------------------------------------ 1085U_CAPI void U_EXPORT2 1086uregex_setMatchCallback(URegularExpression *regexp2, 1087 URegexMatchCallback *callback, 1088 const void *context, 1089 UErrorCode *status) { 1090 RegularExpression *regexp = (RegularExpression*)regexp2; 1091 if (validateRE(regexp, FALSE, status)) { 1092 regexp->fMatcher->setMatchCallback(callback, context, *status); 1093 } 1094} 1095 1096 1097//------------------------------------------------------------------------------ 1098// 1099// uregex_getMatchCallback 1100// 1101//------------------------------------------------------------------------------ 1102U_CAPI void U_EXPORT2 1103uregex_getMatchCallback(const URegularExpression *regexp2, 1104 URegexMatchCallback **callback, 1105 const void **context, 1106 UErrorCode *status) { 1107 RegularExpression *regexp = (RegularExpression*)regexp2; 1108 if (validateRE(regexp, FALSE, status)) { 1109 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1110 } 1111} 1112 1113 1114//------------------------------------------------------------------------------ 1115// 1116// uregex_setMatchProgressCallback 1117// 1118//------------------------------------------------------------------------------ 1119U_CAPI void U_EXPORT2 1120uregex_setFindProgressCallback(URegularExpression *regexp2, 1121 URegexFindProgressCallback *callback, 1122 const void *context, 1123 UErrorCode *status) { 1124 RegularExpression *regexp = (RegularExpression*)regexp2; 1125 if (validateRE(regexp, FALSE, status)) { 1126 regexp->fMatcher->setFindProgressCallback(callback, context, *status); 1127 } 1128} 1129 1130 1131//------------------------------------------------------------------------------ 1132// 1133// uregex_getMatchCallback 1134// 1135//------------------------------------------------------------------------------ 1136U_CAPI void U_EXPORT2 1137uregex_getFindProgressCallback(const URegularExpression *regexp2, 1138 URegexFindProgressCallback **callback, 1139 const void **context, 1140 UErrorCode *status) { 1141 RegularExpression *regexp = (RegularExpression*)regexp2; 1142 if (validateRE(regexp, FALSE, status)) { 1143 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); 1144 } 1145} 1146 1147 1148//------------------------------------------------------------------------------ 1149// 1150// uregex_replaceAll 1151// 1152//------------------------------------------------------------------------------ 1153U_CAPI int32_t U_EXPORT2 1154uregex_replaceAll(URegularExpression *regexp2, 1155 const UChar *replacementText, 1156 int32_t replacementLength, 1157 UChar *destBuf, 1158 int32_t destCapacity, 1159 UErrorCode *status) { 1160 RegularExpression *regexp = (RegularExpression*)regexp2; 1161 if (validateRE(regexp, TRUE, status) == FALSE) { 1162 return 0; 1163 } 1164 if (replacementText == NULL || replacementLength < -1 || 1165 (destBuf == NULL && destCapacity > 0) || 1166 destCapacity < 0) { 1167 *status = U_ILLEGAL_ARGUMENT_ERROR; 1168 return 0; 1169 } 1170 1171 int32_t len = 0; 1172 1173 uregex_reset(regexp2, 0, status); 1174 1175 // Note: Seperate error code variables for findNext() and appendReplacement() 1176 // are used so that destination buffer overflow errors 1177 // in appendReplacement won't stop findNext() from working. 1178 // appendReplacement() and appendTail() special case incoming buffer 1179 // overflow errors, continuing to return the correct length. 1180 UErrorCode findStatus = *status; 1181 while (uregex_findNext(regexp2, &findStatus)) { 1182 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1183 &destBuf, &destCapacity, status); 1184 } 1185 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1186 1187 if (U_FAILURE(findStatus)) { 1188 // If anything went wrong with the findNext(), make that error trump 1189 // whatever may have happened with the append() operations. 1190 // Errors in findNext() are not expected. 1191 *status = findStatus; 1192 } 1193 1194 return len; 1195} 1196 1197 1198//------------------------------------------------------------------------------ 1199// 1200// uregex_replaceAllUText 1201// 1202//------------------------------------------------------------------------------ 1203U_CAPI UText * U_EXPORT2 1204uregex_replaceAllUText(URegularExpression *regexp2, 1205 UText *replacementText, 1206 UText *dest, 1207 UErrorCode *status) { 1208 RegularExpression *regexp = (RegularExpression*)regexp2; 1209 if (validateRE(regexp, TRUE, status) == FALSE) { 1210 return 0; 1211 } 1212 if (replacementText == NULL) { 1213 *status = U_ILLEGAL_ARGUMENT_ERROR; 1214 return 0; 1215 } 1216 1217 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1218 return dest; 1219} 1220 1221 1222//------------------------------------------------------------------------------ 1223// 1224// uregex_replaceFirst 1225// 1226//------------------------------------------------------------------------------ 1227U_CAPI int32_t U_EXPORT2 1228uregex_replaceFirst(URegularExpression *regexp2, 1229 const UChar *replacementText, 1230 int32_t replacementLength, 1231 UChar *destBuf, 1232 int32_t destCapacity, 1233 UErrorCode *status) { 1234 RegularExpression *regexp = (RegularExpression*)regexp2; 1235 if (validateRE(regexp, TRUE, status) == FALSE) { 1236 return 0; 1237 } 1238 if (replacementText == NULL || replacementLength < -1 || 1239 (destBuf == NULL && destCapacity > 0) || 1240 destCapacity < 0) { 1241 *status = U_ILLEGAL_ARGUMENT_ERROR; 1242 return 0; 1243 } 1244 1245 int32_t len = 0; 1246 UBool findSucceeded; 1247 uregex_reset(regexp2, 0, status); 1248 findSucceeded = uregex_find(regexp2, 0, status); 1249 if (findSucceeded) { 1250 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1251 &destBuf, &destCapacity, status); 1252 } 1253 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1254 1255 return len; 1256} 1257 1258 1259//------------------------------------------------------------------------------ 1260// 1261// uregex_replaceFirstUText 1262// 1263//------------------------------------------------------------------------------ 1264U_CAPI UText * U_EXPORT2 1265uregex_replaceFirstUText(URegularExpression *regexp2, 1266 UText *replacementText, 1267 UText *dest, 1268 UErrorCode *status) { 1269 RegularExpression *regexp = (RegularExpression*)regexp2; 1270 if (validateRE(regexp, TRUE, status) == FALSE) { 1271 return 0; 1272 } 1273 if (replacementText == NULL) { 1274 *status = U_ILLEGAL_ARGUMENT_ERROR; 1275 return 0; 1276 } 1277 1278 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); 1279 return dest; 1280} 1281 1282 1283//------------------------------------------------------------------------------ 1284// 1285// uregex_appendReplacement 1286// 1287//------------------------------------------------------------------------------ 1288 1289U_NAMESPACE_BEGIN 1290// 1291// Dummy class, because these functions need to be friends of class RegexMatcher, 1292// and stand-alone C functions don't work as friends 1293// 1294class RegexCImpl { 1295 public: 1296 inline static int32_t appendReplacement(RegularExpression *regexp, 1297 const UChar *replacementText, 1298 int32_t replacementLength, 1299 UChar **destBuf, 1300 int32_t *destCapacity, 1301 UErrorCode *status); 1302 1303 inline static int32_t appendTail(RegularExpression *regexp, 1304 UChar **destBuf, 1305 int32_t *destCapacity, 1306 UErrorCode *status); 1307 1308 inline static int32_t split(RegularExpression *regexp, 1309 UChar *destBuf, 1310 int32_t destCapacity, 1311 int32_t *requiredCapacity, 1312 UChar *destFields[], 1313 int32_t destFieldsCapacity, 1314 UErrorCode *status); 1315}; 1316 1317U_NAMESPACE_END 1318 1319 1320 1321static const UChar BACKSLASH = 0x5c; 1322static const UChar DOLLARSIGN = 0x24; 1323 1324// 1325// Move a character to an output buffer, with bounds checking on the index. 1326// Index advances even if capacity is exceeded, for preflight size computations. 1327// This little sequence is used a LOT. 1328// 1329static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1330 if (*idx < bufCapacity) { 1331 buf[*idx] = c; 1332 } 1333 (*idx)++; 1334} 1335 1336 1337// 1338// appendReplacement, the actual implementation. 1339// 1340int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1341 const UChar *replacementText, 1342 int32_t replacementLength, 1343 UChar **destBuf, 1344 int32_t *destCapacity, 1345 UErrorCode *status) { 1346 1347 // If we come in with a buffer overflow error, don't suppress the operation. 1348 // A series of appendReplacements, appendTail need to correctly preflight 1349 // the buffer size when an overflow happens somewhere in the middle. 1350 UBool pendingBufferOverflow = FALSE; 1351 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1352 pendingBufferOverflow = TRUE; 1353 *status = U_ZERO_ERROR; 1354 } 1355 1356 // 1357 // Validate all paramters 1358 // 1359 if (validateRE(regexp, TRUE, status) == FALSE) { 1360 return 0; 1361 } 1362 if (replacementText == NULL || replacementLength < -1 || 1363 destCapacity == NULL || destBuf == NULL || 1364 (*destBuf == NULL && *destCapacity > 0) || 1365 *destCapacity < 0) { 1366 *status = U_ILLEGAL_ARGUMENT_ERROR; 1367 return 0; 1368 } 1369 1370 RegexMatcher *m = regexp->fMatcher; 1371 if (m->fMatch == FALSE) { 1372 *status = U_REGEX_INVALID_STATE; 1373 return 0; 1374 } 1375 1376 UChar *dest = *destBuf; 1377 int32_t capacity = *destCapacity; 1378 int32_t destIdx = 0; 1379 int32_t i; 1380 1381 // If it wasn't supplied by the caller, get the length of the replacement text. 1382 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1383 // the fly and avoid this step. 1384 if (replacementLength == -1) { 1385 replacementLength = u_strlen(replacementText); 1386 } 1387 1388 // Copy input string from the end of previous match to start of current match 1389 if (regexp->fText != NULL) { 1390 int32_t matchStart; 1391 int32_t lastMatchEnd; 1392 if (UTEXT_USES_U16(m->fInputText)) { 1393 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1394 matchStart = (int32_t)m->fMatchStart; 1395 } else { 1396 // !!!: Would like a better way to do this! 1397 UErrorCode status = U_ZERO_ERROR; 1398 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); 1399 status = U_ZERO_ERROR; 1400 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); 1401 } 1402 for (i=lastMatchEnd; i<matchStart; i++) { 1403 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1404 } 1405 } else { 1406 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1407 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1408 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); 1409 } 1410 1411 1412 // scan the replacement text, looking for substitutions ($n) and \escapes. 1413 int32_t replIdx = 0; 1414 while (replIdx < replacementLength) { 1415 UChar c = replacementText[replIdx]; 1416 replIdx++; 1417 if (c != DOLLARSIGN && c != BACKSLASH) { 1418 // Common case, no substitution, no escaping, 1419 // just copy the char to the dest buf. 1420 appendToBuf(c, &destIdx, dest, capacity); 1421 continue; 1422 } 1423 1424 if (c == BACKSLASH) { 1425 // Backslash Escape. Copy the following char out without further checks. 1426 // Note: Surrogate pairs don't need any special handling 1427 // The second half wont be a '$' or a '\', and 1428 // will move to the dest normally on the next 1429 // loop iteration. 1430 if (replIdx >= replacementLength) { 1431 break; 1432 } 1433 c = replacementText[replIdx]; 1434 1435 if (c==0x55/*U*/ || c==0x75/*u*/) { 1436 // We have a \udddd or \Udddddddd escape sequence. 1437 UChar32 escapedChar = 1438 u_unescapeAt(uregex_ucstr_unescape_charAt, 1439 &replIdx, // Index is updated by unescapeAt 1440 replacementLength, // Length of replacement text 1441 (void *)replacementText); 1442 1443 if (escapedChar != (UChar32)0xFFFFFFFF) { 1444 if (escapedChar <= 0xffff) { 1445 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1446 } else { 1447 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1448 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1449 } 1450 continue; 1451 } 1452 // Note: if the \u escape was invalid, just fall through and 1453 // treat it as a plain \<anything> escape. 1454 } 1455 1456 // Plain backslash escape. Just put out the escaped character. 1457 appendToBuf(c, &destIdx, dest, capacity); 1458 1459 replIdx++; 1460 continue; 1461 } 1462 1463 1464 1465 // We've got a $. Pick up a capture group number if one follows. 1466 // Consume at most the number of digits necessary for the largest capture 1467 // number that is valid for this pattern. 1468 1469 int32_t numDigits = 0; 1470 int32_t groupNum = 0; 1471 UChar32 digitC; 1472 for (;;) { 1473 if (replIdx >= replacementLength) { 1474 break; 1475 } 1476 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); 1477 if (u_isdigit(digitC) == FALSE) { 1478 break; 1479 } 1480 1481 U16_FWD_1(replacementText, replIdx, replacementLength); 1482 groupNum=groupNum*10 + u_charDigitValue(digitC); 1483 numDigits++; 1484 if (numDigits >= m->fPattern->fMaxCaptureDigits) { 1485 break; 1486 } 1487 } 1488 1489 1490 if (numDigits == 0) { 1491 // The $ didn't introduce a group number at all. 1492 // Treat it as just part of the substitution text. 1493 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); 1494 continue; 1495 } 1496 1497 // Finally, append the capture group data to the destination. 1498 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1499 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1500 // Ignore buffer overflow when extracting the group. We need to 1501 // continue on to get full size of the untruncated result. We will 1502 // raise our own buffer overflow error at the end. 1503 *status = U_ZERO_ERROR; 1504 } 1505 1506 if (U_FAILURE(*status)) { 1507 // Can fail if group number is out of range. 1508 break; 1509 } 1510 1511 } 1512 1513 // 1514 // Nul Terminate the dest buffer if possible. 1515 // Set the appropriate buffer overflow or not terminated error, if needed. 1516 // 1517 if (destIdx < capacity) { 1518 dest[destIdx] = 0; 1519 } else if (destIdx == *destCapacity) { 1520 *status = U_STRING_NOT_TERMINATED_WARNING; 1521 } else { 1522 *status = U_BUFFER_OVERFLOW_ERROR; 1523 } 1524 1525 // 1526 // Return an updated dest buffer and capacity to the caller. 1527 // 1528 if (destIdx > 0 && *destCapacity > 0) { 1529 if (destIdx < capacity) { 1530 *destBuf += destIdx; 1531 *destCapacity -= destIdx; 1532 } else { 1533 *destBuf += capacity; 1534 *destCapacity = 0; 1535 } 1536 } 1537 1538 // If we came in with a buffer overflow, make sure we go out with one also. 1539 // (A zero length match right at the end of the previous match could 1540 // make this function succeed even though a previous call had overflowed the buf) 1541 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1542 *status = U_BUFFER_OVERFLOW_ERROR; 1543 } 1544 1545 return destIdx; 1546} 1547 1548// 1549// appendReplacement the actual API function, 1550// 1551U_CAPI int32_t U_EXPORT2 1552uregex_appendReplacement(URegularExpression *regexp2, 1553 const UChar *replacementText, 1554 int32_t replacementLength, 1555 UChar **destBuf, 1556 int32_t *destCapacity, 1557 UErrorCode *status) { 1558 1559 RegularExpression *regexp = (RegularExpression*)regexp2; 1560 return RegexCImpl::appendReplacement( 1561 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1562} 1563 1564// 1565// uregex_appendReplacementUText...can just use the normal C++ method 1566// 1567U_CAPI void U_EXPORT2 1568uregex_appendReplacementUText(URegularExpression *regexp2, 1569 UText *replText, 1570 UText *dest, 1571 UErrorCode *status) { 1572 RegularExpression *regexp = (RegularExpression*)regexp2; 1573 regexp->fMatcher->appendReplacement(dest, replText, *status); 1574} 1575 1576 1577//------------------------------------------------------------------------------ 1578// 1579// uregex_appendTail 1580// 1581//------------------------------------------------------------------------------ 1582int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1583 UChar **destBuf, 1584 int32_t *destCapacity, 1585 UErrorCode *status) 1586{ 1587 1588 // If we come in with a buffer overflow error, don't suppress the operation. 1589 // A series of appendReplacements, appendTail need to correctly preflight 1590 // the buffer size when an overflow happens somewhere in the middle. 1591 UBool pendingBufferOverflow = FALSE; 1592 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1593 pendingBufferOverflow = TRUE; 1594 *status = U_ZERO_ERROR; 1595 } 1596 1597 if (validateRE(regexp, TRUE, status) == FALSE) { 1598 return 0; 1599 } 1600 1601 if (destCapacity == NULL || destBuf == NULL || 1602 (*destBuf == NULL && *destCapacity > 0) || 1603 *destCapacity < 0) 1604 { 1605 *status = U_ILLEGAL_ARGUMENT_ERROR; 1606 return 0; 1607 } 1608 1609 RegexMatcher *m = regexp->fMatcher; 1610 1611 int32_t destIdx = 0; 1612 int32_t destCap = *destCapacity; 1613 UChar *dest = *destBuf; 1614 1615 if (regexp->fText != NULL) { 1616 int32_t srcIdx; 1617 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1618 if (nativeIdx == -1) { 1619 srcIdx = 0; 1620 } else if (UTEXT_USES_U16(m->fInputText)) { 1621 srcIdx = (int32_t)nativeIdx; 1622 } else { 1623 UErrorCode status = U_ZERO_ERROR; 1624 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1625 } 1626 1627 for (;;) { 1628 if (srcIdx == regexp->fTextLength) { 1629 break; 1630 } 1631 UChar c = regexp->fText[srcIdx]; 1632 if (c == 0 && regexp->fTextLength == -1) { 1633 regexp->fTextLength = srcIdx; 1634 break; 1635 } 1636 if (destIdx < destCap) { 1637 dest[destIdx] = c; 1638 } else { 1639 // We've overflowed the dest buffer. 1640 // If the total input string length is known, we can 1641 // compute the total buffer size needed without scanning through the string. 1642 if (regexp->fTextLength > 0) { 1643 destIdx += (regexp->fTextLength - srcIdx); 1644 break; 1645 } 1646 } 1647 srcIdx++; 1648 destIdx++; 1649 } 1650 } else { 1651 int64_t srcIdx; 1652 if (m->fMatch) { 1653 // The most recent call to find() succeeded. 1654 srcIdx = m->fMatchEnd; 1655 } else { 1656 // The last call to find() on this matcher failed(). 1657 // Look back to the end of the last find() that succeeded for src index. 1658 srcIdx = m->fLastMatchEnd; 1659 if (srcIdx == -1) { 1660 // There has been no successful match with this matcher. 1661 // We want to copy the whole string. 1662 srcIdx = 0; 1663 } 1664 } 1665 1666 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1667 } 1668 1669 // 1670 // NUL terminate the output string, if possible, otherwise issue the 1671 // appropriate error or warning. 1672 // 1673 if (destIdx < destCap) { 1674 dest[destIdx] = 0; 1675 } else if (destIdx == destCap) { 1676 *status = U_STRING_NOT_TERMINATED_WARNING; 1677 } else { 1678 *status = U_BUFFER_OVERFLOW_ERROR; 1679 } 1680 1681 // 1682 // Update the user's buffer ptr and capacity vars to reflect the 1683 // amount used. 1684 // 1685 if (destIdx < destCap) { 1686 *destBuf += destIdx; 1687 *destCapacity -= destIdx; 1688 } else { 1689 *destBuf += destCap; 1690 *destCapacity = 0; 1691 } 1692 1693 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1694 *status = U_BUFFER_OVERFLOW_ERROR; 1695 } 1696 1697 return destIdx; 1698} 1699 1700 1701// 1702// appendTail the actual API function 1703// 1704U_CAPI int32_t U_EXPORT2 1705uregex_appendTail(URegularExpression *regexp2, 1706 UChar **destBuf, 1707 int32_t *destCapacity, 1708 UErrorCode *status) { 1709 RegularExpression *regexp = (RegularExpression*)regexp2; 1710 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1711} 1712 1713 1714// 1715// uregex_appendTailUText...can just use the normal C++ method 1716// 1717U_CAPI UText * U_EXPORT2 1718uregex_appendTailUText(URegularExpression *regexp2, 1719 UText *dest, 1720 UErrorCode *status) { 1721 RegularExpression *regexp = (RegularExpression*)regexp2; 1722 return regexp->fMatcher->appendTail(dest, *status); 1723} 1724 1725 1726//------------------------------------------------------------------------------ 1727// 1728// copyString Internal utility to copy a string to an output buffer, 1729// while managing buffer overflow and preflight size 1730// computation. NUL termination is added to destination, 1731// and the NUL is counted in the output size. 1732// 1733//------------------------------------------------------------------------------ 1734#if 0 1735static void copyString(UChar *destBuffer, // Destination buffer. 1736 int32_t destCapacity, // Total capacity of dest buffer 1737 int32_t *destIndex, // Index into dest buffer. Updated on return. 1738 // Update not clipped to destCapacity. 1739 const UChar *srcPtr, // Pointer to source string 1740 int32_t srcLen) // Source string len. 1741{ 1742 int32_t si; 1743 int32_t di = *destIndex; 1744 UChar c; 1745 1746 for (si=0; si<srcLen; si++) { 1747 c = srcPtr[si]; 1748 if (di < destCapacity) { 1749 destBuffer[di] = c; 1750 di++; 1751 } else { 1752 di += srcLen - si; 1753 break; 1754 } 1755 } 1756 if (di<destCapacity) { 1757 destBuffer[di] = 0; 1758 } 1759 di++; 1760 *destIndex = di; 1761} 1762#endif 1763 1764//------------------------------------------------------------------------------ 1765// 1766// uregex_split 1767// 1768//------------------------------------------------------------------------------ 1769int32_t RegexCImpl::split(RegularExpression *regexp, 1770 UChar *destBuf, 1771 int32_t destCapacity, 1772 int32_t *requiredCapacity, 1773 UChar *destFields[], 1774 int32_t destFieldsCapacity, 1775 UErrorCode *status) { 1776 // 1777 // Reset for the input text 1778 // 1779 regexp->fMatcher->reset(); 1780 UText *inputText = regexp->fMatcher->fInputText; 1781 int64_t nextOutputStringStart = 0; 1782 int64_t inputLen = regexp->fMatcher->fInputLength; 1783 if (inputLen == 0) { 1784 return 0; 1785 } 1786 1787 // 1788 // Loop through the input text, searching for the delimiter pattern 1789 // 1790 int32_t i; // Index of the field being processed. 1791 int32_t destIdx = 0; // Next available position in destBuf; 1792 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1793 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1794 for (i=0; ; i++) { 1795 if (i>=destFieldsCapacity-1) { 1796 // There are one or zero output strings left. 1797 // Fill the last output string with whatever is left from the input, then exit the loop. 1798 // ( i will be == destFieldsCapacity if we filled the output array while processing 1799 // capture groups of the delimiter expression, in which case we will discard the 1800 // last capture group saved in favor of the unprocessed remainder of the 1801 // input string.) 1802 if (inputLen > nextOutputStringStart) { 1803 if (i != destFieldsCapacity-1) { 1804 // No fields are left. Recycle the last one for holding the trailing part of 1805 // the input string. 1806 i = destFieldsCapacity-1; 1807 destIdx = (int32_t)(destFields[i] - destFields[0]); 1808 } 1809 1810 destFields[i] = &destBuf[destIdx]; 1811 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1812 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1813 } 1814 break; 1815 } 1816 1817 if (regexp->fMatcher->find()) { 1818 // We found another delimiter. Move everything from where we started looking 1819 // up until the start of the delimiter into the next output string. 1820 destFields[i] = &destBuf[destIdx]; 1821 1822 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1823 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1824 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1825 tStatus = U_ZERO_ERROR; 1826 } else { 1827 *status = tStatus; 1828 } 1829 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1830 1831 // If the delimiter pattern has capturing parentheses, the captured 1832 // text goes out into the next n destination strings. 1833 int32_t groupNum; 1834 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1835 // If we've run out of output string slots, bail out. 1836 if (i==destFieldsCapacity-1) { 1837 break; 1838 } 1839 i++; 1840 1841 // Set up to extract the capture group contents into the dest buffer. 1842 destFields[i] = &destBuf[destIdx]; 1843 tStatus = U_ZERO_ERROR; 1844 int32_t t = uregex_group((URegularExpression*)regexp, 1845 groupNum, 1846 destFields[i], 1847 REMAINING_CAPACITY(destIdx, destCapacity), 1848 &tStatus); 1849 destIdx += t + 1; // Record the space used in the output string buffer. 1850 // +1 for the NUL that terminates the string. 1851 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1852 tStatus = U_ZERO_ERROR; 1853 } else { 1854 *status = tStatus; 1855 } 1856 } 1857 1858 if (nextOutputStringStart == inputLen) { 1859 // The delimiter was at the end of the string. 1860 // Output an empty string, and then we are done. 1861 if (destIdx < destCapacity) { 1862 destBuf[destIdx] = 0; 1863 } 1864 if (i < destFieldsCapacity-1) { 1865 ++i; 1866 } 1867 if (destIdx < destCapacity) { 1868 destFields[i] = destBuf + destIdx; 1869 } 1870 ++destIdx; 1871 break; 1872 } 1873 1874 } 1875 else 1876 { 1877 // We ran off the end of the input while looking for the next delimiter. 1878 // All the remaining text goes into the current output string. 1879 destFields[i] = &destBuf[destIdx]; 1880 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1881 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1882 break; 1883 } 1884 } 1885 1886 // Zero out any unused portion of the destFields array 1887 int j; 1888 for (j=i+1; j<destFieldsCapacity; j++) { 1889 destFields[j] = NULL; 1890 } 1891 1892 if (requiredCapacity != NULL) { 1893 *requiredCapacity = destIdx; 1894 } 1895 if (destIdx > destCapacity) { 1896 *status = U_BUFFER_OVERFLOW_ERROR; 1897 } 1898 return i+1; 1899} 1900 1901// 1902// uregex_split The actual API function 1903// 1904U_CAPI int32_t U_EXPORT2 1905uregex_split(URegularExpression *regexp2, 1906 UChar *destBuf, 1907 int32_t destCapacity, 1908 int32_t *requiredCapacity, 1909 UChar *destFields[], 1910 int32_t destFieldsCapacity, 1911 UErrorCode *status) { 1912 RegularExpression *regexp = (RegularExpression*)regexp2; 1913 if (validateRE(regexp, TRUE, status) == FALSE) { 1914 return 0; 1915 } 1916 if ((destBuf == NULL && destCapacity > 0) || 1917 destCapacity < 0 || 1918 destFields == NULL || 1919 destFieldsCapacity < 1 ) { 1920 *status = U_ILLEGAL_ARGUMENT_ERROR; 1921 return 0; 1922 } 1923 1924 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1925} 1926 1927 1928// 1929// uregex_splitUText...can just use the normal C++ method 1930// 1931U_CAPI int32_t U_EXPORT2 1932uregex_splitUText(URegularExpression *regexp2, 1933 UText *destFields[], 1934 int32_t destFieldsCapacity, 1935 UErrorCode *status) { 1936 RegularExpression *regexp = (RegularExpression*)regexp2; 1937 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1938} 1939 1940 1941#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1942 1943