1/* 2******************************************************************************* 3* Copyright (C) 2004-2009, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: regex.cpp 7*/ 8 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13#include "unicode/regex.h" 14#include "unicode/uregex.h" 15#include "unicode/unistr.h" 16#include "unicode/ustring.h" 17#include "unicode/uchar.h" 18#include "unicode/uobject.h" 19#include "umutex.h" 20#include "uassert.h" 21#include "cmemory.h" 22 23U_NAMESPACE_USE 24 25struct URegularExpression: public UMemory { 26public: 27 URegularExpression(); 28 ~URegularExpression(); 29 int32_t fMagic; 30 RegexPattern *fPat; 31 int32_t *fPatRefCount; 32 UChar *fPatString; 33 int32_t fPatStringLen; 34 RegexMatcher *fMatcher; 35 const UChar *fText; // Text from setText() 36 int32_t fTextLength; // Length provided by user with setText(), which 37 // may be -1. 38 39 UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString. 40 // TODO: regexp engine should not depend on UnicodeString. 41}; 42 43static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 44 45URegularExpression::URegularExpression() { 46 fMagic = REXP_MAGIC; 47 fPat = NULL; 48 fPatRefCount = NULL; 49 fPatString = NULL; 50 fPatStringLen = 0; 51 fMatcher = NULL; 52 fText = NULL; 53 fTextLength = 0; 54} 55 56URegularExpression::~URegularExpression() { 57 delete fMatcher; 58 fMatcher = NULL; 59 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 60 delete fPat; 61 uprv_free(fPatString); 62 uprv_free(fPatRefCount); 63 } 64 fMagic = 0; 65} 66 67//---------------------------------------------------------------------------------------- 68// 69// validateRE Do boilerplate style checks on API function parameters. 70// Return TRUE if they look OK. 71//---------------------------------------------------------------------------------------- 72static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { 73 if (U_FAILURE(*status)) { 74 return FALSE; 75 } 76 if (re == NULL || re->fMagic != REXP_MAGIC) { 77 *status = U_ILLEGAL_ARGUMENT_ERROR; 78 return FALSE; 79 } 80 if (requiresText && re->fText == NULL) { 81 *status = U_REGEX_INVALID_STATE; 82 return FALSE; 83 } 84 return TRUE; 85} 86 87//---------------------------------------------------------------------------------------- 88// 89// uregex_open 90// 91//---------------------------------------------------------------------------------------- 92U_CAPI URegularExpression * U_EXPORT2 93uregex_open( const UChar *pattern, 94 int32_t patternLength, 95 uint32_t flags, 96 UParseError *pe, 97 UErrorCode *status) { 98 99 if (U_FAILURE(*status)) { 100 return NULL; 101 } 102 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 103 *status = U_ILLEGAL_ARGUMENT_ERROR; 104 return NULL; 105 } 106 int32_t actualPatLen = patternLength; 107 if (actualPatLen == -1) { 108 actualPatLen = u_strlen(pattern); 109 } 110 111 URegularExpression *re = new URegularExpression; 112 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 113 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 114 if (re == NULL || refC == NULL || patBuf == NULL) { 115 *status = U_MEMORY_ALLOCATION_ERROR; 116 delete re; 117 uprv_free(refC); 118 uprv_free(patBuf); 119 return NULL; 120 } 121 re->fPatRefCount = refC; 122 *re->fPatRefCount = 1; 123 124 // 125 // Make a copy of the pattern string, so we can return it later if asked. 126 // For compiling the pattern, we will use a read-only-aliased UnicodeString 127 // of this local copy, to avoid making even more copies. 128 // 129 re->fPatString = patBuf; 130 re->fPatStringLen = patternLength; 131 u_memcpy(patBuf, pattern, actualPatLen); 132 patBuf[actualPatLen] = 0; 133 UnicodeString patString(patternLength==-1, patBuf, patternLength); 134 135 // 136 // Compile the pattern 137 // 138 if (pe != NULL) { 139 re->fPat = RegexPattern::compile(patString, flags, *pe, *status); 140 } else { 141 re->fPat = RegexPattern::compile(patString, flags, *status); 142 } 143 if (U_FAILURE(*status)) { 144 goto ErrorExit; 145 } 146 147 // 148 // Create the matcher object 149 // 150 re->fMatcher = re->fPat->matcher(*status); 151 if (U_SUCCESS(*status)) { 152 return re; 153 } 154 155ErrorExit: 156 delete re; 157 return NULL; 158 159} 160 161//---------------------------------------------------------------------------------------- 162// 163// uregex_close 164// 165//---------------------------------------------------------------------------------------- 166U_CAPI void U_EXPORT2 167uregex_close(URegularExpression *re) { 168 UErrorCode status = U_ZERO_ERROR; 169 if (validateRE(re, &status, FALSE) == FALSE) { 170 return; 171 } 172 delete re; 173} 174 175 176//---------------------------------------------------------------------------------------- 177// 178// uregex_clone 179// 180//---------------------------------------------------------------------------------------- 181U_CAPI URegularExpression * U_EXPORT2 182uregex_clone(const URegularExpression *source, UErrorCode *status) { 183 if (validateRE(source, status, FALSE) == FALSE) { 184 return NULL; 185 } 186 187 URegularExpression *clone = new URegularExpression; 188 if (clone == NULL) { 189 *status = U_MEMORY_ALLOCATION_ERROR; 190 return NULL; 191 } 192 193 clone->fMatcher = source->fPat->matcher(*status); 194 if (U_FAILURE(*status)) { 195 delete clone; 196 return NULL; 197 } 198 199 clone->fPat = source->fPat; 200 clone->fPatRefCount = source->fPatRefCount; 201 clone->fPatString = source->fPatString; 202 clone->fPatStringLen = source->fPatStringLen; 203 umtx_atomic_inc(source->fPatRefCount); 204 // Note: fText is not cloned. 205 206 return clone; 207} 208 209 210 211 212//------------------------------------------------------------------------------ 213// 214// uregex_pattern 215// 216//------------------------------------------------------------------------------ 217U_CAPI const UChar * U_EXPORT2 218uregex_pattern(const URegularExpression *regexp, 219 int32_t *patLength, 220 UErrorCode *status) { 221 222 if (validateRE(regexp, status, FALSE) == FALSE) { 223 return NULL; 224 } 225 if (patLength != NULL) { 226 *patLength = regexp->fPatStringLen; 227 } 228 return regexp->fPatString; 229} 230 231 232//------------------------------------------------------------------------------ 233// 234// uregex_flags 235// 236//------------------------------------------------------------------------------ 237U_CAPI int32_t U_EXPORT2 238uregex_flags(const URegularExpression *regexp, UErrorCode *status) { 239 if (validateRE(regexp, status, FALSE) == FALSE) { 240 return 0; 241 } 242 int32_t flags = regexp->fPat->flags(); 243 return flags; 244} 245 246 247//------------------------------------------------------------------------------ 248// 249// uregex_setText 250// 251//------------------------------------------------------------------------------ 252U_CAPI void U_EXPORT2 253uregex_setText(URegularExpression *regexp, 254 const UChar *text, 255 int32_t textLength, 256 UErrorCode *status) { 257 if (validateRE(regexp, status, FALSE) == FALSE) { 258 return; 259 } 260 if (text == NULL || textLength < -1) { 261 *status = U_ILLEGAL_ARGUMENT_ERROR; 262 return; 263 } 264 regexp->fText = text; 265 regexp->fTextLength = textLength; 266 UBool isTerminated = (textLength == -1); 267 268 regexp->fTextString.setTo(isTerminated, text, textLength); 269 regexp->fMatcher->reset(regexp->fTextString); 270} 271 272 273 274//------------------------------------------------------------------------------ 275// 276// uregex_getText 277// 278//------------------------------------------------------------------------------ 279U_CAPI const UChar * U_EXPORT2 280uregex_getText(URegularExpression *regexp, 281 int32_t *textLength, 282 UErrorCode *status) { 283 if (validateRE(regexp, status, FALSE) == FALSE) { 284 return NULL; 285 } 286 if (textLength != NULL) { 287 *textLength = regexp->fTextLength; 288 } 289 return regexp->fText; 290} 291 292 293//------------------------------------------------------------------------------ 294// 295// uregex_matches 296// 297//------------------------------------------------------------------------------ 298U_CAPI UBool U_EXPORT2 299uregex_matches(URegularExpression *regexp, 300 int32_t startIndex, 301 UErrorCode *status) { 302 UBool result = FALSE; 303 if (validateRE(regexp, status) == FALSE) { 304 return result; 305 } 306 if (startIndex == -1) { 307 result = regexp->fMatcher->matches(*status); 308 } else { 309 result = regexp->fMatcher->matches(startIndex, *status); 310 } 311 return result; 312} 313 314 315 316//------------------------------------------------------------------------------ 317// 318// uregex_lookingAt 319// 320//------------------------------------------------------------------------------ 321U_CAPI UBool U_EXPORT2 322uregex_lookingAt(URegularExpression *regexp, 323 int32_t startIndex, 324 UErrorCode *status) { 325 UBool result = FALSE; 326 if (validateRE(regexp, status) == FALSE) { 327 return result; 328 } 329 if (startIndex == -1) { 330 result = regexp->fMatcher->lookingAt(*status); 331 } else { 332 result = regexp->fMatcher->lookingAt(startIndex, *status); 333 } 334 return result; 335} 336 337 338 339//------------------------------------------------------------------------------ 340// 341// uregex_find 342// 343//------------------------------------------------------------------------------ 344U_CAPI UBool U_EXPORT2 345uregex_find(URegularExpression *regexp, 346 int32_t startIndex, 347 UErrorCode *status) { 348 UBool result = FALSE; 349 if (validateRE(regexp, status) == FALSE) { 350 return result; 351 } 352 if (startIndex == -1) { 353 regexp->fMatcher->resetPreserveRegion(); 354 result = regexp->fMatcher->find(); 355 } else { 356 result = regexp->fMatcher->find(startIndex, *status); 357 } 358 return result; 359} 360 361//------------------------------------------------------------------------------ 362// 363// uregex_findNext 364// 365//------------------------------------------------------------------------------ 366U_CAPI UBool U_EXPORT2 367uregex_findNext(URegularExpression *regexp, 368 UErrorCode *status) { 369 if (validateRE(regexp, status) == FALSE) { 370 return FALSE; 371 } 372 UBool result = regexp->fMatcher->find(); 373 return result; 374} 375 376//------------------------------------------------------------------------------ 377// 378// uregex_groupCount 379// 380//------------------------------------------------------------------------------ 381U_CAPI int32_t U_EXPORT2 382uregex_groupCount(URegularExpression *regexp, 383 UErrorCode *status) { 384 if (validateRE(regexp, status, FALSE) == FALSE) { 385 return 0; 386 } 387 int32_t result = regexp->fMatcher->groupCount(); 388 return result; 389} 390 391 392//------------------------------------------------------------------------------ 393// 394// uregex_group 395// 396//------------------------------------------------------------------------------ 397U_CAPI int32_t U_EXPORT2 398uregex_group(URegularExpression *regexp, 399 int32_t groupNum, 400 UChar *dest, 401 int32_t destCapacity, 402 UErrorCode *status) { 403 if (validateRE(regexp, status) == FALSE) { 404 return 0; 405 } 406 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 407 *status = U_ILLEGAL_ARGUMENT_ERROR; 408 return 0; 409 } 410 411 // 412 // Pick up the range of characters from the matcher 413 // 414 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 415 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 416 if (U_FAILURE(*status)) { 417 return 0; 418 } 419 420 // 421 // Trim length based on buffer capacity 422 // 423 int32_t fullLength = endIx - startIx; 424 int32_t copyLength = fullLength; 425 if (copyLength < destCapacity) { 426 dest[copyLength] = 0; 427 } else if (copyLength == destCapacity) { 428 *status = U_STRING_NOT_TERMINATED_WARNING; 429 } else { 430 copyLength = destCapacity; 431 *status = U_BUFFER_OVERFLOW_ERROR; 432 } 433 434 // 435 // Copy capture group to user's buffer 436 // 437 if (copyLength > 0) { 438 u_memcpy(dest, ®exp->fText[startIx], copyLength); 439 } 440 return fullLength; 441} 442 443 444//------------------------------------------------------------------------------ 445// 446// uregex_start 447// 448//------------------------------------------------------------------------------ 449U_CAPI int32_t U_EXPORT2 450uregex_start(URegularExpression *regexp, 451 int32_t groupNum, 452 UErrorCode *status) { 453 if (validateRE(regexp, status) == FALSE) { 454 return 0; 455 } 456 int32_t result = regexp->fMatcher->start(groupNum, *status); 457 return result; 458} 459 460 461//------------------------------------------------------------------------------ 462// 463// uregex_end 464// 465//------------------------------------------------------------------------------ 466U_CAPI int32_t U_EXPORT2 467uregex_end(URegularExpression *regexp, 468 int32_t groupNum, 469 UErrorCode *status) { 470 if (validateRE(regexp, status) == FALSE) { 471 return 0; 472 } 473 int32_t result = regexp->fMatcher->end(groupNum, *status); 474 return result; 475} 476 477//------------------------------------------------------------------------------ 478// 479// uregex_reset 480// 481//------------------------------------------------------------------------------ 482U_CAPI void U_EXPORT2 483uregex_reset(URegularExpression *regexp, 484 int32_t index, 485 UErrorCode *status) { 486 if (validateRE(regexp, status) == FALSE) { 487 return; 488 } 489 regexp->fMatcher->reset(index, *status); 490} 491 492 493//------------------------------------------------------------------------------ 494// 495// uregex_setRegion 496// 497//------------------------------------------------------------------------------ 498U_CAPI void U_EXPORT2 499uregex_setRegion(URegularExpression *regexp, 500 int32_t regionStart, 501 int32_t regionLimit, 502 UErrorCode *status) { 503 if (validateRE(regexp, status) == FALSE) { 504 return; 505 } 506 regexp->fMatcher->region(regionStart, regionLimit, *status); 507} 508 509 510//------------------------------------------------------------------------------ 511// 512// uregex_regionStart 513// 514//------------------------------------------------------------------------------ 515U_CAPI int32_t U_EXPORT2 516uregex_regionStart(const URegularExpression *regexp, 517 UErrorCode *status) { 518 if (validateRE(regexp, status) == FALSE) { 519 return 0; 520 } 521 return regexp->fMatcher->regionStart(); 522} 523 524 525//------------------------------------------------------------------------------ 526// 527// uregex_regionEnd 528// 529//------------------------------------------------------------------------------ 530U_CAPI int32_t U_EXPORT2 531uregex_regionEnd(const URegularExpression *regexp, 532 UErrorCode *status) { 533 if (validateRE(regexp, status) == FALSE) { 534 return 0; 535 } 536 return regexp->fMatcher->regionEnd(); 537} 538 539 540//------------------------------------------------------------------------------ 541// 542// uregex_hasTransparentBounds 543// 544//------------------------------------------------------------------------------ 545U_CAPI UBool U_EXPORT2 546uregex_hasTransparentBounds(const URegularExpression *regexp, 547 UErrorCode *status) { 548 if (validateRE(regexp, status) == FALSE) { 549 return FALSE; 550 } 551 return regexp->fMatcher->hasTransparentBounds(); 552} 553 554 555//------------------------------------------------------------------------------ 556// 557// uregex_useTransparentBounds 558// 559//------------------------------------------------------------------------------ 560U_CAPI void U_EXPORT2 561uregex_useTransparentBounds(URegularExpression *regexp, 562 UBool b, 563 UErrorCode *status) { 564 if (validateRE(regexp, status) == FALSE) { 565 return; 566 } 567 regexp->fMatcher->useTransparentBounds(b); 568} 569 570 571//------------------------------------------------------------------------------ 572// 573// uregex_hasAnchoringBounds 574// 575//------------------------------------------------------------------------------ 576U_CAPI UBool U_EXPORT2 577uregex_hasAnchoringBounds(const URegularExpression *regexp, 578 UErrorCode *status) { 579 if (validateRE(regexp, status) == FALSE) { 580 return FALSE; 581 } 582 return regexp->fMatcher->hasAnchoringBounds(); 583} 584 585 586//------------------------------------------------------------------------------ 587// 588// uregex_useAnchoringBounds 589// 590//------------------------------------------------------------------------------ 591U_CAPI void U_EXPORT2 592uregex_useAnchoringBounds(URegularExpression *regexp, 593 UBool b, 594 UErrorCode *status) { 595 if (validateRE(regexp, status) == FALSE) { 596 return; 597 } 598 regexp->fMatcher->useAnchoringBounds(b); 599} 600 601 602//------------------------------------------------------------------------------ 603// 604// uregex_hitEnd 605// 606//------------------------------------------------------------------------------ 607U_CAPI UBool U_EXPORT2 608uregex_hitEnd(const URegularExpression *regexp, 609 UErrorCode *status) { 610 if (validateRE(regexp, status) == FALSE) { 611 return FALSE; 612 } 613 return regexp->fMatcher->hitEnd(); 614} 615 616 617//------------------------------------------------------------------------------ 618// 619// uregex_requireEnd 620// 621//------------------------------------------------------------------------------ 622U_CAPI UBool U_EXPORT2 623uregex_requireEnd(const URegularExpression *regexp, 624 UErrorCode *status) { 625 if (validateRE(regexp, status) == FALSE) { 626 return FALSE; 627 } 628 return regexp->fMatcher->requireEnd(); 629} 630 631 632//------------------------------------------------------------------------------ 633// 634// uregex_setTimeLimit 635// 636//------------------------------------------------------------------------------ 637U_CAPI void U_EXPORT2 638uregex_setTimeLimit(URegularExpression *regexp, 639 int32_t limit, 640 UErrorCode *status) { 641 if (validateRE(regexp, status)) { 642 regexp->fMatcher->setTimeLimit(limit, *status); 643 } 644} 645 646 647 648//------------------------------------------------------------------------------ 649// 650// uregex_getTimeLimit 651// 652//------------------------------------------------------------------------------ 653U_CAPI int32_t U_EXPORT2 654uregex_getTimeLimit(const URegularExpression *regexp, 655 UErrorCode *status) { 656 int32_t retVal = 0; 657 if (validateRE(regexp, status)) { 658 retVal = regexp->fMatcher->getTimeLimit(); 659 } 660 return retVal; 661} 662 663 664 665//------------------------------------------------------------------------------ 666// 667// uregex_setStackLimit 668// 669//------------------------------------------------------------------------------ 670U_CAPI void U_EXPORT2 671uregex_setStackLimit(URegularExpression *regexp, 672 int32_t limit, 673 UErrorCode *status) { 674 if (validateRE(regexp, status)) { 675 regexp->fMatcher->setStackLimit(limit, *status); 676 } 677} 678 679 680 681//------------------------------------------------------------------------------ 682// 683// uregex_getStackLimit 684// 685//------------------------------------------------------------------------------ 686U_CAPI int32_t U_EXPORT2 687uregex_getStackLimit(const URegularExpression *regexp, 688 UErrorCode *status) { 689 int32_t retVal = 0; 690 if (validateRE(regexp, status)) { 691 retVal = regexp->fMatcher->getStackLimit(); 692 } 693 return retVal; 694} 695 696 697//------------------------------------------------------------------------------ 698// 699// uregex_setMatchCallback 700// 701//------------------------------------------------------------------------------ 702U_CAPI void U_EXPORT2 703uregex_setMatchCallback(URegularExpression *regexp, 704 URegexMatchCallback *callback, 705 const void *context, 706 UErrorCode *status) { 707 if (validateRE(regexp, status)) { 708 regexp->fMatcher->setMatchCallback(callback, context, *status); 709 } 710} 711 712 713//------------------------------------------------------------------------------ 714// 715// uregex_getMatchCallback 716// 717//------------------------------------------------------------------------------ 718U_CAPI void U_EXPORT2 719uregex_getMatchCallback(const URegularExpression *regexp, 720 URegexMatchCallback **callback, 721 const void **context, 722 UErrorCode *status) { 723 if (validateRE(regexp, status)) { 724 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 725 } 726} 727 728 729//------------------------------------------------------------------------------ 730// 731// uregex_replaceAll 732// 733//------------------------------------------------------------------------------ 734U_CAPI int32_t U_EXPORT2 735uregex_replaceAll(URegularExpression *regexp, 736 const UChar *replacementText, 737 int32_t replacementLength, 738 UChar *destBuf, 739 int32_t destCapacity, 740 UErrorCode *status) { 741 if (validateRE(regexp, status) == FALSE) { 742 return 0; 743 } 744 if (replacementText == NULL || replacementLength < -1 || 745 destBuf == NULL && destCapacity > 0 || 746 destCapacity < 0) { 747 *status = U_ILLEGAL_ARGUMENT_ERROR; 748 return 0; 749 } 750 751 int32_t len = 0; 752 753 uregex_reset(regexp, 0, status); 754 755 // Note: Seperate error code variables for findNext() and appendReplacement() 756 // are used so that destination buffer overflow errors 757 // in appendReplacement won't stop findNext() from working. 758 // appendReplacement() and appendTail() special case incoming buffer 759 // overflow errors, continuing to return the correct length. 760 UErrorCode findStatus = *status; 761 while (uregex_findNext(regexp, &findStatus)) { 762 len += uregex_appendReplacement(regexp, replacementText, replacementLength, 763 &destBuf, &destCapacity, status); 764 } 765 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); 766 767 if (U_FAILURE(findStatus)) { 768 // If anything went wrong with the findNext(), make that error trump 769 // whatever may have happened with the append() operations. 770 // Errors in findNext() are not expected. 771 *status = findStatus; 772 } 773 774 return len; 775} 776 777 778//------------------------------------------------------------------------------ 779// 780// uregex_replaceFirst 781// 782//------------------------------------------------------------------------------ 783U_CAPI int32_t U_EXPORT2 784uregex_replaceFirst(URegularExpression *regexp, 785 const UChar *replacementText, 786 int32_t replacementLength, 787 UChar *destBuf, 788 int32_t destCapacity, 789 UErrorCode *status) { 790 if (validateRE(regexp, status) == FALSE) { 791 return 0; 792 } 793 if (replacementText == NULL || replacementLength < -1 || 794 destBuf == NULL && destCapacity > 0 || 795 destCapacity < 0) { 796 *status = U_ILLEGAL_ARGUMENT_ERROR; 797 return 0; 798 } 799 800 int32_t len = 0; 801 UBool findSucceeded; 802 uregex_reset(regexp, 0, status); 803 findSucceeded = uregex_find(regexp, 0, status); 804 if (findSucceeded) { 805 len = uregex_appendReplacement(regexp, replacementText, replacementLength, 806 &destBuf, &destCapacity, status); 807 } 808 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); 809 810 return len; 811} 812 813 814//------------------------------------------------------------------------------ 815// 816// uregex_appendReplacement 817// 818//------------------------------------------------------------------------------ 819 820 821// 822// Dummy class, because these functions need to be friends of class RegexMatcher, 823// and stand-alone C functions don't work as friends 824// 825U_NAMESPACE_BEGIN 826class RegexCImpl { 827 public: 828 inline static int32_t appendReplacement(URegularExpression *regexp, 829 const UChar *replacementText, 830 int32_t replacementLength, 831 UChar **destBuf, 832 int32_t *destCapacity, 833 UErrorCode *status); 834 835 inline static int32_t appendTail(URegularExpression *regexp, 836 UChar **destBuf, 837 int32_t *destCapacity, 838 UErrorCode *status); 839}; 840U_NAMESPACE_END 841 842 843// 844// Call-back function for u_unescapeAt(), used when we encounter 845// \uxxxx or \Uxxxxxxxxx escapes in the replacement text. 846// 847U_CDECL_BEGIN 848static UChar U_CALLCONV 849unescape_charAt(int32_t offset, void *context) { 850 UChar c16 = ((UChar *)context)[offset]; 851 return c16; 852} 853U_CDECL_END 854 855 856static const UChar BACKSLASH = 0x5c; 857static const UChar DOLLARSIGN = 0x24; 858 859// 860// Move a character to an output buffer, with bounds checking on the index. 861// Index advances even if capacity is exceeded, for preflight size computations. 862// This little sequence is used a LOT. 863// 864static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 865 if (*idx < bufCapacity) { 866 buf[*idx] = c; 867 } 868 (*idx)++; 869} 870 871 872// 873// appendReplacement, the actual implementation. 874// 875int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, 876 const UChar *replacementText, 877 int32_t replacementLength, 878 UChar **destBuf, 879 int32_t *destCapacity, 880 UErrorCode *status) { 881 882 // If we come in with a buffer overflow error, don't suppress the operation. 883 // A series of appendReplacements, appendTail need to correctly preflight 884 // the buffer size when an overflow happens somewhere in the middle. 885 UBool pendingBufferOverflow = FALSE; 886 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 887 pendingBufferOverflow = TRUE; 888 *status = U_ZERO_ERROR; 889 } 890 891 // 892 // Validate all paramters 893 // 894 if (validateRE(regexp, status) == FALSE) { 895 return 0; 896 } 897 if (replacementText == NULL || replacementLength < -1 || 898 destCapacity == NULL || destBuf == NULL || 899 *destBuf == NULL && *destCapacity > 0 || 900 *destCapacity < 0) { 901 *status = U_ILLEGAL_ARGUMENT_ERROR; 902 return 0; 903 } 904 905 RegexMatcher *m = regexp->fMatcher; 906 if (m->fMatch == FALSE) { 907 *status = U_REGEX_INVALID_STATE; 908 return 0; 909 } 910 911 UChar *dest = *destBuf; 912 int32_t capacity = *destCapacity; 913 int32_t destIdx = 0; 914 int32_t i; 915 916 // If it wasn't supplied by the caller, get the length of the replacement text. 917 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 918 // the fly and avoid this step. 919 if (replacementLength == -1) { 920 replacementLength = u_strlen(replacementText); 921 } 922 923 // Copy input string from the end of previous match to start of current match 924 for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) { 925 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 926 } 927 928 929 930 // scan the replacement text, looking for substitutions ($n) and \escapes. 931 int32_t replIdx = 0; 932 while (replIdx < replacementLength) { 933 UChar c = replacementText[replIdx]; 934 replIdx++; 935 if (c != DOLLARSIGN && c != BACKSLASH) { 936 // Common case, no substitution, no escaping, 937 // just copy the char to the dest buf. 938 appendToBuf(c, &destIdx, dest, capacity); 939 continue; 940 } 941 942 if (c == BACKSLASH) { 943 // Backslash Escape. Copy the following char out without further checks. 944 // Note: Surrogate pairs don't need any special handling 945 // The second half wont be a '$' or a '\', and 946 // will move to the dest normally on the next 947 // loop iteration. 948 if (replIdx >= replacementLength) { 949 break; 950 } 951 c = replacementText[replIdx]; 952 953 if (c==0x55/*U*/ || c==0x75/*u*/) { 954 // We have a \udddd or \Udddddddd escape sequence. 955 UChar32 escapedChar = 956 u_unescapeAt(unescape_charAt, 957 &replIdx, // Index is updated by unescapeAt 958 replacementLength, // Length of replacement text 959 (void *)replacementText); 960 961 if (escapedChar != (UChar32)0xFFFFFFFF) { 962 if (escapedChar <= 0xffff) { 963 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 964 } else { 965 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 966 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 967 } 968 continue; 969 } 970 // Note: if the \u escape was invalid, just fall through and 971 // treat it as a plain \<anything> escape. 972 } 973 974 // Plain backslash escape. Just put out the escaped character. 975 appendToBuf(c, &destIdx, dest, capacity); 976 977 replIdx++; 978 continue; 979 } 980 981 982 983 // We've got a $. Pick up a capture group number if one follows. 984 // Consume at most the number of digits necessary for the largest capture 985 // number that is valid for this pattern. 986 987 int32_t numDigits = 0; 988 int32_t groupNum = 0; 989 UChar32 digitC; 990 for (;;) { 991 if (replIdx >= replacementLength) { 992 break; 993 } 994 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); 995 if (u_isdigit(digitC) == FALSE) { 996 break; 997 } 998 999 U16_FWD_1(replacementText, replIdx, replacementLength); 1000 groupNum=groupNum*10 + u_charDigitValue(digitC); 1001 numDigits++; 1002 if (numDigits >= m->fPattern->fMaxCaptureDigits) { 1003 break; 1004 } 1005 } 1006 1007 1008 if (numDigits == 0) { 1009 // The $ didn't introduce a group number at all. 1010 // Treat it as just part of the substitution text. 1011 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); 1012 continue; 1013 } 1014 1015 // Finally, append the capture group data to the destination. 1016 int32_t capacityRemaining = capacity - destIdx; 1017 if (capacityRemaining < 0) { 1018 capacityRemaining = 0; 1019 } 1020 destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status); 1021 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1022 // Ignore buffer overflow when extracting the group. We need to 1023 // continue on to get full size of the untruncated result. We will 1024 // raise our own buffer overflow error at the end. 1025 *status = U_ZERO_ERROR; 1026 } 1027 1028 if (U_FAILURE(*status)) { 1029 // Can fail if group number is out of range. 1030 break; 1031 } 1032 1033 } 1034 1035 // 1036 // Nul Terminate the dest buffer if possible. 1037 // Set the appropriate buffer overflow or not terminated error, if needed. 1038 // 1039 if (destIdx < capacity) { 1040 dest[destIdx] = 0; 1041 } else if (destIdx == *destCapacity) { 1042 *status = U_STRING_NOT_TERMINATED_WARNING; 1043 } else { 1044 *status = U_BUFFER_OVERFLOW_ERROR; 1045 } 1046 1047 // 1048 // Return an updated dest buffer and capacity to the caller. 1049 // 1050 if (destIdx > 0 && *destCapacity > 0) { 1051 if (destIdx < capacity) { 1052 *destBuf += destIdx; 1053 *destCapacity -= destIdx; 1054 } else { 1055 *destBuf += capacity; 1056 *destCapacity = 0; 1057 } 1058 } 1059 1060 // If we came in with a buffer overflow, make sure we go out with one also. 1061 // (A zero length match right at the end of the previous match could 1062 // make this function succeed even though a previous call had overflowed the buf) 1063 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1064 *status = U_BUFFER_OVERFLOW_ERROR; 1065 } 1066 1067 return destIdx; 1068} 1069 1070// 1071// appendReplacement the acutal API function, 1072// 1073U_CAPI int32_t U_EXPORT2 1074uregex_appendReplacement(URegularExpression *regexp, 1075 const UChar *replacementText, 1076 int32_t replacementLength, 1077 UChar **destBuf, 1078 int32_t *destCapacity, 1079 UErrorCode *status) { 1080 return RegexCImpl::appendReplacement( 1081 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1082} 1083 1084 1085//------------------------------------------------------------------------------ 1086// 1087// uregex_appendTail 1088// 1089//------------------------------------------------------------------------------ 1090int32_t RegexCImpl::appendTail(URegularExpression *regexp, 1091 UChar **destBuf, 1092 int32_t *destCapacity, 1093 UErrorCode *status) 1094{ 1095 1096 // If we come in with a buffer overflow error, don't suppress the operation. 1097 // A series of appendReplacements, appendTail need to correctly preflight 1098 // the buffer size when an overflow happens somewhere in the middle. 1099 UBool pendingBufferOverflow = FALSE; 1100 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1101 pendingBufferOverflow = TRUE; 1102 *status = U_ZERO_ERROR; 1103 } 1104 1105 if (validateRE(regexp, status) == FALSE) { 1106 return 0; 1107 } 1108 1109 if (destCapacity == NULL || destBuf == NULL || 1110 *destBuf == NULL && *destCapacity > 0 || 1111 *destCapacity < 0) 1112 { 1113 *status = U_ILLEGAL_ARGUMENT_ERROR; 1114 return 0; 1115 } 1116 1117 RegexMatcher *m = regexp->fMatcher; 1118 1119 int32_t srcIdx; 1120 if (m->fMatch) { 1121 // The most recent call to find() succeeded. 1122 srcIdx = m->fMatchEnd; 1123 } else { 1124 // The last call to find() on this matcher failed(). 1125 // Look back to the end of the last find() that succeeded for src index. 1126 srcIdx = m->fLastMatchEnd; 1127 if (srcIdx == -1) { 1128 // There has been no successful match with this matcher. 1129 // We want to copy the whole string. 1130 srcIdx = 0; 1131 } 1132 } 1133 1134 int32_t destIdx = 0; 1135 int32_t destCap = *destCapacity; 1136 UChar *dest = *destBuf; 1137 1138 for (;;) { 1139 if (srcIdx == regexp->fTextLength) { 1140 break; 1141 } 1142 UChar c = regexp->fText[srcIdx]; 1143 if (c == 0 && regexp->fTextLength == -1) { 1144 break; 1145 } 1146 if (destIdx < destCap) { 1147 dest[destIdx] = c; 1148 } else { 1149 // We've overflowed the dest buffer. 1150 // If the total input string length is known, we can 1151 // compute the total buffer size needed without scanning through the string. 1152 if (regexp->fTextLength > 0) { 1153 destIdx += (regexp->fTextLength - srcIdx); 1154 break; 1155 } 1156 } 1157 srcIdx++; 1158 destIdx++; 1159 } 1160 1161 // 1162 // NUL terminate the output string, if possible, otherwise issue the 1163 // appropriate error or warning. 1164 // 1165 if (destIdx < destCap) { 1166 dest[destIdx] = 0; 1167 } else if (destIdx == destCap) { 1168 *status = U_STRING_NOT_TERMINATED_WARNING; 1169 } else { 1170 *status = U_BUFFER_OVERFLOW_ERROR; 1171 } 1172 1173 // 1174 // Update the user's buffer ptr and capacity vars to reflect the 1175 // amount used. 1176 // 1177 if (destIdx < destCap) { 1178 *destBuf += destIdx; 1179 *destCapacity -= destIdx; 1180 } else { 1181 *destBuf += destCap; 1182 *destCapacity = 0; 1183 } 1184 1185 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1186 *status = U_BUFFER_OVERFLOW_ERROR; 1187 } 1188 1189 return destIdx; 1190} 1191 1192 1193U_CAPI int32_t U_EXPORT2 1194uregex_appendTail(URegularExpression *regexp, 1195 UChar **destBuf, 1196 int32_t *destCapacity, 1197 UErrorCode *status) { 1198 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1199} 1200 1201 1202//------------------------------------------------------------------------------ 1203// 1204// copyString Internal utility to copy a string to an output buffer, 1205// while managing buffer overflow and preflight size 1206// computation. NUL termination is added to destination, 1207// and the NUL is counted in the output size. 1208// 1209//------------------------------------------------------------------------------ 1210static void copyString(UChar *destBuffer, // Destination buffer. 1211 int32_t destCapacity, // Total capacity of dest buffer 1212 int32_t *destIndex, // Index into dest buffer. Updated on return. 1213 // Update not clipped to destCapacity. 1214 const UChar *srcPtr, // Pointer to source string 1215 int32_t srcLen) // Source string len. 1216{ 1217 int32_t si; 1218 int32_t di = *destIndex; 1219 UChar c; 1220 1221 for (si=0; si<srcLen; si++) { 1222 c = srcPtr[si]; 1223 if (di < destCapacity) { 1224 destBuffer[di] = c; 1225 di++; 1226 } else { 1227 di += srcLen - si; 1228 break; 1229 } 1230 } 1231 if (di<destCapacity) { 1232 destBuffer[di] = 0; 1233 } 1234 di++; 1235 *destIndex = di; 1236} 1237 1238 1239//------------------------------------------------------------------------------ 1240// 1241// uregex_split 1242// 1243//------------------------------------------------------------------------------ 1244U_CAPI int32_t U_EXPORT2 1245uregex_split( URegularExpression *regexp, 1246 UChar *destBuf, 1247 int32_t destCapacity, 1248 int32_t *requiredCapacity, 1249 UChar *destFields[], 1250 int32_t destFieldsCapacity, 1251 UErrorCode *status) { 1252 if (validateRE(regexp, status) == FALSE) { 1253 return 0; 1254 } 1255 if (destBuf == NULL && destCapacity > 0 || 1256 destCapacity < 0 || 1257 destFields == NULL || 1258 destFieldsCapacity < 1 ) { 1259 *status = U_ILLEGAL_ARGUMENT_ERROR; 1260 return 0; 1261 } 1262 1263 // 1264 // Reset for the input text 1265 // 1266 regexp->fMatcher->reset(); 1267 int32_t inputLen = regexp->fTextString.length(); 1268 int32_t nextOutputStringStart = 0; 1269 if (inputLen == 0) { 1270 return 0; 1271 } 1272 1273 1274 // 1275 // Loop through the input text, searching for the delimiter pattern 1276 // 1277 int32_t i; // Index of the field being processed. 1278 int32_t destIdx = 0; // Next available position in destBuf; 1279 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1280 for (i=0; ; i++) { 1281 if (i>=destFieldsCapacity-1) { 1282 // There are one or zero output string left. 1283 // Fill the last output string with whatever is left from the input, then exit the loop. 1284 // ( i will be == destFieldsCapacity if we filled the output array while processing 1285 // capture groups of the delimiter expression, in which case we will discard the 1286 // last capture group saved in favor of the unprocessed remainder of the 1287 // input string.) 1288 int32_t remainingLength = inputLen-nextOutputStringStart; 1289 if (remainingLength > 0) { 1290 } 1291 if (i >= destFieldsCapacity) { 1292 // No fields are left. Recycle the last one for holding the trailing part of 1293 // the input string. 1294 i = destFieldsCapacity-1; 1295 destIdx = (int32_t)(destFields[i] - destFields[0]); 1296 } 1297 1298 destFields[i] = &destBuf[destIdx]; 1299 copyString(destBuf, destCapacity, &destIdx, 1300 ®exp->fText[nextOutputStringStart], remainingLength); 1301 break; 1302 } 1303 1304 if (regexp->fMatcher->find()) { 1305 // We found another delimiter. Move everything from where we started looking 1306 // up until the start of the delimiter into the next output string. 1307 int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart; 1308 destFields[i] = &destBuf[destIdx]; 1309 copyString(destBuf, destCapacity, &destIdx, 1310 ®exp->fText[nextOutputStringStart], fieldLen); 1311 nextOutputStringStart = regexp->fMatcher->end(*status); 1312 1313 // If the delimiter pattern has capturing parentheses, the captured 1314 // text goes out into the next n destination strings. 1315 int32_t groupNum; 1316 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1317 // If we've run out of output string slots, bail out. 1318 if (i==destFieldsCapacity-1) { 1319 break; 1320 } 1321 i++; 1322 1323 // Set up to extract the capture group contents into the dest buffer. 1324 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow 1325 // error while extracting this group. 1326 int32_t remainingCapacity = destCapacity - destIdx; 1327 if (remainingCapacity < 0) { 1328 remainingCapacity = 0; 1329 } 1330 destFields[i] = &destBuf[destIdx]; 1331 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus); 1332 destIdx += t + 1; // Record the space used in the output string buffer. 1333 // +1 for the NUL that terminates the string. 1334 } 1335 1336 if (nextOutputStringStart == inputLen) { 1337 // The delimiter was at the end of the string. We're done. 1338 break; 1339 } 1340 1341 } 1342 else 1343 { 1344 // We ran off the end of the input while looking for the next delimiter. 1345 // All the remaining text goes into the current output string. 1346 destFields[i] = &destBuf[destIdx]; 1347 copyString(destBuf, destCapacity, &destIdx, 1348 ®exp->fText[nextOutputStringStart], inputLen-nextOutputStringStart); 1349 break; 1350 } 1351 } 1352 1353 // Zero out any unused portion of the destFields array 1354 int j; 1355 for (j=i+1; j<destFieldsCapacity; j++) { 1356 destFields[j] = NULL; 1357 } 1358 1359 if (requiredCapacity != NULL) { 1360 *requiredCapacity = destIdx; 1361 } 1362 if (destIdx > destCapacity) { 1363 *status = U_BUFFER_OVERFLOW_ERROR; 1364 } 1365 return i+1; 1366} 1367 1368 1369#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1370 1371