1/* 2****************************************************************************** 3* Copyright (C) 1999-2015, International Business Machines Corporation and 4* others. All Rights Reserved. 5****************************************************************************** 6* 7* File unistr.cpp 8* 9* Modification History: 10* 11* Date Name Description 12* 09/25/98 stephen Creation. 13* 04/20/99 stephen Overhauled per 4/16 code review. 14* 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 15* 11/18/99 aliu Added handleReplaceBetween() to make inherit from 16* Replaceable. 17* 06/25/01 grhoten Removed the dependency on iostream 18****************************************************************************** 19*/ 20 21#include "unicode/utypes.h" 22#include "unicode/appendable.h" 23#include "unicode/putil.h" 24#include "cstring.h" 25#include "cmemory.h" 26#include "unicode/ustring.h" 27#include "unicode/unistr.h" 28#include "unicode/utf.h" 29#include "unicode/utf16.h" 30#include "uelement.h" 31#include "ustr_imp.h" 32#include "umutex.h" 33#include "uassert.h" 34 35#if 0 36 37#include <iostream> 38using namespace std; 39 40//DEBUGGING 41void 42print(const UnicodeString& s, 43 const char *name) 44{ 45 UChar c; 46 cout << name << ":|"; 47 for(int i = 0; i < s.length(); ++i) { 48 c = s[i]; 49 if(c>= 0x007E || c < 0x0020) 50 cout << "[0x" << hex << s[i] << "]"; 51 else 52 cout << (char) s[i]; 53 } 54 cout << '|' << endl; 55} 56 57void 58print(const UChar *s, 59 int32_t len, 60 const char *name) 61{ 62 UChar c; 63 cout << name << ":|"; 64 for(int i = 0; i < len; ++i) { 65 c = s[i]; 66 if(c>= 0x007E || c < 0x0020) 67 cout << "[0x" << hex << s[i] << "]"; 68 else 69 cout << (char) s[i]; 70 } 71 cout << '|' << endl; 72} 73// END DEBUGGING 74#endif 75 76// Local function definitions for now 77 78// need to copy areas that may overlap 79static 80inline void 81us_arrayCopy(const UChar *src, int32_t srcStart, 82 UChar *dst, int32_t dstStart, int32_t count) 83{ 84 if(count>0) { 85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); 86 } 87} 88 89// u_unescapeAt() callback to get a UChar from a UnicodeString 90U_CDECL_BEGIN 91static UChar U_CALLCONV 92UnicodeString_charAt(int32_t offset, void *context) { 93 return ((icu::UnicodeString*) context)->charAt(offset); 94} 95U_CDECL_END 96 97U_NAMESPACE_BEGIN 98 99/* The Replaceable virtual destructor can't be defined in the header 100 due to how AIX works with multiple definitions of virtual functions. 101*/ 102Replaceable::~Replaceable() {} 103 104UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 105 106UnicodeString U_EXPORT2 107operator+ (const UnicodeString &s1, const UnicodeString &s2) { 108 return 109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). 110 append(s1). 111 append(s2); 112} 113 114//======================================== 115// Reference Counting functions, put at top of file so that optimizing compilers 116// have a chance to automatically inline. 117//======================================== 118 119void 120UnicodeString::addRef() { 121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 122} 123 124int32_t 125UnicodeString::removeRef() { 126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 127} 128 129int32_t 130UnicodeString::refCount() const { 131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1)); 132} 133 134void 135UnicodeString::releaseArray() { 136 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) { 137 uprv_free((int32_t *)fUnion.fFields.fArray - 1); 138 } 139} 140 141 142 143//======================================== 144// Constructors 145//======================================== 146 147// The default constructor is inline in unistr.h. 148 149UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) { 150 fUnion.fFields.fLengthAndFlags = 0; 151 if(count <= 0 || (uint32_t)c > 0x10ffff) { 152 // just allocate and do not do anything else 153 allocate(capacity); 154 } else { 155 // count > 0, allocate and fill the new string with count c's 156 int32_t unitCount = U16_LENGTH(c), length = count * unitCount; 157 if(capacity < length) { 158 capacity = length; 159 } 160 if(allocate(capacity)) { 161 UChar *array = getArrayStart(); 162 int32_t i = 0; 163 164 // fill the new string with c 165 if(unitCount == 1) { 166 // fill with length UChars 167 while(i < length) { 168 array[i++] = (UChar)c; 169 } 170 } else { 171 // get the code units for c 172 UChar units[U16_MAX_LENGTH]; 173 U16_APPEND_UNSAFE(units, i, c); 174 175 // now it must be i==unitCount 176 i = 0; 177 178 // for Unicode, unitCount can only be 1, 2, 3, or 4 179 // 1 is handled above 180 while(i < length) { 181 int32_t unitIdx = 0; 182 while(unitIdx < unitCount) { 183 array[i++]=units[unitIdx++]; 184 } 185 } 186 } 187 } 188 setLength(length); 189 } 190} 191 192UnicodeString::UnicodeString(UChar ch) { 193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString; 194 fUnion.fStackFields.fBuffer[0] = ch; 195} 196 197UnicodeString::UnicodeString(UChar32 ch) { 198 fUnion.fFields.fLengthAndFlags = kShortString; 199 int32_t i = 0; 200 UBool isError = FALSE; 201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError); 202 // We test isError so that the compiler does not complain that we don't. 203 // If isError then i==0 which is what we want anyway. 204 if(!isError) { 205 setShortLength(i); 206 } 207} 208 209UnicodeString::UnicodeString(const UChar *text) { 210 fUnion.fFields.fLengthAndFlags = kShortString; 211 doAppend(text, 0, -1); 212} 213 214UnicodeString::UnicodeString(const UChar *text, 215 int32_t textLength) { 216 fUnion.fFields.fLengthAndFlags = kShortString; 217 doAppend(text, 0, textLength); 218} 219 220UnicodeString::UnicodeString(UBool isTerminated, 221 const UChar *text, 222 int32_t textLength) { 223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 224 if(text == NULL) { 225 // treat as an empty string, do not alias 226 setToEmpty(); 227 } else if(textLength < -1 || 228 (textLength == -1 && !isTerminated) || 229 (textLength >= 0 && isTerminated && text[textLength] != 0) 230 ) { 231 setToBogus(); 232 } else { 233 if(textLength == -1) { 234 // text is terminated, or else it would have failed the above test 235 textLength = u_strlen(text); 236 } 237 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 238 } 239} 240 241UnicodeString::UnicodeString(UChar *buff, 242 int32_t buffLength, 243 int32_t buffCapacity) { 244 fUnion.fFields.fLengthAndFlags = kWritableAlias; 245 if(buff == NULL) { 246 // treat as an empty string, do not alias 247 setToEmpty(); 248 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 249 setToBogus(); 250 } else { 251 if(buffLength == -1) { 252 // fLength = u_strlen(buff); but do not look beyond buffCapacity 253 const UChar *p = buff, *limit = buff + buffCapacity; 254 while(p != limit && *p != 0) { 255 ++p; 256 } 257 buffLength = (int32_t)(p - buff); 258 } 259 setArray(buff, buffLength, buffCapacity); 260 } 261} 262 263UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) { 264 fUnion.fFields.fLengthAndFlags = kShortString; 265 if(src==NULL) { 266 // treat as an empty string 267 } else { 268 if(length<0) { 269 length=(int32_t)uprv_strlen(src); 270 } 271 if(cloneArrayIfNeeded(length, length, FALSE)) { 272 u_charsToUChars(src, getArrayStart(), length); 273 setLength(length); 274 } else { 275 setToBogus(); 276 } 277 } 278} 279 280#if U_CHARSET_IS_UTF8 281 282UnicodeString::UnicodeString(const char *codepageData) { 283 fUnion.fFields.fLengthAndFlags = kShortString; 284 if(codepageData != 0) { 285 setToUTF8(codepageData); 286 } 287} 288 289UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) { 290 fUnion.fFields.fLengthAndFlags = kShortString; 291 // if there's nothing to convert, do nothing 292 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 293 return; 294 } 295 if(dataLength == -1) { 296 dataLength = (int32_t)uprv_strlen(codepageData); 297 } 298 setToUTF8(StringPiece(codepageData, dataLength)); 299} 300 301// else see unistr_cnv.cpp 302#endif 303 304UnicodeString::UnicodeString(const UnicodeString& that) { 305 fUnion.fFields.fLengthAndFlags = kShortString; 306 copyFrom(that); 307} 308 309#if U_HAVE_RVALUE_REFERENCES 310UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT { 311 fUnion.fFields.fLengthAndFlags = kShortString; 312 moveFrom(src); 313} 314#endif 315 316UnicodeString::UnicodeString(const UnicodeString& that, 317 int32_t srcStart) { 318 fUnion.fFields.fLengthAndFlags = kShortString; 319 setTo(that, srcStart); 320} 321 322UnicodeString::UnicodeString(const UnicodeString& that, 323 int32_t srcStart, 324 int32_t srcLength) { 325 fUnion.fFields.fLengthAndFlags = kShortString; 326 setTo(that, srcStart, srcLength); 327} 328 329// Replaceable base class clone() default implementation, does not clone 330Replaceable * 331Replaceable::clone() const { 332 return NULL; 333} 334 335// UnicodeString overrides clone() with a real implementation 336Replaceable * 337UnicodeString::clone() const { 338 return new UnicodeString(*this); 339} 340 341//======================================== 342// array allocation 343//======================================== 344 345UBool 346UnicodeString::allocate(int32_t capacity) { 347 if(capacity <= US_STACKBUF_SIZE) { 348 fUnion.fFields.fLengthAndFlags = kShortString; 349 } else { 350 // count bytes for the refCounter and the string capacity, and 351 // round up to a multiple of 16; then divide by 4 and allocate int32_t's 352 // to be safely aligned for the refCount 353 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() 354 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); 355 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); 356 if(array != 0) { 357 // set initial refCount and point behind the refCount 358 *array++ = 1; 359 360 // have fArray point to the first UChar 361 fUnion.fFields.fArray = (UChar *)array; 362 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); 363 fUnion.fFields.fLengthAndFlags = kLongString; 364 } else { 365 fUnion.fFields.fLengthAndFlags = kIsBogus; 366 fUnion.fFields.fArray = 0; 367 fUnion.fFields.fCapacity = 0; 368 return FALSE; 369 } 370 } 371 return TRUE; 372} 373 374//======================================== 375// Destructor 376//======================================== 377 378#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 379static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1 380static u_atomic_int32_t beyondCount(0); 381 382U_CAPI void unistr_printLengths() { 383 int32_t i; 384 for(i = 0; i <= 59; ++i) { 385 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]); 386 } 387 int32_t beyond = beyondCount; 388 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) { 389 beyond += finalLengthCounts[i]; 390 } 391 printf(">59, %9d\n", beyond); 392} 393#endif 394 395UnicodeString::~UnicodeString() 396{ 397#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 398 // Count lengths of strings at the end of their lifetime. 399 // Useful for discussion of a desirable stack buffer size. 400 // Count the contents length, not the optional NUL terminator nor further capacity. 401 // Ignore open-buffer strings and strings which alias external storage. 402 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) { 403 if(hasShortLength()) { 404 umtx_atomic_inc(finalLengthCounts + getShortLength()); 405 } else { 406 umtx_atomic_inc(&beyondCount); 407 } 408 } 409#endif 410 411 releaseArray(); 412} 413 414//======================================== 415// Factory methods 416//======================================== 417 418UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { 419 UnicodeString result; 420 result.setToUTF8(utf8); 421 return result; 422} 423 424UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 425 UnicodeString result; 426 int32_t capacity; 427 // Most UTF-32 strings will be BMP-only and result in a same-length 428 // UTF-16 string. We overestimate the capacity just slightly, 429 // just in case there are a few supplementary characters. 430 if(length <= US_STACKBUF_SIZE) { 431 capacity = US_STACKBUF_SIZE; 432 } else { 433 capacity = length + (length >> 4) + 4; 434 } 435 do { 436 UChar *utf16 = result.getBuffer(capacity); 437 int32_t length16; 438 UErrorCode errorCode = U_ZERO_ERROR; 439 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 440 utf32, length, 441 0xfffd, // Substitution character. 442 NULL, // Don't care about number of substitutions. 443 &errorCode); 444 result.releaseBuffer(length16); 445 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 446 capacity = length16 + 1; // +1 for the terminating NUL. 447 continue; 448 } else if(U_FAILURE(errorCode)) { 449 result.setToBogus(); 450 } 451 break; 452 } while(TRUE); 453 return result; 454} 455 456//======================================== 457// Assignment 458//======================================== 459 460UnicodeString & 461UnicodeString::operator=(const UnicodeString &src) { 462 return copyFrom(src); 463} 464 465UnicodeString & 466UnicodeString::fastCopyFrom(const UnicodeString &src) { 467 return copyFrom(src, TRUE); 468} 469 470UnicodeString & 471UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 472 // if assigning to ourselves, do nothing 473 if(this == &src) { 474 return *this; 475 } 476 477 // is the right side bogus? 478 if(src.isBogus()) { 479 setToBogus(); 480 return *this; 481 } 482 483 // delete the current contents 484 releaseArray(); 485 486 if(src.isEmpty()) { 487 // empty string - use the stack buffer 488 setToEmpty(); 489 return *this; 490 } 491 492 // fLength>0 and not an "open" src.getBuffer(minCapacity) 493 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 494 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) { 495 case kShortString: 496 // short string using the stack buffer, do the same 497 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 498 getShortLength() * U_SIZEOF_UCHAR); 499 break; 500 case kLongString: 501 // src uses a refCounted string buffer, use that buffer with refCount 502 // src is const, use a cast - we don't actually change it 503 ((UnicodeString &)src).addRef(); 504 // copy all fields, share the reference-counted buffer 505 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 506 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 507 if(!hasShortLength()) { 508 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 509 } 510 break; 511 case kReadonlyAlias: 512 if(fastCopy) { 513 // src is a readonly alias, do the same 514 // -> maintain the readonly alias as such 515 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 516 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 517 if(!hasShortLength()) { 518 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 519 } 520 break; 521 } 522 // else if(!fastCopy) fall through to case kWritableAlias 523 // -> allocate a new buffer and copy the contents 524 case kWritableAlias: { 525 // src is a writable alias; we make a copy of that instead 526 int32_t srcLength = src.length(); 527 if(allocate(srcLength)) { 528 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); 529 setLength(srcLength); 530 break; 531 } 532 // if there is not enough memory, then fall through to setting to bogus 533 } 534 default: 535 // if src is bogus, set ourselves to bogus 536 // do not call setToBogus() here because fArray and flags are not consistent here 537 fUnion.fFields.fLengthAndFlags = kIsBogus; 538 fUnion.fFields.fArray = 0; 539 fUnion.fFields.fCapacity = 0; 540 break; 541 } 542 543 return *this; 544} 545 546UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT { 547 // No explicit check for self move assignment, consistent with standard library. 548 // Self move assignment causes no crash nor leak but might make the object bogus. 549 releaseArray(); 550 copyFieldsFrom(src, TRUE); 551 return *this; 552} 553 554// Same as moveFrom() except without memory management. 555void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT { 556 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 557 if(lengthAndFlags & kUsingStackBuffer) { 558 // Short string using the stack buffer, copy the contents. 559 // Check for self assignment to prevent "overlap in memcpy" warnings, 560 // although it should be harmless to copy a buffer to itself exactly. 561 if(this != &src) { 562 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 563 getShortLength() * U_SIZEOF_UCHAR); 564 } 565 } else { 566 // In all other cases, copy all fields. 567 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 568 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 569 if(!hasShortLength()) { 570 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 571 } 572 if(setSrcToBogus) { 573 // Set src to bogus without releasing any memory. 574 src.fUnion.fFields.fLengthAndFlags = kIsBogus; 575 src.fUnion.fFields.fArray = NULL; 576 src.fUnion.fFields.fCapacity = 0; 577 } 578 } 579} 580 581void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT { 582 UnicodeString temp; // Empty short string: Known not to need releaseArray(). 583 // Copy fields without resetting source values in between. 584 temp.copyFieldsFrom(*this, FALSE); 585 this->copyFieldsFrom(other, FALSE); 586 other.copyFieldsFrom(temp, FALSE); 587 // Set temp to an empty string so that other's memory is not released twice. 588 temp.fUnion.fFields.fLengthAndFlags = kShortString; 589} 590 591//======================================== 592// Miscellaneous operations 593//======================================== 594 595UnicodeString UnicodeString::unescape() const { 596 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity 597 if (result.isBogus()) { 598 return result; 599 } 600 const UChar *array = getBuffer(); 601 int32_t len = length(); 602 int32_t prev = 0; 603 for (int32_t i=0;;) { 604 if (i == len) { 605 result.append(array, prev, len - prev); 606 break; 607 } 608 if (array[i++] == 0x5C /*'\\'*/) { 609 result.append(array, prev, (i - 1) - prev); 610 UChar32 c = unescapeAt(i); // advances i 611 if (c < 0) { 612 result.remove(); // return empty string 613 break; // invalid escape sequence 614 } 615 result.append(c); 616 prev = i; 617 } 618 } 619 return result; 620} 621 622UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 623 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 624} 625 626//======================================== 627// Read-only implementation 628//======================================== 629UBool 630UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { 631 // Requires: this & text not bogus and have same lengths. 632 // Byte-wise comparison works for equality regardless of endianness. 633 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; 634} 635 636int8_t 637UnicodeString::doCompare( int32_t start, 638 int32_t length, 639 const UChar *srcChars, 640 int32_t srcStart, 641 int32_t srcLength) const 642{ 643 // compare illegal string values 644 if(isBogus()) { 645 return -1; 646 } 647 648 // pin indices to legal values 649 pinIndices(start, length); 650 651 if(srcChars == NULL) { 652 // treat const UChar *srcChars==NULL as an empty string 653 return length == 0 ? 0 : 1; 654 } 655 656 // get the correct pointer 657 const UChar *chars = getArrayStart(); 658 659 chars += start; 660 srcChars += srcStart; 661 662 int32_t minLength; 663 int8_t lengthResult; 664 665 // get the srcLength if necessary 666 if(srcLength < 0) { 667 srcLength = u_strlen(srcChars + srcStart); 668 } 669 670 // are we comparing different lengths? 671 if(length != srcLength) { 672 if(length < srcLength) { 673 minLength = length; 674 lengthResult = -1; 675 } else { 676 minLength = srcLength; 677 lengthResult = 1; 678 } 679 } else { 680 minLength = length; 681 lengthResult = 0; 682 } 683 684 /* 685 * note that uprv_memcmp() returns an int but we return an int8_t; 686 * we need to take care not to truncate the result - 687 * one way to do this is to right-shift the value to 688 * move the sign bit into the lower 8 bits and making sure that this 689 * does not become 0 itself 690 */ 691 692 if(minLength > 0 && chars != srcChars) { 693 int32_t result; 694 695# if U_IS_BIG_ENDIAN 696 // big-endian: byte comparison works 697 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); 698 if(result != 0) { 699 return (int8_t)(result >> 15 | 1); 700 } 701# else 702 // little-endian: compare UChar units 703 do { 704 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); 705 if(result != 0) { 706 return (int8_t)(result >> 15 | 1); 707 } 708 } while(--minLength > 0); 709# endif 710 } 711 return lengthResult; 712} 713 714/* String compare in code point order - doCompare() compares in code unit order. */ 715int8_t 716UnicodeString::doCompareCodePointOrder(int32_t start, 717 int32_t length, 718 const UChar *srcChars, 719 int32_t srcStart, 720 int32_t srcLength) const 721{ 722 // compare illegal string values 723 // treat const UChar *srcChars==NULL as an empty string 724 if(isBogus()) { 725 return -1; 726 } 727 728 // pin indices to legal values 729 pinIndices(start, length); 730 731 if(srcChars == NULL) { 732 srcStart = srcLength = 0; 733 } 734 735 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); 736 /* translate the 32-bit result into an 8-bit one */ 737 if(diff!=0) { 738 return (int8_t)(diff >> 15 | 1); 739 } else { 740 return 0; 741 } 742} 743 744int32_t 745UnicodeString::getLength() const { 746 return length(); 747} 748 749UChar 750UnicodeString::getCharAt(int32_t offset) const { 751 return charAt(offset); 752} 753 754UChar32 755UnicodeString::getChar32At(int32_t offset) const { 756 return char32At(offset); 757} 758 759UChar32 760UnicodeString::char32At(int32_t offset) const 761{ 762 int32_t len = length(); 763 if((uint32_t)offset < (uint32_t)len) { 764 const UChar *array = getArrayStart(); 765 UChar32 c; 766 U16_GET(array, 0, offset, len, c); 767 return c; 768 } else { 769 return kInvalidUChar; 770 } 771} 772 773int32_t 774UnicodeString::getChar32Start(int32_t offset) const { 775 if((uint32_t)offset < (uint32_t)length()) { 776 const UChar *array = getArrayStart(); 777 U16_SET_CP_START(array, 0, offset); 778 return offset; 779 } else { 780 return 0; 781 } 782} 783 784int32_t 785UnicodeString::getChar32Limit(int32_t offset) const { 786 int32_t len = length(); 787 if((uint32_t)offset < (uint32_t)len) { 788 const UChar *array = getArrayStart(); 789 U16_SET_CP_LIMIT(array, 0, offset, len); 790 return offset; 791 } else { 792 return len; 793 } 794} 795 796int32_t 797UnicodeString::countChar32(int32_t start, int32_t length) const { 798 pinIndices(start, length); 799 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL 800 return u_countChar32(getArrayStart()+start, length); 801} 802 803UBool 804UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 805 pinIndices(start, length); 806 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL 807 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 808} 809 810int32_t 811UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 812 // pin index 813 int32_t len = length(); 814 if(index<0) { 815 index=0; 816 } else if(index>len) { 817 index=len; 818 } 819 820 const UChar *array = getArrayStart(); 821 if(delta>0) { 822 U16_FWD_N(array, index, len, delta); 823 } else { 824 U16_BACK_N(array, 0, index, -delta); 825 } 826 827 return index; 828} 829 830void 831UnicodeString::doExtract(int32_t start, 832 int32_t length, 833 UChar *dst, 834 int32_t dstStart) const 835{ 836 // pin indices to legal values 837 pinIndices(start, length); 838 839 // do not copy anything if we alias dst itself 840 const UChar *array = getArrayStart(); 841 if(array + start != dst + dstStart) { 842 us_arrayCopy(array, start, dst, dstStart, length); 843 } 844} 845 846int32_t 847UnicodeString::extract(UChar *dest, int32_t destCapacity, 848 UErrorCode &errorCode) const { 849 int32_t len = length(); 850 if(U_SUCCESS(errorCode)) { 851 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 852 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 853 } else { 854 const UChar *array = getArrayStart(); 855 if(len>0 && len<=destCapacity && array!=dest) { 856 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); 857 } 858 return u_terminateUChars(dest, destCapacity, len, &errorCode); 859 } 860 } 861 862 return len; 863} 864 865int32_t 866UnicodeString::extract(int32_t start, 867 int32_t length, 868 char *target, 869 int32_t targetCapacity, 870 enum EInvariant) const 871{ 872 // if the arguments are illegal, then do nothing 873 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { 874 return 0; 875 } 876 877 // pin the indices to legal values 878 pinIndices(start, length); 879 880 if(length <= targetCapacity) { 881 u_UCharsToChars(getArrayStart() + start, target, length); 882 } 883 UErrorCode status = U_ZERO_ERROR; 884 return u_terminateChars(target, targetCapacity, length, &status); 885} 886 887UnicodeString 888UnicodeString::tempSubString(int32_t start, int32_t len) const { 889 pinIndices(start, len); 890 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 891 if(array==NULL) { 892 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string 893 len=-2; // bogus result string 894 } 895 return UnicodeString(FALSE, array + start, len); 896} 897 898int32_t 899UnicodeString::toUTF8(int32_t start, int32_t len, 900 char *target, int32_t capacity) const { 901 pinIndices(start, len); 902 int32_t length8; 903 UErrorCode errorCode = U_ZERO_ERROR; 904 u_strToUTF8WithSub(target, capacity, &length8, 905 getBuffer() + start, len, 906 0xFFFD, // Standard substitution character. 907 NULL, // Don't care about number of substitutions. 908 &errorCode); 909 return length8; 910} 911 912#if U_CHARSET_IS_UTF8 913 914int32_t 915UnicodeString::extract(int32_t start, int32_t len, 916 char *target, uint32_t dstSize) const { 917 // if the arguments are illegal, then do nothing 918 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 919 return 0; 920 } 921 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); 922} 923 924// else see unistr_cnv.cpp 925#endif 926 927void 928UnicodeString::extractBetween(int32_t start, 929 int32_t limit, 930 UnicodeString& target) const { 931 pinIndex(start); 932 pinIndex(limit); 933 doExtract(start, limit - start, target); 934} 935 936// When converting from UTF-16 to UTF-8, the result will have at most 3 times 937// as many bytes as the source has UChars. 938// The "worst cases" are writing systems like Indic, Thai and CJK with 939// 3:1 bytes:UChars. 940void 941UnicodeString::toUTF8(ByteSink &sink) const { 942 int32_t length16 = length(); 943 if(length16 != 0) { 944 char stackBuffer[1024]; 945 int32_t capacity = (int32_t)sizeof(stackBuffer); 946 UBool utf8IsOwned = FALSE; 947 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 948 3*length16, 949 stackBuffer, capacity, 950 &capacity); 951 int32_t length8 = 0; 952 UErrorCode errorCode = U_ZERO_ERROR; 953 u_strToUTF8WithSub(utf8, capacity, &length8, 954 getBuffer(), length16, 955 0xFFFD, // Standard substitution character. 956 NULL, // Don't care about number of substitutions. 957 &errorCode); 958 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 959 utf8 = (char *)uprv_malloc(length8); 960 if(utf8 != NULL) { 961 utf8IsOwned = TRUE; 962 errorCode = U_ZERO_ERROR; 963 u_strToUTF8WithSub(utf8, length8, &length8, 964 getBuffer(), length16, 965 0xFFFD, // Standard substitution character. 966 NULL, // Don't care about number of substitutions. 967 &errorCode); 968 } else { 969 errorCode = U_MEMORY_ALLOCATION_ERROR; 970 } 971 } 972 if(U_SUCCESS(errorCode)) { 973 sink.Append(utf8, length8); 974 sink.Flush(); 975 } 976 if(utf8IsOwned) { 977 uprv_free(utf8); 978 } 979 } 980} 981 982int32_t 983UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 984 int32_t length32=0; 985 if(U_SUCCESS(errorCode)) { 986 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 987 u_strToUTF32WithSub(utf32, capacity, &length32, 988 getBuffer(), length(), 989 0xfffd, // Substitution character. 990 NULL, // Don't care about number of substitutions. 991 &errorCode); 992 } 993 return length32; 994} 995 996int32_t 997UnicodeString::indexOf(const UChar *srcChars, 998 int32_t srcStart, 999 int32_t srcLength, 1000 int32_t start, 1001 int32_t length) const 1002{ 1003 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1004 return -1; 1005 } 1006 1007 // UnicodeString does not find empty substrings 1008 if(srcLength < 0 && srcChars[srcStart] == 0) { 1009 return -1; 1010 } 1011 1012 // get the indices within bounds 1013 pinIndices(start, length); 1014 1015 // find the first occurrence of the substring 1016 const UChar *array = getArrayStart(); 1017 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 1018 if(match == NULL) { 1019 return -1; 1020 } else { 1021 return (int32_t)(match - array); 1022 } 1023} 1024 1025int32_t 1026UnicodeString::doIndexOf(UChar c, 1027 int32_t start, 1028 int32_t length) const 1029{ 1030 // pin indices 1031 pinIndices(start, length); 1032 1033 // find the first occurrence of c 1034 const UChar *array = getArrayStart(); 1035 const UChar *match = u_memchr(array + start, c, length); 1036 if(match == NULL) { 1037 return -1; 1038 } else { 1039 return (int32_t)(match - array); 1040 } 1041} 1042 1043int32_t 1044UnicodeString::doIndexOf(UChar32 c, 1045 int32_t start, 1046 int32_t length) const { 1047 // pin indices 1048 pinIndices(start, length); 1049 1050 // find the first occurrence of c 1051 const UChar *array = getArrayStart(); 1052 const UChar *match = u_memchr32(array + start, c, length); 1053 if(match == NULL) { 1054 return -1; 1055 } else { 1056 return (int32_t)(match - array); 1057 } 1058} 1059 1060int32_t 1061UnicodeString::lastIndexOf(const UChar *srcChars, 1062 int32_t srcStart, 1063 int32_t srcLength, 1064 int32_t start, 1065 int32_t length) const 1066{ 1067 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1068 return -1; 1069 } 1070 1071 // UnicodeString does not find empty substrings 1072 if(srcLength < 0 && srcChars[srcStart] == 0) { 1073 return -1; 1074 } 1075 1076 // get the indices within bounds 1077 pinIndices(start, length); 1078 1079 // find the last occurrence of the substring 1080 const UChar *array = getArrayStart(); 1081 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1082 if(match == NULL) { 1083 return -1; 1084 } else { 1085 return (int32_t)(match - array); 1086 } 1087} 1088 1089int32_t 1090UnicodeString::doLastIndexOf(UChar c, 1091 int32_t start, 1092 int32_t length) const 1093{ 1094 if(isBogus()) { 1095 return -1; 1096 } 1097 1098 // pin indices 1099 pinIndices(start, length); 1100 1101 // find the last occurrence of c 1102 const UChar *array = getArrayStart(); 1103 const UChar *match = u_memrchr(array + start, c, length); 1104 if(match == NULL) { 1105 return -1; 1106 } else { 1107 return (int32_t)(match - array); 1108 } 1109} 1110 1111int32_t 1112UnicodeString::doLastIndexOf(UChar32 c, 1113 int32_t start, 1114 int32_t length) const { 1115 // pin indices 1116 pinIndices(start, length); 1117 1118 // find the last occurrence of c 1119 const UChar *array = getArrayStart(); 1120 const UChar *match = u_memrchr32(array + start, c, length); 1121 if(match == NULL) { 1122 return -1; 1123 } else { 1124 return (int32_t)(match - array); 1125 } 1126} 1127 1128//======================================== 1129// Write implementation 1130//======================================== 1131 1132UnicodeString& 1133UnicodeString::findAndReplace(int32_t start, 1134 int32_t length, 1135 const UnicodeString& oldText, 1136 int32_t oldStart, 1137 int32_t oldLength, 1138 const UnicodeString& newText, 1139 int32_t newStart, 1140 int32_t newLength) 1141{ 1142 if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1143 return *this; 1144 } 1145 1146 pinIndices(start, length); 1147 oldText.pinIndices(oldStart, oldLength); 1148 newText.pinIndices(newStart, newLength); 1149 1150 if(oldLength == 0) { 1151 return *this; 1152 } 1153 1154 while(length > 0 && length >= oldLength) { 1155 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1156 if(pos < 0) { 1157 // no more oldText's here: done 1158 break; 1159 } else { 1160 // we found oldText, replace it by newText and go beyond it 1161 replace(pos, oldLength, newText, newStart, newLength); 1162 length -= pos + oldLength - start; 1163 start = pos + newLength; 1164 } 1165 } 1166 1167 return *this; 1168} 1169 1170 1171void 1172UnicodeString::setToBogus() 1173{ 1174 releaseArray(); 1175 1176 fUnion.fFields.fLengthAndFlags = kIsBogus; 1177 fUnion.fFields.fArray = 0; 1178 fUnion.fFields.fCapacity = 0; 1179} 1180 1181// turn a bogus string into an empty one 1182void 1183UnicodeString::unBogus() { 1184 if(fUnion.fFields.fLengthAndFlags & kIsBogus) { 1185 setToEmpty(); 1186 } 1187} 1188 1189const UChar * 1190UnicodeString::getTerminatedBuffer() { 1191 if(!isWritable()) { 1192 return 0; 1193 } 1194 UChar *array = getArrayStart(); 1195 int32_t len = length(); 1196 if(len < getCapacity()) { 1197 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) { 1198 // If len<capacity on a read-only alias, then array[len] is 1199 // either the original NUL (if constructed with (TRUE, s, length)) 1200 // or one of the original string contents characters (if later truncated), 1201 // therefore we can assume that array[len] is initialized memory. 1202 if(array[len] == 0) { 1203 return array; 1204 } 1205 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) { 1206 // kRefCounted: Do not write the NUL if the buffer is shared. 1207 // That is mostly safe, except when the length of one copy was modified 1208 // without copy-on-write, e.g., via truncate(newLength) or remove(void). 1209 // Then the NUL would be written into the middle of another copy's string. 1210 1211 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL. 1212 // Do not test if there is a NUL already because it might be uninitialized memory. 1213 // (That would be safe, but tools like valgrind & Purify would complain.) 1214 array[len] = 0; 1215 return array; 1216 } 1217 } 1218 if(cloneArrayIfNeeded(len+1)) { 1219 array = getArrayStart(); 1220 array[len] = 0; 1221 return array; 1222 } else { 1223 return NULL; 1224 } 1225} 1226 1227// setTo() analogous to the readonly-aliasing constructor with the same signature 1228UnicodeString & 1229UnicodeString::setTo(UBool isTerminated, 1230 const UChar *text, 1231 int32_t textLength) 1232{ 1233 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1234 // do not modify a string that has an "open" getBuffer(minCapacity) 1235 return *this; 1236 } 1237 1238 if(text == NULL) { 1239 // treat as an empty string, do not alias 1240 releaseArray(); 1241 setToEmpty(); 1242 return *this; 1243 } 1244 1245 if( textLength < -1 || 1246 (textLength == -1 && !isTerminated) || 1247 (textLength >= 0 && isTerminated && text[textLength] != 0) 1248 ) { 1249 setToBogus(); 1250 return *this; 1251 } 1252 1253 releaseArray(); 1254 1255 if(textLength == -1) { 1256 // text is terminated, or else it would have failed the above test 1257 textLength = u_strlen(text); 1258 } 1259 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 1260 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1261 return *this; 1262} 1263 1264// setTo() analogous to the writable-aliasing constructor with the same signature 1265UnicodeString & 1266UnicodeString::setTo(UChar *buffer, 1267 int32_t buffLength, 1268 int32_t buffCapacity) { 1269 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1270 // do not modify a string that has an "open" getBuffer(minCapacity) 1271 return *this; 1272 } 1273 1274 if(buffer == NULL) { 1275 // treat as an empty string, do not alias 1276 releaseArray(); 1277 setToEmpty(); 1278 return *this; 1279 } 1280 1281 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1282 setToBogus(); 1283 return *this; 1284 } else if(buffLength == -1) { 1285 // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1286 const UChar *p = buffer, *limit = buffer + buffCapacity; 1287 while(p != limit && *p != 0) { 1288 ++p; 1289 } 1290 buffLength = (int32_t)(p - buffer); 1291 } 1292 1293 releaseArray(); 1294 1295 fUnion.fFields.fLengthAndFlags = kWritableAlias; 1296 setArray(buffer, buffLength, buffCapacity); 1297 return *this; 1298} 1299 1300UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) { 1301 unBogus(); 1302 int32_t length = utf8.length(); 1303 int32_t capacity; 1304 // The UTF-16 string will be at most as long as the UTF-8 string. 1305 if(length <= US_STACKBUF_SIZE) { 1306 capacity = US_STACKBUF_SIZE; 1307 } else { 1308 capacity = length + 1; // +1 for the terminating NUL. 1309 } 1310 UChar *utf16 = getBuffer(capacity); 1311 int32_t length16; 1312 UErrorCode errorCode = U_ZERO_ERROR; 1313 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1314 utf8.data(), length, 1315 0xfffd, // Substitution character. 1316 NULL, // Don't care about number of substitutions. 1317 &errorCode); 1318 releaseBuffer(length16); 1319 if(U_FAILURE(errorCode)) { 1320 setToBogus(); 1321 } 1322 return *this; 1323} 1324 1325UnicodeString& 1326UnicodeString::setCharAt(int32_t offset, 1327 UChar c) 1328{ 1329 int32_t len = length(); 1330 if(cloneArrayIfNeeded() && len > 0) { 1331 if(offset < 0) { 1332 offset = 0; 1333 } else if(offset >= len) { 1334 offset = len - 1; 1335 } 1336 1337 getArrayStart()[offset] = c; 1338 } 1339 return *this; 1340} 1341 1342UnicodeString& 1343UnicodeString::replace(int32_t start, 1344 int32_t _length, 1345 UChar32 srcChar) { 1346 UChar buffer[U16_MAX_LENGTH]; 1347 int32_t count = 0; 1348 UBool isError = FALSE; 1349 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1350 // We test isError so that the compiler does not complain that we don't. 1351 // If isError (srcChar is not a valid code point) then count==0 which means 1352 // we remove the source segment rather than replacing it with srcChar. 1353 return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1354} 1355 1356UnicodeString& 1357UnicodeString::append(UChar32 srcChar) { 1358 UChar buffer[U16_MAX_LENGTH]; 1359 int32_t _length = 0; 1360 UBool isError = FALSE; 1361 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1362 // We test isError so that the compiler does not complain that we don't. 1363 // If isError then _length==0 which turns the doAppend() into a no-op anyway. 1364 return isError ? *this : doAppend(buffer, 0, _length); 1365} 1366 1367UnicodeString& 1368UnicodeString::doReplace( int32_t start, 1369 int32_t length, 1370 const UnicodeString& src, 1371 int32_t srcStart, 1372 int32_t srcLength) 1373{ 1374 // pin the indices to legal values 1375 src.pinIndices(srcStart, srcLength); 1376 1377 // get the characters from src 1378 // and replace the range in ourselves with them 1379 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1380} 1381 1382UnicodeString& 1383UnicodeString::doReplace(int32_t start, 1384 int32_t length, 1385 const UChar *srcChars, 1386 int32_t srcStart, 1387 int32_t srcLength) 1388{ 1389 if(!isWritable()) { 1390 return *this; 1391 } 1392 1393 int32_t oldLength = this->length(); 1394 1395 // optimize (read-only alias).remove(0, start) and .remove(start, end) 1396 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) { 1397 if(start == 0) { 1398 // remove prefix by adjusting the array pointer 1399 pinIndex(length); 1400 fUnion.fFields.fArray += length; 1401 fUnion.fFields.fCapacity -= length; 1402 setLength(oldLength - length); 1403 return *this; 1404 } else { 1405 pinIndex(start); 1406 if(length >= (oldLength - start)) { 1407 // remove suffix by reducing the length (like truncate()) 1408 setLength(start); 1409 fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1410 return *this; 1411 } 1412 } 1413 } 1414 1415 if(start == oldLength) { 1416 return doAppend(srcChars, srcStart, srcLength); 1417 } 1418 1419 if(srcChars == 0) { 1420 srcStart = srcLength = 0; 1421 } else if(srcLength < 0) { 1422 // get the srcLength if necessary 1423 srcLength = u_strlen(srcChars + srcStart); 1424 } 1425 1426 // pin the indices to legal values 1427 pinIndices(start, length); 1428 1429 // calculate the size of the string after the replace 1430 int32_t newLength = oldLength - length + srcLength; 1431 1432 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents; 1433 // therefore we need to keep the current fArray 1434 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1435 UChar *oldArray; 1436 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1437 // copy the stack buffer contents because it will be overwritten with 1438 // fUnion.fFields values 1439 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength); 1440 oldArray = oldStackBuffer; 1441 } else { 1442 oldArray = getArrayStart(); 1443 } 1444 1445 // clone our array and allocate a bigger array if needed 1446 int32_t *bufferToDelete = 0; 1447 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, 1448 FALSE, &bufferToDelete) 1449 ) { 1450 return *this; 1451 } 1452 1453 // now do the replace 1454 1455 UChar *newArray = getArrayStart(); 1456 if(newArray != oldArray) { 1457 // if fArray changed, then we need to copy everything except what will change 1458 us_arrayCopy(oldArray, 0, newArray, 0, start); 1459 us_arrayCopy(oldArray, start + length, 1460 newArray, start + srcLength, 1461 oldLength - (start + length)); 1462 } else if(length != srcLength) { 1463 // fArray did not change; copy only the portion that isn't changing, leaving a hole 1464 us_arrayCopy(oldArray, start + length, 1465 newArray, start + srcLength, 1466 oldLength - (start + length)); 1467 } 1468 1469 // now fill in the hole with the new string 1470 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); 1471 1472 setLength(newLength); 1473 1474 // delayed delete in case srcChars == fArray when we started, and 1475 // to keep oldArray alive for the above operations 1476 if (bufferToDelete) { 1477 uprv_free(bufferToDelete); 1478 } 1479 1480 return *this; 1481} 1482 1483// Versions of doReplace() only for append() variants. 1484// doReplace() and doAppend() optimize for different cases. 1485 1486UnicodeString& 1487UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) { 1488 if(srcLength == 0) { 1489 return *this; 1490 } 1491 1492 // pin the indices to legal values 1493 src.pinIndices(srcStart, srcLength); 1494 return doAppend(src.getArrayStart(), srcStart, srcLength); 1495} 1496 1497UnicodeString& 1498UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) { 1499 if(!isWritable() || srcLength == 0 || srcChars == NULL) { 1500 return *this; 1501 } 1502 1503 if(srcLength < 0) { 1504 // get the srcLength if necessary 1505 if((srcLength = u_strlen(srcChars + srcStart)) == 0) { 1506 return *this; 1507 } 1508 } 1509 1510 int32_t oldLength = length(); 1511 int32_t newLength = oldLength + srcLength; 1512 // optimize append() onto a large-enough, owned string 1513 if((newLength <= getCapacity() && isBufferWritable()) || 1514 cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) { 1515 UChar *newArray = getArrayStart(); 1516 // Do not copy characters when 1517 // UChar *buffer=str.getAppendBuffer(...); 1518 // is followed by 1519 // str.append(buffer, length); 1520 // or 1521 // str.appendString(buffer, length) 1522 // or similar. 1523 if(srcChars + srcStart != newArray + oldLength) { 1524 us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength); 1525 } 1526 setLength(newLength); 1527 } 1528 return *this; 1529} 1530 1531/** 1532 * Replaceable API 1533 */ 1534void 1535UnicodeString::handleReplaceBetween(int32_t start, 1536 int32_t limit, 1537 const UnicodeString& text) { 1538 replaceBetween(start, limit, text); 1539} 1540 1541/** 1542 * Replaceable API 1543 */ 1544void 1545UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1546 if (limit <= start) { 1547 return; // Nothing to do; avoid bogus malloc call 1548 } 1549 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); 1550 // Check to make sure text is not null. 1551 if (text != NULL) { 1552 extractBetween(start, limit, text, 0); 1553 insert(dest, text, 0, limit - start); 1554 uprv_free(text); 1555 } 1556} 1557 1558/** 1559 * Replaceable API 1560 * 1561 * NOTE: This is for the Replaceable class. There is no rep.cpp, 1562 * so we implement this function here. 1563 */ 1564UBool Replaceable::hasMetaData() const { 1565 return TRUE; 1566} 1567 1568/** 1569 * Replaceable API 1570 */ 1571UBool UnicodeString::hasMetaData() const { 1572 return FALSE; 1573} 1574 1575UnicodeString& 1576UnicodeString::doReverse(int32_t start, int32_t length) { 1577 if(length <= 1 || !cloneArrayIfNeeded()) { 1578 return *this; 1579 } 1580 1581 // pin the indices to legal values 1582 pinIndices(start, length); 1583 if(length <= 1) { // pinIndices() might have shrunk the length 1584 return *this; 1585 } 1586 1587 UChar *left = getArrayStart() + start; 1588 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1589 UChar swap; 1590 UBool hasSupplementary = FALSE; 1591 1592 // Before the loop we know left<right because length>=2. 1593 do { 1594 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); 1595 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); 1596 *right-- = swap; 1597 } while(left < right); 1598 // Make sure to test the middle code unit of an odd-length string. 1599 // Redundant if the length is even. 1600 hasSupplementary |= (UBool)U16_IS_LEAD(*left); 1601 1602 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1603 if(hasSupplementary) { 1604 UChar swap2; 1605 1606 left = getArrayStart() + start; 1607 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1608 while(left < right) { 1609 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1610 *left++ = swap2; 1611 *left++ = swap; 1612 } else { 1613 ++left; 1614 } 1615 } 1616 } 1617 1618 return *this; 1619} 1620 1621UBool 1622UnicodeString::padLeading(int32_t targetLength, 1623 UChar padChar) 1624{ 1625 int32_t oldLength = length(); 1626 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1627 return FALSE; 1628 } else { 1629 // move contents up by padding width 1630 UChar *array = getArrayStart(); 1631 int32_t start = targetLength - oldLength; 1632 us_arrayCopy(array, 0, array, start, oldLength); 1633 1634 // fill in padding character 1635 while(--start >= 0) { 1636 array[start] = padChar; 1637 } 1638 setLength(targetLength); 1639 return TRUE; 1640 } 1641} 1642 1643UBool 1644UnicodeString::padTrailing(int32_t targetLength, 1645 UChar padChar) 1646{ 1647 int32_t oldLength = length(); 1648 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1649 return FALSE; 1650 } else { 1651 // fill in padding character 1652 UChar *array = getArrayStart(); 1653 int32_t length = targetLength; 1654 while(--length >= oldLength) { 1655 array[length] = padChar; 1656 } 1657 setLength(targetLength); 1658 return TRUE; 1659 } 1660} 1661 1662//======================================== 1663// Hashing 1664//======================================== 1665int32_t 1666UnicodeString::doHashCode() const 1667{ 1668 /* Delegate hash computation to uhash. This makes UnicodeString 1669 * hashing consistent with UChar* hashing. */ 1670 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1671 if (hashCode == kInvalidHashCode) { 1672 hashCode = kEmptyHashCode; 1673 } 1674 return hashCode; 1675} 1676 1677//======================================== 1678// External Buffer 1679//======================================== 1680 1681UChar * 1682UnicodeString::getBuffer(int32_t minCapacity) { 1683 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1684 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer; 1685 setZeroLength(); 1686 return getArrayStart(); 1687 } else { 1688 return 0; 1689 } 1690} 1691 1692void 1693UnicodeString::releaseBuffer(int32_t newLength) { 1694 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) { 1695 // set the new fLength 1696 int32_t capacity=getCapacity(); 1697 if(newLength==-1) { 1698 // the new length is the string length, capped by fCapacity 1699 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; 1700 while(p<limit && *p!=0) { 1701 ++p; 1702 } 1703 newLength=(int32_t)(p-array); 1704 } else if(newLength>capacity) { 1705 newLength=capacity; 1706 } 1707 setLength(newLength); 1708 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer; 1709 } 1710} 1711 1712//======================================== 1713// Miscellaneous 1714//======================================== 1715UBool 1716UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1717 int32_t growCapacity, 1718 UBool doCopyArray, 1719 int32_t **pBufferToDelete, 1720 UBool forceClone) { 1721 // default parameters need to be static, therefore 1722 // the defaults are -1 to have convenience defaults 1723 if(newCapacity == -1) { 1724 newCapacity = getCapacity(); 1725 } 1726 1727 // while a getBuffer(minCapacity) is "open", 1728 // prevent any modifications of the string by returning FALSE here 1729 // if the string is bogus, then only an assignment or similar can revive it 1730 if(!isWritable()) { 1731 return FALSE; 1732 } 1733 1734 /* 1735 * We need to make a copy of the array if 1736 * the buffer is read-only, or 1737 * the buffer is refCounted (shared), and refCount>1, or 1738 * the buffer is too small. 1739 * Return FALSE if memory could not be allocated. 1740 */ 1741 if(forceClone || 1742 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly || 1743 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) || 1744 newCapacity > getCapacity() 1745 ) { 1746 // check growCapacity for default value and use of the stack buffer 1747 if(growCapacity < 0) { 1748 growCapacity = newCapacity; 1749 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1750 growCapacity = US_STACKBUF_SIZE; 1751 } 1752 1753 // save old values 1754 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1755 UChar *oldArray; 1756 int32_t oldLength = length(); 1757 int16_t flags = fUnion.fFields.fLengthAndFlags; 1758 1759 if(flags&kUsingStackBuffer) { 1760 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1761 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1762 // copy the stack buffer contents because it will be overwritten with 1763 // fUnion.fFields values 1764 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength); 1765 oldArray = oldStackBuffer; 1766 } else { 1767 oldArray = NULL; // no need to copy from the stack buffer to itself 1768 } 1769 } else { 1770 oldArray = fUnion.fFields.fArray; 1771 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ 1772 } 1773 1774 // allocate a new array 1775 if(allocate(growCapacity) || 1776 (newCapacity < growCapacity && allocate(newCapacity)) 1777 ) { 1778 if(doCopyArray) { 1779 // copy the contents 1780 // do not copy more than what fits - it may be smaller than before 1781 int32_t minLength = oldLength; 1782 newCapacity = getCapacity(); 1783 if(newCapacity < minLength) { 1784 minLength = newCapacity; 1785 } 1786 if(oldArray != NULL) { 1787 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1788 } 1789 setLength(minLength); 1790 } else { 1791 setZeroLength(); 1792 } 1793 1794 // release the old array 1795 if(flags & kRefCounted) { 1796 // the array is refCounted; decrement and release if 0 1797 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1); 1798 if(umtx_atomic_dec(pRefCount) == 0) { 1799 if(pBufferToDelete == 0) { 1800 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t 1801 // is defined as volatile. (Volatile has useful non-standard behavior 1802 // with this compiler.) 1803 uprv_free((void *)pRefCount); 1804 } else { 1805 // the caller requested to delete it himself 1806 *pBufferToDelete = (int32_t *)pRefCount; 1807 } 1808 } 1809 } 1810 } else { 1811 // not enough memory for growCapacity and not even for the smaller newCapacity 1812 // reset the old values for setToBogus() to release the array 1813 if(!(flags&kUsingStackBuffer)) { 1814 fUnion.fFields.fArray = oldArray; 1815 } 1816 fUnion.fFields.fLengthAndFlags = flags; 1817 setToBogus(); 1818 return FALSE; 1819 } 1820 } 1821 return TRUE; 1822} 1823 1824// UnicodeStringAppendable ------------------------------------------------- *** 1825 1826UnicodeStringAppendable::~UnicodeStringAppendable() {} 1827 1828UBool 1829UnicodeStringAppendable::appendCodeUnit(UChar c) { 1830 return str.doAppend(&c, 0, 1).isWritable(); 1831} 1832 1833UBool 1834UnicodeStringAppendable::appendCodePoint(UChar32 c) { 1835 UChar buffer[U16_MAX_LENGTH]; 1836 int32_t cLength = 0; 1837 UBool isError = FALSE; 1838 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 1839 return !isError && str.doAppend(buffer, 0, cLength).isWritable(); 1840} 1841 1842UBool 1843UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { 1844 return str.doAppend(s, 0, length).isWritable(); 1845} 1846 1847UBool 1848UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 1849 return str.cloneArrayIfNeeded(str.length() + appendCapacity); 1850} 1851 1852UChar * 1853UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 1854 int32_t desiredCapacityHint, 1855 UChar *scratch, int32_t scratchCapacity, 1856 int32_t *resultCapacity) { 1857 if(minCapacity < 1 || scratchCapacity < minCapacity) { 1858 *resultCapacity = 0; 1859 return NULL; 1860 } 1861 int32_t oldLength = str.length(); 1862 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 1863 *resultCapacity = str.getCapacity() - oldLength; 1864 return str.getArrayStart() + oldLength; 1865 } 1866 *resultCapacity = scratchCapacity; 1867 return scratch; 1868} 1869 1870U_NAMESPACE_END 1871 1872U_NAMESPACE_USE 1873 1874U_CAPI int32_t U_EXPORT2 1875uhash_hashUnicodeString(const UElement key) { 1876 const UnicodeString *str = (const UnicodeString*) key.pointer; 1877 return (str == NULL) ? 0 : str->hashCode(); 1878} 1879 1880// Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 1881// does not depend on hashtable code. 1882U_CAPI UBool U_EXPORT2 1883uhash_compareUnicodeString(const UElement key1, const UElement key2) { 1884 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 1885 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 1886 if (str1 == str2) { 1887 return TRUE; 1888 } 1889 if (str1 == NULL || str2 == NULL) { 1890 return FALSE; 1891 } 1892 return *str1 == *str2; 1893} 1894 1895#ifdef U_STATIC_IMPLEMENTATION 1896/* 1897This should never be called. It is defined here to make sure that the 1898virtual vector deleting destructor is defined within unistr.cpp. 1899The vector deleting destructor is already a part of UObject, 1900but defining it here makes sure that it is included with this object file. 1901This makes sure that static library dependencies are kept to a minimum. 1902*/ 1903static void uprv_UnicodeStringDummy(void) { 1904 delete [] (new UnicodeString[2]); 1905} 1906#endif 1907