1/* 2 * (C) 1999 Lars Knoll (knoll@kde.org) 3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved. 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 */ 21 22#include "config.h" 23#include "WTFString.h" 24 25#include "IntegerToStringConversion.h" 26#include <stdarg.h> 27#include "wtf/ASCIICType.h" 28#include "wtf/DataLog.h" 29#include "wtf/HexNumber.h" 30#include "wtf/MathExtras.h" 31#include "wtf/text/CString.h" 32#include "wtf/StringExtras.h" 33#include "wtf/Vector.h" 34#include "wtf/dtoa.h" 35#include "wtf/unicode/CharacterNames.h" 36#include "wtf/unicode/UTF8.h" 37#include "wtf/unicode/Unicode.h" 38 39using namespace std; 40 41namespace WTF { 42 43using namespace Unicode; 44using namespace std; 45 46// Construct a string with UTF-16 data. 47String::String(const UChar* characters, unsigned length) 48 : m_impl(characters ? StringImpl::create(characters, length) : nullptr) 49{ 50} 51 52// Construct a string with UTF-16 data, from a null-terminated source. 53String::String(const UChar* str) 54{ 55 if (!str) 56 return; 57 m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str)); 58} 59 60// Construct a string with latin1 data. 61String::String(const LChar* characters, unsigned length) 62 : m_impl(characters ? StringImpl::create(characters, length) : nullptr) 63{ 64} 65 66String::String(const char* characters, unsigned length) 67 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : nullptr) 68{ 69} 70 71// Construct a string with latin1 data, from a null-terminated source. 72String::String(const LChar* characters) 73 : m_impl(characters ? StringImpl::create(characters) : nullptr) 74{ 75} 76 77String::String(const char* characters) 78 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : nullptr) 79{ 80} 81 82void String::append(const String& string) 83{ 84 if (string.isEmpty()) 85 return; 86 if (!m_impl) { 87 m_impl = string.m_impl; 88 return; 89 } 90 91 // FIXME: This is extremely inefficient. So much so that we might want to take this 92 // out of String's API. We can make it better by optimizing the case where exactly 93 // one String is pointing at this StringImpl, but even then it's going to require a 94 // call into the allocator every single time. 95 96 if (m_impl->is8Bit() && string.m_impl->is8Bit()) { 97 LChar* data; 98 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length()); 99 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data); 100 memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar)); 101 memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar)); 102 m_impl = newImpl.release(); 103 return; 104 } 105 106 UChar* data; 107 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length()); 108 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data); 109 110 if (m_impl->is8Bit()) 111 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); 112 else 113 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); 114 115 if (string.impl()->is8Bit()) 116 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length()); 117 else 118 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length()); 119 120 m_impl = newImpl.release(); 121} 122 123template <typename CharacterType> 124inline void String::appendInternal(CharacterType c) 125{ 126 // FIXME: This is extremely inefficient. So much so that we might want to take this 127 // out of String's API. We can make it better by optimizing the case where exactly 128 // one String is pointing at this StringImpl, but even then it's going to require a 129 // call into the allocator every single time. 130 if (!m_impl) { 131 m_impl = StringImpl::create(&c, 1); 132 return; 133 } 134 135 UChar* data; // FIXME: We should be able to create an 8 bit string via this code path. 136 RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max()); 137 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data); 138 if (m_impl->is8Bit()) 139 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); 140 else 141 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); 142 data[m_impl->length()] = c; 143 m_impl = newImpl.release(); 144} 145 146void String::append(LChar c) 147{ 148 appendInternal(c); 149} 150 151void String::append(UChar c) 152{ 153 appendInternal(c); 154} 155 156int codePointCompare(const String& a, const String& b) 157{ 158 return codePointCompare(a.impl(), b.impl()); 159} 160 161void String::insert(const String& string, unsigned position) 162{ 163 if (string.isEmpty()) { 164 if (string.isNull()) 165 return; 166 if (isNull()) 167 m_impl = string.impl(); 168 return; 169 } 170 171 if (string.is8Bit()) 172 insert(string.impl()->characters8(), string.length(), position); 173 else 174 insert(string.impl()->characters16(), string.length(), position); 175} 176 177void String::append(const LChar* charactersToAppend, unsigned lengthToAppend) 178{ 179 if (!m_impl) { 180 if (!charactersToAppend) 181 return; 182 m_impl = StringImpl::create(charactersToAppend, lengthToAppend); 183 return; 184 } 185 186 if (!lengthToAppend) 187 return; 188 189 ASSERT(charactersToAppend); 190 191 unsigned strLength = m_impl->length(); 192 193 if (m_impl->is8Bit()) { 194 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 195 LChar* data; 196 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data); 197 StringImpl::copyChars(data, m_impl->characters8(), strLength); 198 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 199 m_impl = newImpl.release(); 200 return; 201 } 202 203 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 204 UChar* data; 205 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data); 206 StringImpl::copyChars(data, m_impl->characters16(), strLength); 207 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 208 m_impl = newImpl.release(); 209} 210 211void String::append(const UChar* charactersToAppend, unsigned lengthToAppend) 212{ 213 if (!m_impl) { 214 if (!charactersToAppend) 215 return; 216 m_impl = StringImpl::create(charactersToAppend, lengthToAppend); 217 return; 218 } 219 220 if (!lengthToAppend) 221 return; 222 223 unsigned strLength = m_impl->length(); 224 225 ASSERT(charactersToAppend); 226 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 227 UChar* data; 228 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data); 229 if (m_impl->is8Bit()) 230 StringImpl::copyChars(data, characters8(), strLength); 231 else 232 StringImpl::copyChars(data, characters16(), strLength); 233 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 234 m_impl = newImpl.release(); 235} 236 237template<typename CharType> 238PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position) 239{ 240 if (!lengthToInsert) 241 return impl; 242 243 ASSERT(charactersToInsert); 244 UChar* data; // FIXME: We should be able to create an 8 bit string here. 245 RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length()); 246 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data); 247 248 if (impl->is8Bit()) 249 StringImpl::copyChars(data, impl->characters8(), position); 250 else 251 StringImpl::copyChars(data, impl->characters16(), position); 252 253 StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert); 254 255 if (impl->is8Bit()) 256 StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position); 257 else 258 StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position); 259 260 return newImpl.release(); 261} 262 263void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position) 264{ 265 if (position >= length()) { 266 append(charactersToInsert, lengthToInsert); 267 return; 268 } 269 ASSERT(m_impl); 270 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position); 271} 272 273void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position) 274{ 275 if (position >= length()) { 276 append(charactersToInsert, lengthToInsert); 277 return; 278 } 279 ASSERT(m_impl); 280 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position); 281} 282 283UChar32 String::characterStartingAt(unsigned i) const 284{ 285 if (!m_impl || i >= m_impl->length()) 286 return 0; 287 return m_impl->characterStartingAt(i); 288} 289 290void String::ensure16Bit() 291{ 292 unsigned length = this->length(); 293 if (!length || !is8Bit()) 294 return; 295 m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl(); 296} 297 298void String::truncate(unsigned position) 299{ 300 if (position >= length()) 301 return; 302 if (m_impl->is8Bit()) { 303 LChar* data; 304 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data); 305 memcpy(data, m_impl->characters8(), position * sizeof(LChar)); 306 m_impl = newImpl.release(); 307 } else { 308 UChar* data; 309 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data); 310 memcpy(data, m_impl->characters16(), position * sizeof(UChar)); 311 m_impl = newImpl.release(); 312 } 313} 314 315template <typename CharacterType> 316inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove) 317{ 318 CharacterType* data; 319 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data); 320 memcpy(data, characters, position * sizeof(CharacterType)); 321 memcpy(data + position, characters + position + lengthToRemove, 322 (length() - lengthToRemove - position) * sizeof(CharacterType)); 323 324 m_impl = newImpl.release(); 325} 326 327void String::remove(unsigned position, int lengthToRemove) 328{ 329 if (lengthToRemove <= 0) 330 return; 331 if (position >= length()) 332 return; 333 if (static_cast<unsigned>(lengthToRemove) > length() - position) 334 lengthToRemove = length() - position; 335 336 if (is8Bit()) { 337 removeInternal(characters8(), position, lengthToRemove); 338 339 return; 340 } 341 342 removeInternal(characters16(), position, lengthToRemove); 343} 344 345String String::substring(unsigned pos, unsigned len) const 346{ 347 if (!m_impl) 348 return String(); 349 return m_impl->substring(pos, len); 350} 351 352String String::lower() const 353{ 354 if (!m_impl) 355 return String(); 356 return m_impl->lower(); 357} 358 359String String::upper() const 360{ 361 if (!m_impl) 362 return String(); 363 return m_impl->upper(); 364} 365 366String String::lower(const AtomicString& localeIdentifier) const 367{ 368 if (!m_impl) 369 return String(); 370 return m_impl->lower(localeIdentifier); 371} 372 373String String::upper(const AtomicString& localeIdentifier) const 374{ 375 if (!m_impl) 376 return String(); 377 return m_impl->upper(localeIdentifier); 378} 379 380String String::stripWhiteSpace() const 381{ 382 if (!m_impl) 383 return String(); 384 return m_impl->stripWhiteSpace(); 385} 386 387String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const 388{ 389 if (!m_impl) 390 return String(); 391 return m_impl->stripWhiteSpace(isWhiteSpace); 392} 393 394String String::simplifyWhiteSpace(StripBehavior stripBehavior) const 395{ 396 if (!m_impl) 397 return String(); 398 return m_impl->simplifyWhiteSpace(stripBehavior); 399} 400 401String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, StripBehavior stripBehavior) const 402{ 403 if (!m_impl) 404 return String(); 405 return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior); 406} 407 408String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const 409{ 410 if (!m_impl) 411 return String(); 412 return m_impl->removeCharacters(findMatch); 413} 414 415String String::foldCase() const 416{ 417 if (!m_impl) 418 return String(); 419 return m_impl->foldCase(); 420} 421 422bool String::percentage(int& result) const 423{ 424 if (!m_impl || !m_impl->length()) 425 return false; 426 427 if ((*m_impl)[m_impl->length() - 1] != '%') 428 return false; 429 430 if (m_impl->is8Bit()) 431 result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1); 432 else 433 result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1); 434 435 return true; 436} 437 438Vector<UChar> String::charactersWithNullTermination() const 439{ 440 if (!m_impl) 441 return Vector<UChar>(); 442 443 Vector<UChar> result; 444 result.reserveInitialCapacity(length() + 1); 445 appendTo(result); 446 result.append(0); 447 return result; 448} 449 450unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const 451{ 452 unsigned length = this->length(); 453 RELEASE_ASSERT(pos <= length); 454 unsigned numCharacters = std::min(length - pos, maxLength); 455 if (!numCharacters) 456 return 0; 457 if (is8Bit()) 458 StringImpl::copyChars(buffer, characters8() + pos, numCharacters); 459 else 460 StringImpl::copyChars(buffer, characters16() + pos, numCharacters); 461 return numCharacters; 462} 463 464String String::format(const char *format, ...) 465{ 466 va_list args; 467 va_start(args, format); 468 469 Vector<char, 256> buffer; 470 471 // Do the format once to get the length. 472#if COMPILER(MSVC) 473 int result = _vscprintf(format, args); 474#else 475 char ch; 476 int result = vsnprintf(&ch, 1, format, args); 477 // We need to call va_end() and then va_start() again here, as the 478 // contents of args is undefined after the call to vsnprintf 479 // according to http://man.cx/snprintf(3) 480 // 481 // Not calling va_end/va_start here happens to work on lots of 482 // systems, but fails e.g. on 64bit Linux. 483 va_end(args); 484 va_start(args, format); 485#endif 486 487 if (result == 0) 488 return String(""); 489 if (result < 0) 490 return String(); 491 unsigned len = result; 492 buffer.grow(len + 1); 493 494 // Now do the formatting again, guaranteed to fit. 495 vsnprintf(buffer.data(), buffer.size(), format, args); 496 497 va_end(args); 498 499 return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len); 500} 501 502String String::number(int number) 503{ 504 return numberToStringSigned<String>(number); 505} 506 507String String::number(unsigned number) 508{ 509 return numberToStringUnsigned<String>(number); 510} 511 512String String::number(long number) 513{ 514 return numberToStringSigned<String>(number); 515} 516 517String String::number(unsigned long number) 518{ 519 return numberToStringUnsigned<String>(number); 520} 521 522String String::number(long long number) 523{ 524 return numberToStringSigned<String>(number); 525} 526 527String String::number(unsigned long long number) 528{ 529 return numberToStringUnsigned<String>(number); 530} 531 532String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy) 533{ 534 NumberToStringBuffer buffer; 535 return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros)); 536} 537 538String String::numberToStringECMAScript(double number) 539{ 540 NumberToStringBuffer buffer; 541 return String(numberToString(number, buffer)); 542} 543 544String String::numberToStringFixedWidth(double number, unsigned decimalPlaces) 545{ 546 NumberToStringBuffer buffer; 547 return String(numberToFixedWidthString(number, decimalPlaces, buffer)); 548} 549 550int String::toIntStrict(bool* ok, int base) const 551{ 552 if (!m_impl) { 553 if (ok) 554 *ok = false; 555 return 0; 556 } 557 return m_impl->toIntStrict(ok, base); 558} 559 560unsigned String::toUIntStrict(bool* ok, int base) const 561{ 562 if (!m_impl) { 563 if (ok) 564 *ok = false; 565 return 0; 566 } 567 return m_impl->toUIntStrict(ok, base); 568} 569 570int64_t String::toInt64Strict(bool* ok, int base) const 571{ 572 if (!m_impl) { 573 if (ok) 574 *ok = false; 575 return 0; 576 } 577 return m_impl->toInt64Strict(ok, base); 578} 579 580uint64_t String::toUInt64Strict(bool* ok, int base) const 581{ 582 if (!m_impl) { 583 if (ok) 584 *ok = false; 585 return 0; 586 } 587 return m_impl->toUInt64Strict(ok, base); 588} 589 590intptr_t String::toIntPtrStrict(bool* ok, int base) const 591{ 592 if (!m_impl) { 593 if (ok) 594 *ok = false; 595 return 0; 596 } 597 return m_impl->toIntPtrStrict(ok, base); 598} 599 600int String::toInt(bool* ok) const 601{ 602 if (!m_impl) { 603 if (ok) 604 *ok = false; 605 return 0; 606 } 607 return m_impl->toInt(ok); 608} 609 610unsigned String::toUInt(bool* ok) const 611{ 612 if (!m_impl) { 613 if (ok) 614 *ok = false; 615 return 0; 616 } 617 return m_impl->toUInt(ok); 618} 619 620int64_t String::toInt64(bool* ok) const 621{ 622 if (!m_impl) { 623 if (ok) 624 *ok = false; 625 return 0; 626 } 627 return m_impl->toInt64(ok); 628} 629 630uint64_t String::toUInt64(bool* ok) const 631{ 632 if (!m_impl) { 633 if (ok) 634 *ok = false; 635 return 0; 636 } 637 return m_impl->toUInt64(ok); 638} 639 640intptr_t String::toIntPtr(bool* ok) const 641{ 642 if (!m_impl) { 643 if (ok) 644 *ok = false; 645 return 0; 646 } 647 return m_impl->toIntPtr(ok); 648} 649 650double String::toDouble(bool* ok) const 651{ 652 if (!m_impl) { 653 if (ok) 654 *ok = false; 655 return 0.0; 656 } 657 return m_impl->toDouble(ok); 658} 659 660float String::toFloat(bool* ok) const 661{ 662 if (!m_impl) { 663 if (ok) 664 *ok = false; 665 return 0.0f; 666 } 667 return m_impl->toFloat(ok); 668} 669 670String String::isolatedCopy() const 671{ 672 if (!m_impl) 673 return String(); 674 return m_impl->isolatedCopy(); 675} 676 677bool String::isSafeToSendToAnotherThread() const 678{ 679 if (!impl()) 680 return true; 681 if (impl()->isStatic()) 682 return true; 683 // AtomicStrings are not safe to send between threads as ~StringImpl() 684 // will try to remove them from the wrong AtomicStringTable. 685 if (impl()->isAtomic()) 686 return false; 687 if (impl()->hasOneRef()) 688 return true; 689 return false; 690} 691 692void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const 693{ 694 result.clear(); 695 696 unsigned startPos = 0; 697 size_t endPos; 698 while ((endPos = find(separator, startPos)) != kNotFound) { 699 if (allowEmptyEntries || startPos != endPos) 700 result.append(substring(startPos, endPos - startPos)); 701 startPos = endPos + separator.length(); 702 } 703 if (allowEmptyEntries || startPos != length()) 704 result.append(substring(startPos)); 705} 706 707void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const 708{ 709 result.clear(); 710 711 unsigned startPos = 0; 712 size_t endPos; 713 while ((endPos = find(separator, startPos)) != kNotFound) { 714 if (allowEmptyEntries || startPos != endPos) 715 result.append(substring(startPos, endPos - startPos)); 716 startPos = endPos + 1; 717 } 718 if (allowEmptyEntries || startPos != length()) 719 result.append(substring(startPos)); 720} 721 722CString String::ascii() const 723{ 724 // Printable ASCII characters 32..127 and the null character are 725 // preserved, characters outside of this range are converted to '?'. 726 727 unsigned length = this->length(); 728 if (!length) { 729 char* characterBuffer; 730 return CString::newUninitialized(length, characterBuffer); 731 } 732 733 if (this->is8Bit()) { 734 const LChar* characters = this->characters8(); 735 736 char* characterBuffer; 737 CString result = CString::newUninitialized(length, characterBuffer); 738 739 for (unsigned i = 0; i < length; ++i) { 740 LChar ch = characters[i]; 741 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; 742 } 743 744 return result; 745 } 746 747 const UChar* characters = this->characters16(); 748 749 char* characterBuffer; 750 CString result = CString::newUninitialized(length, characterBuffer); 751 752 for (unsigned i = 0; i < length; ++i) { 753 UChar ch = characters[i]; 754 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; 755 } 756 757 return result; 758} 759 760CString String::latin1() const 761{ 762 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are 763 // preserved, characters outside of this range are converted to '?'. 764 765 unsigned length = this->length(); 766 767 if (!length) 768 return CString("", 0); 769 770 if (is8Bit()) 771 return CString(reinterpret_cast<const char*>(this->characters8()), length); 772 773 const UChar* characters = this->characters16(); 774 775 char* characterBuffer; 776 CString result = CString::newUninitialized(length, characterBuffer); 777 778 for (unsigned i = 0; i < length; ++i) { 779 UChar ch = characters[i]; 780 characterBuffer[i] = ch > 0xff ? '?' : ch; 781 } 782 783 return result; 784} 785 786// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 787static inline void putUTF8Triple(char*& buffer, UChar ch) 788{ 789 ASSERT(ch >= 0x0800); 790 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 791 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 792 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 793} 794 795CString String::utf8(UTF8ConversionMode mode) const 796{ 797 unsigned length = this->length(); 798 799 if (!length) 800 return CString("", 0); 801 802 // Allocate a buffer big enough to hold all the characters 803 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 804 // Optimization ideas, if we find this function is hot: 805 // * We could speculatively create a CStringBuffer to contain 'length' 806 // characters, and resize if necessary (i.e. if the buffer contains 807 // non-ascii characters). (Alternatively, scan the buffer first for 808 // ascii characters, so we know this will be sufficient). 809 // * We could allocate a CStringBuffer with an appropriate size to 810 // have a good chance of being able to write the string into the 811 // buffer without reallocing (say, 1.5 x length). 812 if (length > numeric_limits<unsigned>::max() / 3) 813 return CString(); 814 Vector<char, 1024> bufferVector(length * 3); 815 816 char* buffer = bufferVector.data(); 817 818 if (is8Bit()) { 819 const LChar* characters = this->characters8(); 820 821 ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size()); 822 ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion 823 } else { 824 const UChar* characters = this->characters16(); 825 826 if (mode == StrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) { 827 const UChar* charactersEnd = characters + length; 828 char* bufferEnd = buffer + bufferVector.size(); 829 while (characters < charactersEnd) { 830 // Use strict conversion to detect unpaired surrogates. 831 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true); 832 ASSERT(result != targetExhausted); 833 // Conversion fails when there is an unpaired surrogate. 834 // Put replacement character (U+FFFD) instead of the unpaired surrogate. 835 if (result != conversionOK) { 836 ASSERT((0xD800 <= *characters && *characters <= 0xDFFF)); 837 // There should be room left, since one UChar hasn't been converted. 838 ASSERT((buffer + 3) <= bufferEnd); 839 putUTF8Triple(buffer, replacementCharacter); 840 ++characters; 841 } 842 } 843 } else { 844 bool strict = mode == StrictUTF8Conversion; 845 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); 846 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion 847 848 // Only produced from strict conversion. 849 if (result == sourceIllegal) { 850 ASSERT(strict); 851 return CString(); 852 } 853 854 // Check for an unconverted high surrogate. 855 if (result == sourceExhausted) { 856 if (strict) 857 return CString(); 858 // This should be one unpaired high surrogate. Treat it the same 859 // was as an unpaired high surrogate would have been handled in 860 // the middle of a string with non-strict conversion - which is 861 // to say, simply encode it to UTF-8. 862 ASSERT((characters + 1) == (this->characters16() + length)); 863 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); 864 // There should be room left, since one UChar hasn't been converted. 865 ASSERT((buffer + 3) <= (buffer + bufferVector.size())); 866 putUTF8Triple(buffer, *characters); 867 } 868 } 869 } 870 871 return CString(bufferVector.data(), buffer - bufferVector.data()); 872} 873 874String String::make8BitFrom16BitSource(const UChar* source, size_t length) 875{ 876 if (!length) 877 return emptyString(); 878 879 LChar* destination; 880 String result = String::createUninitialized(length, destination); 881 882 copyLCharsFromUCharSource(destination, source, length); 883 884 return result; 885} 886 887String String::make16BitFrom8BitSource(const LChar* source, size_t length) 888{ 889 if (!length) 890 return emptyString16Bit(); 891 892 UChar* destination; 893 String result = String::createUninitialized(length, destination); 894 895 StringImpl::copyChars(destination, source, length); 896 897 return result; 898} 899 900String String::fromUTF8(const LChar* stringStart, size_t length) 901{ 902 RELEASE_ASSERT(length <= numeric_limits<unsigned>::max()); 903 904 if (!stringStart) 905 return String(); 906 907 if (!length) 908 return emptyString(); 909 910 if (charactersAreAllASCII(stringStart, length)) 911 return StringImpl::create(stringStart, length); 912 913 Vector<UChar, 1024> buffer(length); 914 UChar* bufferStart = buffer.data(); 915 916 UChar* bufferCurrent = bufferStart; 917 const char* stringCurrent = reinterpret_cast<const char*>(stringStart); 918 if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK) 919 return String(); 920 921 unsigned utf16Length = bufferCurrent - bufferStart; 922 ASSERT(utf16Length < length); 923 return StringImpl::create(bufferStart, utf16Length); 924} 925 926String String::fromUTF8(const LChar* string) 927{ 928 if (!string) 929 return String(); 930 return fromUTF8(string, strlen(reinterpret_cast<const char*>(string))); 931} 932 933String String::fromUTF8(const CString& s) 934{ 935 return fromUTF8(s.data()); 936} 937 938String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) 939{ 940 String utf8 = fromUTF8(string, size); 941 if (!utf8) 942 return String(string, size); 943 return utf8; 944} 945 946// String Operations 947 948static bool isCharacterAllowedInBase(UChar c, int base) 949{ 950 if (c > 0x7F) 951 return false; 952 if (isASCIIDigit(c)) 953 return c - '0' < base; 954 if (isASCIIAlpha(c)) { 955 if (base > 36) 956 base = 36; 957 return (c >= 'a' && c < 'a' + base - 10) 958 || (c >= 'A' && c < 'A' + base - 10); 959 } 960 return false; 961} 962 963template <typename IntegralType, typename CharType> 964static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base) 965{ 966 static const IntegralType integralMax = numeric_limits<IntegralType>::max(); 967 static const bool isSigned = numeric_limits<IntegralType>::is_signed; 968 const IntegralType maxMultiplier = integralMax / base; 969 970 IntegralType value = 0; 971 bool isOk = false; 972 bool isNegative = false; 973 974 if (!data) 975 goto bye; 976 977 // skip leading whitespace 978 while (length && isSpaceOrNewline(*data)) { 979 --length; 980 ++data; 981 } 982 983 if (isSigned && length && *data == '-') { 984 --length; 985 ++data; 986 isNegative = true; 987 } else if (length && *data == '+') { 988 --length; 989 ++data; 990 } 991 992 if (!length || !isCharacterAllowedInBase(*data, base)) 993 goto bye; 994 995 while (length && isCharacterAllowedInBase(*data, base)) { 996 --length; 997 IntegralType digitValue; 998 CharType c = *data; 999 if (isASCIIDigit(c)) 1000 digitValue = c - '0'; 1001 else if (c >= 'a') 1002 digitValue = c - 'a' + 10; 1003 else 1004 digitValue = c - 'A' + 10; 1005 1006 if (value > maxMultiplier || (value == maxMultiplier && digitValue > (integralMax % base) + isNegative)) 1007 goto bye; 1008 1009 value = base * value + digitValue; 1010 ++data; 1011 } 1012 1013#if COMPILER(MSVC) 1014#pragma warning(push, 0) 1015#pragma warning(disable:4146) 1016#endif 1017 1018 if (isNegative) 1019 value = -value; 1020 1021#if COMPILER(MSVC) 1022#pragma warning(pop) 1023#endif 1024 1025 // skip trailing space 1026 while (length && isSpaceOrNewline(*data)) { 1027 --length; 1028 ++data; 1029 } 1030 1031 if (!length) 1032 isOk = true; 1033bye: 1034 if (ok) 1035 *ok = isOk; 1036 return isOk ? value : 0; 1037} 1038 1039template <typename CharType> 1040static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length) 1041{ 1042 size_t i = 0; 1043 1044 // Allow leading spaces. 1045 for (; i != length; ++i) { 1046 if (!isSpaceOrNewline(data[i])) 1047 break; 1048 } 1049 1050 // Allow sign. 1051 if (i != length && (data[i] == '+' || data[i] == '-')) 1052 ++i; 1053 1054 // Allow digits. 1055 for (; i != length; ++i) { 1056 if (!isASCIIDigit(data[i])) 1057 break; 1058 } 1059 1060 return i; 1061} 1062 1063int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base) 1064{ 1065 return toIntegralType<int, LChar>(data, length, ok, base); 1066} 1067 1068int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base) 1069{ 1070 return toIntegralType<int, UChar>(data, length, ok, base); 1071} 1072 1073unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base) 1074{ 1075 return toIntegralType<unsigned, LChar>(data, length, ok, base); 1076} 1077 1078unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base) 1079{ 1080 return toIntegralType<unsigned, UChar>(data, length, ok, base); 1081} 1082 1083int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base) 1084{ 1085 return toIntegralType<int64_t, LChar>(data, length, ok, base); 1086} 1087 1088int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base) 1089{ 1090 return toIntegralType<int64_t, UChar>(data, length, ok, base); 1091} 1092 1093uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base) 1094{ 1095 return toIntegralType<uint64_t, LChar>(data, length, ok, base); 1096} 1097 1098uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base) 1099{ 1100 return toIntegralType<uint64_t, UChar>(data, length, ok, base); 1101} 1102 1103intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base) 1104{ 1105 return toIntegralType<intptr_t, LChar>(data, length, ok, base); 1106} 1107 1108intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base) 1109{ 1110 return toIntegralType<intptr_t, UChar>(data, length, ok, base); 1111} 1112 1113int charactersToInt(const LChar* data, size_t length, bool* ok) 1114{ 1115 return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1116} 1117 1118int charactersToInt(const UChar* data, size_t length, bool* ok) 1119{ 1120 return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10); 1121} 1122 1123unsigned charactersToUInt(const LChar* data, size_t length, bool* ok) 1124{ 1125 return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1126} 1127 1128unsigned charactersToUInt(const UChar* data, size_t length, bool* ok) 1129{ 1130 return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1131} 1132 1133int64_t charactersToInt64(const LChar* data, size_t length, bool* ok) 1134{ 1135 return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1136} 1137 1138int64_t charactersToInt64(const UChar* data, size_t length, bool* ok) 1139{ 1140 return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1141} 1142 1143uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok) 1144{ 1145 return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1146} 1147 1148uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok) 1149{ 1150 return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1151} 1152 1153intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok) 1154{ 1155 return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1156} 1157 1158intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok) 1159{ 1160 return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1161} 1162 1163enum TrailingJunkPolicy { DisallowTrailingJunk, AllowTrailingJunk }; 1164 1165template <typename CharType, TrailingJunkPolicy policy> 1166static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength) 1167{ 1168 size_t leadingSpacesLength = 0; 1169 while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength])) 1170 ++leadingSpacesLength; 1171 1172 double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength); 1173 if (!parsedLength) { 1174 if (ok) 1175 *ok = false; 1176 return 0.0; 1177 } 1178 1179 parsedLength += leadingSpacesLength; 1180 if (ok) 1181 *ok = policy == AllowTrailingJunk || parsedLength == length; 1182 return number; 1183} 1184 1185double charactersToDouble(const LChar* data, size_t length, bool* ok) 1186{ 1187 size_t parsedLength; 1188 return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength); 1189} 1190 1191double charactersToDouble(const UChar* data, size_t length, bool* ok) 1192{ 1193 size_t parsedLength; 1194 return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength); 1195} 1196 1197float charactersToFloat(const LChar* data, size_t length, bool* ok) 1198{ 1199 // FIXME: This will return ok even when the string fits into a double but not a float. 1200 size_t parsedLength; 1201 return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength)); 1202} 1203 1204float charactersToFloat(const UChar* data, size_t length, bool* ok) 1205{ 1206 // FIXME: This will return ok even when the string fits into a double but not a float. 1207 size_t parsedLength; 1208 return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength)); 1209} 1210 1211float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength) 1212{ 1213 // FIXME: This will return ok even when the string fits into a double but not a float. 1214 return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength)); 1215} 1216 1217float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength) 1218{ 1219 // FIXME: This will return ok even when the string fits into a double but not a float. 1220 return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength)); 1221} 1222 1223const String& emptyString() 1224{ 1225 DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty())); 1226 return emptyString; 1227} 1228 1229const String& emptyString16Bit() 1230{ 1231 DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty16Bit())); 1232 return emptyString; 1233} 1234 1235} // namespace WTF 1236 1237#ifndef NDEBUG 1238// For use in the debugger 1239String* string(const char*); 1240Vector<char> asciiDebug(StringImpl* impl); 1241Vector<char> asciiDebug(String& string); 1242 1243void String::show() const 1244{ 1245 dataLogF("%s\n", asciiDebug(impl()).data()); 1246} 1247 1248String* string(const char* s) 1249{ 1250 // leaks memory! 1251 return new String(s); 1252} 1253 1254Vector<char> asciiDebug(StringImpl* impl) 1255{ 1256 if (!impl) 1257 return asciiDebug(String("[null]").impl()); 1258 1259 Vector<char> buffer; 1260 for (unsigned i = 0; i < impl->length(); ++i) { 1261 UChar ch = (*impl)[i]; 1262 if (isASCIIPrintable(ch)) { 1263 if (ch == '\\') 1264 buffer.append(ch); 1265 buffer.append(ch); 1266 } else { 1267 buffer.append('\\'); 1268 buffer.append('u'); 1269 appendUnsignedAsHexFixedSize(ch, buffer, 4); 1270 } 1271 } 1272 buffer.append('\0'); 1273 return buffer; 1274} 1275 1276Vector<char> asciiDebug(String& string) 1277{ 1278 return asciiDebug(string.impl()); 1279} 1280 1281#endif 1282