1/* 2 * (C) 1999 Lars Knoll (knoll@kde.org) 3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved. 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 */ 21 22#include "config.h" 23#include "WTFString.h" 24 25#include "IntegerToStringConversion.h" 26#include <stdarg.h> 27#include "wtf/ASCIICType.h" 28#include "wtf/DataLog.h" 29#include "wtf/HexNumber.h" 30#include "wtf/MathExtras.h" 31#include "wtf/text/CString.h" 32#include "wtf/StringExtras.h" 33#include "wtf/Vector.h" 34#include "wtf/dtoa.h" 35#include "wtf/unicode/CharacterNames.h" 36#include "wtf/unicode/UTF8.h" 37#include "wtf/unicode/Unicode.h" 38 39using namespace std; 40 41namespace WTF { 42 43using namespace Unicode; 44using namespace std; 45 46// Construct a string with UTF-16 data. 47String::String(const UChar* characters, unsigned length) 48 : m_impl(characters ? StringImpl::create(characters, length) : 0) 49{ 50} 51 52// Construct a string with UTF-16 data, from a null-terminated source. 53String::String(const UChar* str) 54{ 55 if (!str) 56 return; 57 m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str)); 58} 59 60// Construct a string with latin1 data. 61String::String(const LChar* characters, unsigned length) 62 : m_impl(characters ? StringImpl::create(characters, length) : 0) 63{ 64} 65 66String::String(const char* characters, unsigned length) 67 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0) 68{ 69} 70 71// Construct a string with latin1 data, from a null-terminated source. 72String::String(const LChar* characters) 73 : m_impl(characters ? StringImpl::create(characters) : 0) 74{ 75} 76 77String::String(const char* characters) 78 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0) 79{ 80} 81 82void String::append(const String& string) 83{ 84 if (string.isEmpty()) 85 return; 86 if (!m_impl) { 87 m_impl = string.m_impl; 88 return; 89 } 90 91 // FIXME: This is extremely inefficient. So much so that we might want to take this 92 // out of String's API. We can make it better by optimizing the case where exactly 93 // one String is pointing at this StringImpl, but even then it's going to require a 94 // call to fastMalloc every single time. 95 96 if (m_impl->is8Bit() && string.m_impl->is8Bit()) { 97 LChar* data; 98 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length()); 99 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data); 100 memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar)); 101 memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar)); 102 m_impl = newImpl.release(); 103 return; 104 } 105 106 UChar* data; 107 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length()); 108 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data); 109 110 if (m_impl->is8Bit()) 111 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); 112 else 113 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); 114 115 if (string.impl()->is8Bit()) 116 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length()); 117 else 118 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length()); 119 120 m_impl = newImpl.release(); 121} 122 123template <typename CharacterType> 124inline void String::appendInternal(CharacterType c) 125{ 126 // FIXME: This is extremely inefficient. So much so that we might want to take this 127 // out of String's API. We can make it better by optimizing the case where exactly 128 // one String is pointing at this StringImpl, but even then it's going to require a 129 // call to fastMalloc every single time. 130 if (!m_impl) { 131 m_impl = StringImpl::create(&c, 1); 132 return; 133 } 134 135 UChar* data; // FIXME: We should be able to create an 8 bit string via this code path. 136 RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max()); 137 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data); 138 if (m_impl->is8Bit()) 139 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); 140 else 141 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); 142 data[m_impl->length()] = c; 143 m_impl = newImpl.release(); 144} 145 146void String::append(LChar c) 147{ 148 appendInternal(c); 149} 150 151void String::append(UChar c) 152{ 153 appendInternal(c); 154} 155 156int codePointCompare(const String& a, const String& b) 157{ 158 return codePointCompare(a.impl(), b.impl()); 159} 160 161void String::insert(const String& string, unsigned position) 162{ 163 if (string.isEmpty()) { 164 if (string.isNull()) 165 return; 166 if (isNull()) 167 m_impl = string.impl(); 168 return; 169 } 170 171 if (string.is8Bit()) 172 insert(string.impl()->characters8(), string.length(), position); 173 else 174 insert(string.impl()->characters16(), string.length(), position); 175} 176 177void String::append(const LChar* charactersToAppend, unsigned lengthToAppend) 178{ 179 if (!m_impl) { 180 if (!charactersToAppend) 181 return; 182 m_impl = StringImpl::create(charactersToAppend, lengthToAppend); 183 return; 184 } 185 186 if (!lengthToAppend) 187 return; 188 189 ASSERT(charactersToAppend); 190 191 unsigned strLength = m_impl->length(); 192 193 if (m_impl->is8Bit()) { 194 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 195 LChar* data; 196 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data); 197 StringImpl::copyChars(data, m_impl->characters8(), strLength); 198 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 199 m_impl = newImpl.release(); 200 return; 201 } 202 203 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 204 UChar* data; 205 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data); 206 StringImpl::copyChars(data, m_impl->characters16(), strLength); 207 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 208 m_impl = newImpl.release(); 209} 210 211void String::append(const UChar* charactersToAppend, unsigned lengthToAppend) 212{ 213 if (!m_impl) { 214 if (!charactersToAppend) 215 return; 216 m_impl = StringImpl::create(charactersToAppend, lengthToAppend); 217 return; 218 } 219 220 if (!lengthToAppend) 221 return; 222 223 unsigned strLength = m_impl->length(); 224 225 ASSERT(charactersToAppend); 226 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 227 UChar* data; 228 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data); 229 if (m_impl->is8Bit()) 230 StringImpl::copyChars(data, characters8(), strLength); 231 else 232 StringImpl::copyChars(data, characters16(), strLength); 233 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 234 m_impl = newImpl.release(); 235} 236 237template<typename CharType> 238PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position) 239{ 240 if (!lengthToInsert) 241 return impl; 242 243 ASSERT(charactersToInsert); 244 UChar* data; // FIXME: We should be able to create an 8 bit string here. 245 RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length()); 246 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data); 247 248 if (impl->is8Bit()) 249 StringImpl::copyChars(data, impl->characters8(), position); 250 else 251 StringImpl::copyChars(data, impl->characters16(), position); 252 253 StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert); 254 255 if (impl->is8Bit()) 256 StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position); 257 else 258 StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position); 259 260 return newImpl.release(); 261} 262 263void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position) 264{ 265 if (position >= length()) { 266 append(charactersToInsert, lengthToInsert); 267 return; 268 } 269 ASSERT(m_impl); 270 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position); 271} 272 273void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position) 274{ 275 if (position >= length()) { 276 append(charactersToInsert, lengthToInsert); 277 return; 278 } 279 ASSERT(m_impl); 280 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position); 281} 282 283UChar32 String::characterStartingAt(unsigned i) const 284{ 285 if (!m_impl || i >= m_impl->length()) 286 return 0; 287 return m_impl->characterStartingAt(i); 288} 289 290void String::ensure16Bit() 291{ 292 unsigned length = this->length(); 293 if (!length || !is8Bit()) 294 return; 295 m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl(); 296} 297 298void String::truncate(unsigned position) 299{ 300 if (position >= length()) 301 return; 302 if (m_impl->is8Bit()) { 303 LChar* data; 304 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data); 305 memcpy(data, m_impl->characters8(), position * sizeof(LChar)); 306 m_impl = newImpl.release(); 307 } else { 308 UChar* data; 309 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data); 310 memcpy(data, m_impl->characters16(), position * sizeof(UChar)); 311 m_impl = newImpl.release(); 312 } 313} 314 315template <typename CharacterType> 316inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove) 317{ 318 CharacterType* data; 319 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data); 320 memcpy(data, characters, position * sizeof(CharacterType)); 321 memcpy(data + position, characters + position + lengthToRemove, 322 (length() - lengthToRemove - position) * sizeof(CharacterType)); 323 324 m_impl = newImpl.release(); 325} 326 327void String::remove(unsigned position, int lengthToRemove) 328{ 329 if (lengthToRemove <= 0) 330 return; 331 if (position >= length()) 332 return; 333 if (static_cast<unsigned>(lengthToRemove) > length() - position) 334 lengthToRemove = length() - position; 335 336 if (is8Bit()) { 337 removeInternal(characters8(), position, lengthToRemove); 338 339 return; 340 } 341 342 removeInternal(characters16(), position, lengthToRemove); 343} 344 345String String::substring(unsigned pos, unsigned len) const 346{ 347 if (!m_impl) 348 return String(); 349 return m_impl->substring(pos, len); 350} 351 352String String::lower() const 353{ 354 if (!m_impl) 355 return String(); 356 return m_impl->lower(); 357} 358 359String String::upper() const 360{ 361 if (!m_impl) 362 return String(); 363 return m_impl->upper(); 364} 365 366String String::stripWhiteSpace() const 367{ 368 if (!m_impl) 369 return String(); 370 return m_impl->stripWhiteSpace(); 371} 372 373String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const 374{ 375 if (!m_impl) 376 return String(); 377 return m_impl->stripWhiteSpace(isWhiteSpace); 378} 379 380String String::simplifyWhiteSpace() const 381{ 382 if (!m_impl) 383 return String(); 384 return m_impl->simplifyWhiteSpace(); 385} 386 387String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const 388{ 389 if (!m_impl) 390 return String(); 391 return m_impl->simplifyWhiteSpace(isWhiteSpace); 392} 393 394String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const 395{ 396 if (!m_impl) 397 return String(); 398 return m_impl->removeCharacters(findMatch); 399} 400 401String String::foldCase() const 402{ 403 if (!m_impl) 404 return String(); 405 return m_impl->foldCase(); 406} 407 408bool String::percentage(int& result) const 409{ 410 if (!m_impl || !m_impl->length()) 411 return false; 412 413 if ((*m_impl)[m_impl->length() - 1] != '%') 414 return false; 415 416 if (m_impl->is8Bit()) 417 result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1); 418 else 419 result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1); 420 421 return true; 422} 423 424Vector<UChar> String::charactersWithNullTermination() const 425{ 426 if (!m_impl) 427 return Vector<UChar>(); 428 429 Vector<UChar> result; 430 result.reserveInitialCapacity(length() + 1); 431 appendTo(result); 432 result.append(0); 433 return result; 434} 435 436unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const 437{ 438 unsigned length = this->length(); 439 RELEASE_ASSERT(pos <= length); 440 unsigned numCharacters = std::min(length - pos, maxLength); 441 if (!numCharacters) 442 return 0; 443 if (is8Bit()) 444 StringImpl::copyChars(buffer, characters8() + pos, numCharacters); 445 else 446 StringImpl::copyChars(buffer, characters16() + pos, numCharacters); 447 return numCharacters; 448} 449 450String String::format(const char *format, ...) 451{ 452 va_list args; 453 va_start(args, format); 454 455 Vector<char, 256> buffer; 456 457 // Do the format once to get the length. 458#if COMPILER(MSVC) 459 int result = _vscprintf(format, args); 460#else 461 char ch; 462 int result = vsnprintf(&ch, 1, format, args); 463 // We need to call va_end() and then va_start() again here, as the 464 // contents of args is undefined after the call to vsnprintf 465 // according to http://man.cx/snprintf(3) 466 // 467 // Not calling va_end/va_start here happens to work on lots of 468 // systems, but fails e.g. on 64bit Linux. 469 va_end(args); 470 va_start(args, format); 471#endif 472 473 if (result == 0) 474 return String(""); 475 if (result < 0) 476 return String(); 477 unsigned len = result; 478 buffer.grow(len + 1); 479 480 // Now do the formatting again, guaranteed to fit. 481 vsnprintf(buffer.data(), buffer.size(), format, args); 482 483 va_end(args); 484 485 return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len); 486} 487 488String String::number(int number) 489{ 490 return numberToStringSigned<String>(number); 491} 492 493String String::number(unsigned int number) 494{ 495 return numberToStringUnsigned<String>(number); 496} 497 498String String::number(long number) 499{ 500 return numberToStringSigned<String>(number); 501} 502 503String String::number(unsigned long number) 504{ 505 return numberToStringUnsigned<String>(number); 506} 507 508String String::number(long long number) 509{ 510 return numberToStringSigned<String>(number); 511} 512 513String String::number(unsigned long long number) 514{ 515 return numberToStringUnsigned<String>(number); 516} 517 518String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy) 519{ 520 NumberToStringBuffer buffer; 521 return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros)); 522} 523 524String String::numberToStringECMAScript(double number) 525{ 526 NumberToStringBuffer buffer; 527 return String(numberToString(number, buffer)); 528} 529 530String String::numberToStringFixedWidth(double number, unsigned decimalPlaces) 531{ 532 NumberToStringBuffer buffer; 533 return String(numberToFixedWidthString(number, decimalPlaces, buffer)); 534} 535 536int String::toIntStrict(bool* ok, int base) const 537{ 538 if (!m_impl) { 539 if (ok) 540 *ok = false; 541 return 0; 542 } 543 return m_impl->toIntStrict(ok, base); 544} 545 546unsigned String::toUIntStrict(bool* ok, int base) const 547{ 548 if (!m_impl) { 549 if (ok) 550 *ok = false; 551 return 0; 552 } 553 return m_impl->toUIntStrict(ok, base); 554} 555 556int64_t String::toInt64Strict(bool* ok, int base) const 557{ 558 if (!m_impl) { 559 if (ok) 560 *ok = false; 561 return 0; 562 } 563 return m_impl->toInt64Strict(ok, base); 564} 565 566uint64_t String::toUInt64Strict(bool* ok, int base) const 567{ 568 if (!m_impl) { 569 if (ok) 570 *ok = false; 571 return 0; 572 } 573 return m_impl->toUInt64Strict(ok, base); 574} 575 576intptr_t String::toIntPtrStrict(bool* ok, int base) const 577{ 578 if (!m_impl) { 579 if (ok) 580 *ok = false; 581 return 0; 582 } 583 return m_impl->toIntPtrStrict(ok, base); 584} 585 586int String::toInt(bool* ok) const 587{ 588 if (!m_impl) { 589 if (ok) 590 *ok = false; 591 return 0; 592 } 593 return m_impl->toInt(ok); 594} 595 596unsigned String::toUInt(bool* ok) const 597{ 598 if (!m_impl) { 599 if (ok) 600 *ok = false; 601 return 0; 602 } 603 return m_impl->toUInt(ok); 604} 605 606int64_t String::toInt64(bool* ok) const 607{ 608 if (!m_impl) { 609 if (ok) 610 *ok = false; 611 return 0; 612 } 613 return m_impl->toInt64(ok); 614} 615 616uint64_t String::toUInt64(bool* ok) const 617{ 618 if (!m_impl) { 619 if (ok) 620 *ok = false; 621 return 0; 622 } 623 return m_impl->toUInt64(ok); 624} 625 626intptr_t String::toIntPtr(bool* ok) const 627{ 628 if (!m_impl) { 629 if (ok) 630 *ok = false; 631 return 0; 632 } 633 return m_impl->toIntPtr(ok); 634} 635 636double String::toDouble(bool* ok) const 637{ 638 if (!m_impl) { 639 if (ok) 640 *ok = false; 641 return 0.0; 642 } 643 return m_impl->toDouble(ok); 644} 645 646float String::toFloat(bool* ok) const 647{ 648 if (!m_impl) { 649 if (ok) 650 *ok = false; 651 return 0.0f; 652 } 653 return m_impl->toFloat(ok); 654} 655 656String String::isolatedCopy() const 657{ 658 if (!m_impl) 659 return String(); 660 return m_impl->isolatedCopy(); 661} 662 663bool String::isSafeToSendToAnotherThread() const 664{ 665 if (!impl()) 666 return true; 667 if (impl()->isStatic()) 668 return true; 669 // AtomicStrings are not safe to send between threads as ~StringImpl() 670 // will try to remove them from the wrong AtomicStringTable. 671 if (impl()->isAtomic()) 672 return false; 673 if (impl()->hasOneRef()) 674 return true; 675 return false; 676} 677 678void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const 679{ 680 result.clear(); 681 682 unsigned startPos = 0; 683 size_t endPos; 684 while ((endPos = find(separator, startPos)) != notFound) { 685 if (allowEmptyEntries || startPos != endPos) 686 result.append(substring(startPos, endPos - startPos)); 687 startPos = endPos + separator.length(); 688 } 689 if (allowEmptyEntries || startPos != length()) 690 result.append(substring(startPos)); 691} 692 693void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const 694{ 695 result.clear(); 696 697 unsigned startPos = 0; 698 size_t endPos; 699 while ((endPos = find(separator, startPos)) != notFound) { 700 if (allowEmptyEntries || startPos != endPos) 701 result.append(substring(startPos, endPos - startPos)); 702 startPos = endPos + 1; 703 } 704 if (allowEmptyEntries || startPos != length()) 705 result.append(substring(startPos)); 706} 707 708CString String::ascii() const 709{ 710 // Printable ASCII characters 32..127 and the null character are 711 // preserved, characters outside of this range are converted to '?'. 712 713 unsigned length = this->length(); 714 if (!length) { 715 char* characterBuffer; 716 return CString::newUninitialized(length, characterBuffer); 717 } 718 719 if (this->is8Bit()) { 720 const LChar* characters = this->characters8(); 721 722 char* characterBuffer; 723 CString result = CString::newUninitialized(length, characterBuffer); 724 725 for (unsigned i = 0; i < length; ++i) { 726 LChar ch = characters[i]; 727 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; 728 } 729 730 return result; 731 } 732 733 const UChar* characters = this->characters16(); 734 735 char* characterBuffer; 736 CString result = CString::newUninitialized(length, characterBuffer); 737 738 for (unsigned i = 0; i < length; ++i) { 739 UChar ch = characters[i]; 740 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; 741 } 742 743 return result; 744} 745 746CString String::latin1() const 747{ 748 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are 749 // preserved, characters outside of this range are converted to '?'. 750 751 unsigned length = this->length(); 752 753 if (!length) 754 return CString("", 0); 755 756 if (is8Bit()) 757 return CString(reinterpret_cast<const char*>(this->characters8()), length); 758 759 const UChar* characters = this->characters16(); 760 761 char* characterBuffer; 762 CString result = CString::newUninitialized(length, characterBuffer); 763 764 for (unsigned i = 0; i < length; ++i) { 765 UChar ch = characters[i]; 766 characterBuffer[i] = ch > 0xff ? '?' : ch; 767 } 768 769 return result; 770} 771 772// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 773static inline void putUTF8Triple(char*& buffer, UChar ch) 774{ 775 ASSERT(ch >= 0x0800); 776 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 777 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 778 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 779} 780 781CString String::utf8(ConversionMode mode) const 782{ 783 unsigned length = this->length(); 784 785 if (!length) 786 return CString("", 0); 787 788 // Allocate a buffer big enough to hold all the characters 789 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 790 // Optimization ideas, if we find this function is hot: 791 // * We could speculatively create a CStringBuffer to contain 'length' 792 // characters, and resize if necessary (i.e. if the buffer contains 793 // non-ascii characters). (Alternatively, scan the buffer first for 794 // ascii characters, so we know this will be sufficient). 795 // * We could allocate a CStringBuffer with an appropriate size to 796 // have a good chance of being able to write the string into the 797 // buffer without reallocing (say, 1.5 x length). 798 if (length > numeric_limits<unsigned>::max() / 3) 799 return CString(); 800 Vector<char, 1024> bufferVector(length * 3); 801 802 char* buffer = bufferVector.data(); 803 804 if (is8Bit()) { 805 const LChar* characters = this->characters8(); 806 807 ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size()); 808 ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion 809 } else { 810 const UChar* characters = this->characters16(); 811 812 if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) { 813 const UChar* charactersEnd = characters + length; 814 char* bufferEnd = buffer + bufferVector.size(); 815 while (characters < charactersEnd) { 816 // Use strict conversion to detect unpaired surrogates. 817 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true); 818 ASSERT(result != targetExhausted); 819 // Conversion fails when there is an unpaired surrogate. 820 // Put replacement character (U+FFFD) instead of the unpaired surrogate. 821 if (result != conversionOK) { 822 ASSERT((0xD800 <= *characters && *characters <= 0xDFFF)); 823 // There should be room left, since one UChar hasn't been converted. 824 ASSERT((buffer + 3) <= bufferEnd); 825 putUTF8Triple(buffer, replacementCharacter); 826 ++characters; 827 } 828 } 829 } else { 830 bool strict = mode == StrictConversion; 831 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); 832 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion 833 834 // Only produced from strict conversion. 835 if (result == sourceIllegal) { 836 ASSERT(strict); 837 return CString(); 838 } 839 840 // Check for an unconverted high surrogate. 841 if (result == sourceExhausted) { 842 if (strict) 843 return CString(); 844 // This should be one unpaired high surrogate. Treat it the same 845 // was as an unpaired high surrogate would have been handled in 846 // the middle of a string with non-strict conversion - which is 847 // to say, simply encode it to UTF-8. 848 ASSERT((characters + 1) == (this->characters16() + length)); 849 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); 850 // There should be room left, since one UChar hasn't been converted. 851 ASSERT((buffer + 3) <= (buffer + bufferVector.size())); 852 putUTF8Triple(buffer, *characters); 853 } 854 } 855 } 856 857 return CString(bufferVector.data(), buffer - bufferVector.data()); 858} 859 860String String::make8BitFrom16BitSource(const UChar* source, size_t length) 861{ 862 if (!length) 863 return String(); 864 865 LChar* destination; 866 String result = String::createUninitialized(length, destination); 867 868 copyLCharsFromUCharSource(destination, source, length); 869 870 return result; 871} 872 873String String::make16BitFrom8BitSource(const LChar* source, size_t length) 874{ 875 if (!length) 876 return String(); 877 878 UChar* destination; 879 String result = String::createUninitialized(length, destination); 880 881 StringImpl::copyChars(destination, source, length); 882 883 return result; 884} 885 886String String::fromUTF8(const LChar* stringStart, size_t length) 887{ 888 RELEASE_ASSERT(length <= numeric_limits<unsigned>::max()); 889 890 if (!stringStart) 891 return String(); 892 893 if (!length) 894 return emptyString(); 895 896 if (charactersAreAllASCII(stringStart, length)) 897 return StringImpl::create(stringStart, length); 898 899 Vector<UChar, 1024> buffer(length); 900 UChar* bufferStart = buffer.data(); 901 902 UChar* bufferCurrent = bufferStart; 903 const char* stringCurrent = reinterpret_cast<const char*>(stringStart); 904 if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK) 905 return String(); 906 907 unsigned utf16Length = bufferCurrent - bufferStart; 908 ASSERT(utf16Length < length); 909 return StringImpl::create(bufferStart, utf16Length); 910} 911 912String String::fromUTF8(const LChar* string) 913{ 914 if (!string) 915 return String(); 916 return fromUTF8(string, strlen(reinterpret_cast<const char*>(string))); 917} 918 919String String::fromUTF8(const CString& s) 920{ 921 return fromUTF8(s.data()); 922} 923 924String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) 925{ 926 String utf8 = fromUTF8(string, size); 927 if (!utf8) 928 return String(string, size); 929 return utf8; 930} 931 932// String Operations 933 934static bool isCharacterAllowedInBase(UChar c, int base) 935{ 936 if (c > 0x7F) 937 return false; 938 if (isASCIIDigit(c)) 939 return c - '0' < base; 940 if (isASCIIAlpha(c)) { 941 if (base > 36) 942 base = 36; 943 return (c >= 'a' && c < 'a' + base - 10) 944 || (c >= 'A' && c < 'A' + base - 10); 945 } 946 return false; 947} 948 949template <typename IntegralType, typename CharType> 950static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base) 951{ 952 static const IntegralType integralMax = numeric_limits<IntegralType>::max(); 953 static const bool isSigned = numeric_limits<IntegralType>::is_signed; 954 const IntegralType maxMultiplier = integralMax / base; 955 956 IntegralType value = 0; 957 bool isOk = false; 958 bool isNegative = false; 959 960 if (!data) 961 goto bye; 962 963 // skip leading whitespace 964 while (length && isSpaceOrNewline(*data)) { 965 --length; 966 ++data; 967 } 968 969 if (isSigned && length && *data == '-') { 970 --length; 971 ++data; 972 isNegative = true; 973 } else if (length && *data == '+') { 974 --length; 975 ++data; 976 } 977 978 if (!length || !isCharacterAllowedInBase(*data, base)) 979 goto bye; 980 981 while (length && isCharacterAllowedInBase(*data, base)) { 982 --length; 983 IntegralType digitValue; 984 CharType c = *data; 985 if (isASCIIDigit(c)) 986 digitValue = c - '0'; 987 else if (c >= 'a') 988 digitValue = c - 'a' + 10; 989 else 990 digitValue = c - 'A' + 10; 991 992 if (value > maxMultiplier || (value == maxMultiplier && digitValue > (integralMax % base) + isNegative)) 993 goto bye; 994 995 value = base * value + digitValue; 996 ++data; 997 } 998 999#if COMPILER(MSVC) 1000#pragma warning(push, 0) 1001#pragma warning(disable:4146) 1002#endif 1003 1004 if (isNegative) 1005 value = -value; 1006 1007#if COMPILER(MSVC) 1008#pragma warning(pop) 1009#endif 1010 1011 // skip trailing space 1012 while (length && isSpaceOrNewline(*data)) { 1013 --length; 1014 ++data; 1015 } 1016 1017 if (!length) 1018 isOk = true; 1019bye: 1020 if (ok) 1021 *ok = isOk; 1022 return isOk ? value : 0; 1023} 1024 1025template <typename CharType> 1026static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length) 1027{ 1028 size_t i = 0; 1029 1030 // Allow leading spaces. 1031 for (; i != length; ++i) { 1032 if (!isSpaceOrNewline(data[i])) 1033 break; 1034 } 1035 1036 // Allow sign. 1037 if (i != length && (data[i] == '+' || data[i] == '-')) 1038 ++i; 1039 1040 // Allow digits. 1041 for (; i != length; ++i) { 1042 if (!isASCIIDigit(data[i])) 1043 break; 1044 } 1045 1046 return i; 1047} 1048 1049int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base) 1050{ 1051 return toIntegralType<int, LChar>(data, length, ok, base); 1052} 1053 1054int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base) 1055{ 1056 return toIntegralType<int, UChar>(data, length, ok, base); 1057} 1058 1059unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base) 1060{ 1061 return toIntegralType<unsigned, LChar>(data, length, ok, base); 1062} 1063 1064unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base) 1065{ 1066 return toIntegralType<unsigned, UChar>(data, length, ok, base); 1067} 1068 1069int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base) 1070{ 1071 return toIntegralType<int64_t, LChar>(data, length, ok, base); 1072} 1073 1074int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base) 1075{ 1076 return toIntegralType<int64_t, UChar>(data, length, ok, base); 1077} 1078 1079uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base) 1080{ 1081 return toIntegralType<uint64_t, LChar>(data, length, ok, base); 1082} 1083 1084uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base) 1085{ 1086 return toIntegralType<uint64_t, UChar>(data, length, ok, base); 1087} 1088 1089intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base) 1090{ 1091 return toIntegralType<intptr_t, LChar>(data, length, ok, base); 1092} 1093 1094intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base) 1095{ 1096 return toIntegralType<intptr_t, UChar>(data, length, ok, base); 1097} 1098 1099int charactersToInt(const LChar* data, size_t length, bool* ok) 1100{ 1101 return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1102} 1103 1104int charactersToInt(const UChar* data, size_t length, bool* ok) 1105{ 1106 return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10); 1107} 1108 1109unsigned charactersToUInt(const LChar* data, size_t length, bool* ok) 1110{ 1111 return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1112} 1113 1114unsigned charactersToUInt(const UChar* data, size_t length, bool* ok) 1115{ 1116 return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1117} 1118 1119int64_t charactersToInt64(const LChar* data, size_t length, bool* ok) 1120{ 1121 return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1122} 1123 1124int64_t charactersToInt64(const UChar* data, size_t length, bool* ok) 1125{ 1126 return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1127} 1128 1129uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok) 1130{ 1131 return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1132} 1133 1134uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok) 1135{ 1136 return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1137} 1138 1139intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok) 1140{ 1141 return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1142} 1143 1144intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok) 1145{ 1146 return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1147} 1148 1149enum TrailingJunkPolicy { DisallowTrailingJunk, AllowTrailingJunk }; 1150 1151template <typename CharType, TrailingJunkPolicy policy> 1152static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength) 1153{ 1154 size_t leadingSpacesLength = 0; 1155 while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength])) 1156 ++leadingSpacesLength; 1157 1158 double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength); 1159 if (!parsedLength) { 1160 if (ok) 1161 *ok = false; 1162 return 0.0; 1163 } 1164 1165 parsedLength += leadingSpacesLength; 1166 if (ok) 1167 *ok = policy == AllowTrailingJunk || parsedLength == length; 1168 return number; 1169} 1170 1171double charactersToDouble(const LChar* data, size_t length, bool* ok) 1172{ 1173 size_t parsedLength; 1174 return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength); 1175} 1176 1177double charactersToDouble(const UChar* data, size_t length, bool* ok) 1178{ 1179 size_t parsedLength; 1180 return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength); 1181} 1182 1183float charactersToFloat(const LChar* data, size_t length, bool* ok) 1184{ 1185 // FIXME: This will return ok even when the string fits into a double but not a float. 1186 size_t parsedLength; 1187 return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength)); 1188} 1189 1190float charactersToFloat(const UChar* data, size_t length, bool* ok) 1191{ 1192 // FIXME: This will return ok even when the string fits into a double but not a float. 1193 size_t parsedLength; 1194 return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength)); 1195} 1196 1197float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength) 1198{ 1199 // FIXME: This will return ok even when the string fits into a double but not a float. 1200 return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength)); 1201} 1202 1203float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength) 1204{ 1205 // FIXME: This will return ok even when the string fits into a double but not a float. 1206 return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength)); 1207} 1208 1209const String& emptyString() 1210{ 1211 DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty())); 1212 return emptyString; 1213} 1214 1215} // namespace WTF 1216 1217#ifndef NDEBUG 1218// For use in the debugger 1219String* string(const char*); 1220Vector<char> asciiDebug(StringImpl* impl); 1221Vector<char> asciiDebug(String& string); 1222 1223void String::show() const 1224{ 1225 dataLogF("%s\n", asciiDebug(impl()).data()); 1226} 1227 1228String* string(const char* s) 1229{ 1230 // leaks memory! 1231 return new String(s); 1232} 1233 1234Vector<char> asciiDebug(StringImpl* impl) 1235{ 1236 if (!impl) 1237 return asciiDebug(String("[null]").impl()); 1238 1239 Vector<char> buffer; 1240 for (unsigned i = 0; i < impl->length(); ++i) { 1241 UChar ch = (*impl)[i]; 1242 if (isASCIIPrintable(ch)) { 1243 if (ch == '\\') 1244 buffer.append(ch); 1245 buffer.append(ch); 1246 } else { 1247 buffer.append('\\'); 1248 buffer.append('u'); 1249 appendUnsignedAsHexFixedSize(ch, buffer, 4); 1250 } 1251 } 1252 buffer.append('\0'); 1253 return buffer; 1254} 1255 1256Vector<char> asciiDebug(String& string) 1257{ 1258 return asciiDebug(string.impl()); 1259} 1260 1261#endif 1262