1/*
2 * (C) 1999 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved.
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB.  If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22#include "config.h"
23#include "WTFString.h"
24
25#include "IntegerToStringConversion.h"
26#include <stdarg.h>
27#include "wtf/ASCIICType.h"
28#include "wtf/DataLog.h"
29#include "wtf/HexNumber.h"
30#include "wtf/MathExtras.h"
31#include "wtf/text/CString.h"
32#include "wtf/StringExtras.h"
33#include "wtf/Vector.h"
34#include "wtf/dtoa.h"
35#include "wtf/unicode/CharacterNames.h"
36#include "wtf/unicode/UTF8.h"
37#include "wtf/unicode/Unicode.h"
38
39using namespace std;
40
41namespace WTF {
42
43using namespace Unicode;
44using namespace std;
45
46// Construct a string with UTF-16 data.
47String::String(const UChar* characters, unsigned length)
48    : m_impl(characters ? StringImpl::create(characters, length) : nullptr)
49{
50}
51
52// Construct a string with UTF-16 data, from a null-terminated source.
53String::String(const UChar* str)
54{
55    if (!str)
56        return;
57    m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str));
58}
59
60// Construct a string with latin1 data.
61String::String(const LChar* characters, unsigned length)
62    : m_impl(characters ? StringImpl::create(characters, length) : nullptr)
63{
64}
65
66String::String(const char* characters, unsigned length)
67    : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : nullptr)
68{
69}
70
71// Construct a string with latin1 data, from a null-terminated source.
72String::String(const LChar* characters)
73    : m_impl(characters ? StringImpl::create(characters) : nullptr)
74{
75}
76
77String::String(const char* characters)
78    : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : nullptr)
79{
80}
81
82void String::append(const String& string)
83{
84    if (string.isEmpty())
85        return;
86    if (!m_impl) {
87        m_impl = string.m_impl;
88        return;
89    }
90
91    // FIXME: This is extremely inefficient. So much so that we might want to take this
92    // out of String's API. We can make it better by optimizing the case where exactly
93    // one String is pointing at this StringImpl, but even then it's going to require a
94    // call into the allocator every single time.
95
96    if (m_impl->is8Bit() && string.m_impl->is8Bit()) {
97        LChar* data;
98        RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
99        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
100        memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar));
101        memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar));
102        m_impl = newImpl.release();
103        return;
104    }
105
106    UChar* data;
107    RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
108    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
109
110    if (m_impl->is8Bit())
111        StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
112    else
113        StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
114
115    if (string.impl()->is8Bit())
116        StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length());
117    else
118        StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length());
119
120    m_impl = newImpl.release();
121}
122
123template <typename CharacterType>
124inline void String::appendInternal(CharacterType c)
125{
126    // FIXME: This is extremely inefficient. So much so that we might want to take this
127    // out of String's API. We can make it better by optimizing the case where exactly
128    // one String is pointing at this StringImpl, but even then it's going to require a
129    // call into the allocator every single time.
130    if (!m_impl) {
131        m_impl = StringImpl::create(&c, 1);
132        return;
133    }
134
135    UChar* data; // FIXME: We should be able to create an 8 bit string via this code path.
136    RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max());
137    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data);
138    if (m_impl->is8Bit())
139        StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
140    else
141        StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
142    data[m_impl->length()] = c;
143    m_impl = newImpl.release();
144}
145
146void String::append(LChar c)
147{
148    appendInternal(c);
149}
150
151void String::append(UChar c)
152{
153    appendInternal(c);
154}
155
156int codePointCompare(const String& a, const String& b)
157{
158    return codePointCompare(a.impl(), b.impl());
159}
160
161void String::insert(const String& string, unsigned position)
162{
163    if (string.isEmpty()) {
164        if (string.isNull())
165            return;
166        if (isNull())
167            m_impl = string.impl();
168        return;
169    }
170
171    if (string.is8Bit())
172        insert(string.impl()->characters8(), string.length(), position);
173    else
174        insert(string.impl()->characters16(), string.length(), position);
175}
176
177void String::append(const LChar* charactersToAppend, unsigned lengthToAppend)
178{
179    if (!m_impl) {
180        if (!charactersToAppend)
181            return;
182        m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
183        return;
184    }
185
186    if (!lengthToAppend)
187        return;
188
189    ASSERT(charactersToAppend);
190
191    unsigned strLength = m_impl->length();
192
193    if (m_impl->is8Bit()) {
194        RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
195        LChar* data;
196        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
197        StringImpl::copyChars(data, m_impl->characters8(), strLength);
198        StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
199        m_impl = newImpl.release();
200        return;
201    }
202
203    RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
204    UChar* data;
205    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data);
206    StringImpl::copyChars(data, m_impl->characters16(), strLength);
207    StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
208    m_impl = newImpl.release();
209}
210
211void String::append(const UChar* charactersToAppend, unsigned lengthToAppend)
212{
213    if (!m_impl) {
214        if (!charactersToAppend)
215            return;
216        m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
217        return;
218    }
219
220    if (!lengthToAppend)
221        return;
222
223    unsigned strLength = m_impl->length();
224
225    ASSERT(charactersToAppend);
226    RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
227    UChar* data;
228    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
229    if (m_impl->is8Bit())
230        StringImpl::copyChars(data, characters8(), strLength);
231    else
232        StringImpl::copyChars(data, characters16(), strLength);
233    StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
234    m_impl = newImpl.release();
235}
236
237template<typename CharType>
238PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position)
239{
240    if (!lengthToInsert)
241        return impl;
242
243    ASSERT(charactersToInsert);
244    UChar* data; // FIXME: We should be able to create an 8 bit string here.
245    RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length());
246    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data);
247
248    if (impl->is8Bit())
249        StringImpl::copyChars(data, impl->characters8(), position);
250    else
251        StringImpl::copyChars(data, impl->characters16(), position);
252
253    StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert);
254
255    if (impl->is8Bit())
256        StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position);
257    else
258        StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position);
259
260    return newImpl.release();
261}
262
263void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
264{
265    if (position >= length()) {
266        append(charactersToInsert, lengthToInsert);
267        return;
268    }
269    ASSERT(m_impl);
270    m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
271}
272
273void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
274{
275    if (position >= length()) {
276        append(charactersToInsert, lengthToInsert);
277        return;
278    }
279    ASSERT(m_impl);
280    m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
281}
282
283UChar32 String::characterStartingAt(unsigned i) const
284{
285    if (!m_impl || i >= m_impl->length())
286        return 0;
287    return m_impl->characterStartingAt(i);
288}
289
290void String::ensure16Bit()
291{
292    unsigned length = this->length();
293    if (!length || !is8Bit())
294        return;
295    m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl();
296}
297
298void String::truncate(unsigned position)
299{
300    if (position >= length())
301        return;
302    if (m_impl->is8Bit()) {
303        LChar* data;
304        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
305        memcpy(data, m_impl->characters8(), position * sizeof(LChar));
306        m_impl = newImpl.release();
307    } else {
308        UChar* data;
309        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
310        memcpy(data, m_impl->characters16(), position * sizeof(UChar));
311        m_impl = newImpl.release();
312    }
313}
314
315template <typename CharacterType>
316inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove)
317{
318    CharacterType* data;
319    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data);
320    memcpy(data, characters, position * sizeof(CharacterType));
321    memcpy(data + position, characters + position + lengthToRemove,
322        (length() - lengthToRemove - position) * sizeof(CharacterType));
323
324    m_impl = newImpl.release();
325}
326
327void String::remove(unsigned position, int lengthToRemove)
328{
329    if (lengthToRemove <= 0)
330        return;
331    if (position >= length())
332        return;
333    if (static_cast<unsigned>(lengthToRemove) > length() - position)
334        lengthToRemove = length() - position;
335
336    if (is8Bit()) {
337        removeInternal(characters8(), position, lengthToRemove);
338
339        return;
340    }
341
342    removeInternal(characters16(), position, lengthToRemove);
343}
344
345String String::substring(unsigned pos, unsigned len) const
346{
347    if (!m_impl)
348        return String();
349    return m_impl->substring(pos, len);
350}
351
352String String::lower() const
353{
354    if (!m_impl)
355        return String();
356    return m_impl->lower();
357}
358
359String String::upper() const
360{
361    if (!m_impl)
362        return String();
363    return m_impl->upper();
364}
365
366String String::lower(const AtomicString& localeIdentifier) const
367{
368    if (!m_impl)
369        return String();
370    return m_impl->lower(localeIdentifier);
371}
372
373String String::upper(const AtomicString& localeIdentifier) const
374{
375    if (!m_impl)
376        return String();
377    return m_impl->upper(localeIdentifier);
378}
379
380String String::stripWhiteSpace() const
381{
382    if (!m_impl)
383        return String();
384    return m_impl->stripWhiteSpace();
385}
386
387String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const
388{
389    if (!m_impl)
390        return String();
391    return m_impl->stripWhiteSpace(isWhiteSpace);
392}
393
394String String::simplifyWhiteSpace(StripBehavior stripBehavior) const
395{
396    if (!m_impl)
397        return String();
398    return m_impl->simplifyWhiteSpace(stripBehavior);
399}
400
401String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, StripBehavior stripBehavior) const
402{
403    if (!m_impl)
404        return String();
405    return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior);
406}
407
408String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const
409{
410    if (!m_impl)
411        return String();
412    return m_impl->removeCharacters(findMatch);
413}
414
415String String::foldCase() const
416{
417    if (!m_impl)
418        return String();
419    return m_impl->foldCase();
420}
421
422bool String::percentage(int& result) const
423{
424    if (!m_impl || !m_impl->length())
425        return false;
426
427    if ((*m_impl)[m_impl->length() - 1] != '%')
428        return false;
429
430    if (m_impl->is8Bit())
431        result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1);
432    else
433        result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1);
434
435    return true;
436}
437
438Vector<UChar> String::charactersWithNullTermination() const
439{
440    if (!m_impl)
441        return Vector<UChar>();
442
443    Vector<UChar> result;
444    result.reserveInitialCapacity(length() + 1);
445    appendTo(result);
446    result.append(0);
447    return result;
448}
449
450unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const
451{
452    unsigned length = this->length();
453    RELEASE_ASSERT(pos <= length);
454    unsigned numCharacters = std::min(length - pos, maxLength);
455    if (!numCharacters)
456        return 0;
457    if (is8Bit())
458        StringImpl::copyChars(buffer, characters8() + pos, numCharacters);
459    else
460        StringImpl::copyChars(buffer, characters16() + pos, numCharacters);
461    return numCharacters;
462}
463
464String String::format(const char *format, ...)
465{
466    va_list args;
467    va_start(args, format);
468
469    Vector<char, 256> buffer;
470
471    // Do the format once to get the length.
472#if COMPILER(MSVC)
473    int result = _vscprintf(format, args);
474#else
475    char ch;
476    int result = vsnprintf(&ch, 1, format, args);
477    // We need to call va_end() and then va_start() again here, as the
478    // contents of args is undefined after the call to vsnprintf
479    // according to http://man.cx/snprintf(3)
480    //
481    // Not calling va_end/va_start here happens to work on lots of
482    // systems, but fails e.g. on 64bit Linux.
483    va_end(args);
484    va_start(args, format);
485#endif
486
487    if (result == 0)
488        return String("");
489    if (result < 0)
490        return String();
491    unsigned len = result;
492    buffer.grow(len + 1);
493
494    // Now do the formatting again, guaranteed to fit.
495    vsnprintf(buffer.data(), buffer.size(), format, args);
496
497    va_end(args);
498
499    return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len);
500}
501
502String String::number(int number)
503{
504    return numberToStringSigned<String>(number);
505}
506
507String String::number(unsigned number)
508{
509    return numberToStringUnsigned<String>(number);
510}
511
512String String::number(long number)
513{
514    return numberToStringSigned<String>(number);
515}
516
517String String::number(unsigned long number)
518{
519    return numberToStringUnsigned<String>(number);
520}
521
522String String::number(long long number)
523{
524    return numberToStringSigned<String>(number);
525}
526
527String String::number(unsigned long long number)
528{
529    return numberToStringUnsigned<String>(number);
530}
531
532String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy)
533{
534    NumberToStringBuffer buffer;
535    return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros));
536}
537
538String String::numberToStringECMAScript(double number)
539{
540    NumberToStringBuffer buffer;
541    return String(numberToString(number, buffer));
542}
543
544String String::numberToStringFixedWidth(double number, unsigned decimalPlaces)
545{
546    NumberToStringBuffer buffer;
547    return String(numberToFixedWidthString(number, decimalPlaces, buffer));
548}
549
550int String::toIntStrict(bool* ok, int base) const
551{
552    if (!m_impl) {
553        if (ok)
554            *ok = false;
555        return 0;
556    }
557    return m_impl->toIntStrict(ok, base);
558}
559
560unsigned String::toUIntStrict(bool* ok, int base) const
561{
562    if (!m_impl) {
563        if (ok)
564            *ok = false;
565        return 0;
566    }
567    return m_impl->toUIntStrict(ok, base);
568}
569
570int64_t String::toInt64Strict(bool* ok, int base) const
571{
572    if (!m_impl) {
573        if (ok)
574            *ok = false;
575        return 0;
576    }
577    return m_impl->toInt64Strict(ok, base);
578}
579
580uint64_t String::toUInt64Strict(bool* ok, int base) const
581{
582    if (!m_impl) {
583        if (ok)
584            *ok = false;
585        return 0;
586    }
587    return m_impl->toUInt64Strict(ok, base);
588}
589
590intptr_t String::toIntPtrStrict(bool* ok, int base) const
591{
592    if (!m_impl) {
593        if (ok)
594            *ok = false;
595        return 0;
596    }
597    return m_impl->toIntPtrStrict(ok, base);
598}
599
600int String::toInt(bool* ok) const
601{
602    if (!m_impl) {
603        if (ok)
604            *ok = false;
605        return 0;
606    }
607    return m_impl->toInt(ok);
608}
609
610unsigned String::toUInt(bool* ok) const
611{
612    if (!m_impl) {
613        if (ok)
614            *ok = false;
615        return 0;
616    }
617    return m_impl->toUInt(ok);
618}
619
620int64_t String::toInt64(bool* ok) const
621{
622    if (!m_impl) {
623        if (ok)
624            *ok = false;
625        return 0;
626    }
627    return m_impl->toInt64(ok);
628}
629
630uint64_t String::toUInt64(bool* ok) const
631{
632    if (!m_impl) {
633        if (ok)
634            *ok = false;
635        return 0;
636    }
637    return m_impl->toUInt64(ok);
638}
639
640intptr_t String::toIntPtr(bool* ok) const
641{
642    if (!m_impl) {
643        if (ok)
644            *ok = false;
645        return 0;
646    }
647    return m_impl->toIntPtr(ok);
648}
649
650double String::toDouble(bool* ok) const
651{
652    if (!m_impl) {
653        if (ok)
654            *ok = false;
655        return 0.0;
656    }
657    return m_impl->toDouble(ok);
658}
659
660float String::toFloat(bool* ok) const
661{
662    if (!m_impl) {
663        if (ok)
664            *ok = false;
665        return 0.0f;
666    }
667    return m_impl->toFloat(ok);
668}
669
670String String::isolatedCopy() const
671{
672    if (!m_impl)
673        return String();
674    return m_impl->isolatedCopy();
675}
676
677bool String::isSafeToSendToAnotherThread() const
678{
679    if (!impl())
680        return true;
681    if (impl()->isStatic())
682        return true;
683    // AtomicStrings are not safe to send between threads as ~StringImpl()
684    // will try to remove them from the wrong AtomicStringTable.
685    if (impl()->isAtomic())
686        return false;
687    if (impl()->hasOneRef())
688        return true;
689    return false;
690}
691
692void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const
693{
694    result.clear();
695
696    unsigned startPos = 0;
697    size_t endPos;
698    while ((endPos = find(separator, startPos)) != kNotFound) {
699        if (allowEmptyEntries || startPos != endPos)
700            result.append(substring(startPos, endPos - startPos));
701        startPos = endPos + separator.length();
702    }
703    if (allowEmptyEntries || startPos != length())
704        result.append(substring(startPos));
705}
706
707void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const
708{
709    result.clear();
710
711    unsigned startPos = 0;
712    size_t endPos;
713    while ((endPos = find(separator, startPos)) != kNotFound) {
714        if (allowEmptyEntries || startPos != endPos)
715            result.append(substring(startPos, endPos - startPos));
716        startPos = endPos + 1;
717    }
718    if (allowEmptyEntries || startPos != length())
719        result.append(substring(startPos));
720}
721
722CString String::ascii() const
723{
724    // Printable ASCII characters 32..127 and the null character are
725    // preserved, characters outside of this range are converted to '?'.
726
727    unsigned length = this->length();
728    if (!length) {
729        char* characterBuffer;
730        return CString::newUninitialized(length, characterBuffer);
731    }
732
733    if (this->is8Bit()) {
734        const LChar* characters = this->characters8();
735
736        char* characterBuffer;
737        CString result = CString::newUninitialized(length, characterBuffer);
738
739        for (unsigned i = 0; i < length; ++i) {
740            LChar ch = characters[i];
741            characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
742        }
743
744        return result;
745    }
746
747    const UChar* characters = this->characters16();
748
749    char* characterBuffer;
750    CString result = CString::newUninitialized(length, characterBuffer);
751
752    for (unsigned i = 0; i < length; ++i) {
753        UChar ch = characters[i];
754        characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
755    }
756
757    return result;
758}
759
760CString String::latin1() const
761{
762    // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
763    // preserved, characters outside of this range are converted to '?'.
764
765    unsigned length = this->length();
766
767    if (!length)
768        return CString("", 0);
769
770    if (is8Bit())
771        return CString(reinterpret_cast<const char*>(this->characters8()), length);
772
773    const UChar* characters = this->characters16();
774
775    char* characterBuffer;
776    CString result = CString::newUninitialized(length, characterBuffer);
777
778    for (unsigned i = 0; i < length; ++i) {
779        UChar ch = characters[i];
780        characterBuffer[i] = ch > 0xff ? '?' : ch;
781    }
782
783    return result;
784}
785
786// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
787static inline void putUTF8Triple(char*& buffer, UChar ch)
788{
789    ASSERT(ch >= 0x0800);
790    *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
791    *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
792    *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
793}
794
795CString String::utf8(UTF8ConversionMode mode) const
796{
797    unsigned length = this->length();
798
799    if (!length)
800        return CString("", 0);
801
802    // Allocate a buffer big enough to hold all the characters
803    // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
804    // Optimization ideas, if we find this function is hot:
805    //  * We could speculatively create a CStringBuffer to contain 'length'
806    //    characters, and resize if necessary (i.e. if the buffer contains
807    //    non-ascii characters). (Alternatively, scan the buffer first for
808    //    ascii characters, so we know this will be sufficient).
809    //  * We could allocate a CStringBuffer with an appropriate size to
810    //    have a good chance of being able to write the string into the
811    //    buffer without reallocing (say, 1.5 x length).
812    if (length > numeric_limits<unsigned>::max() / 3)
813        return CString();
814    Vector<char, 1024> bufferVector(length * 3);
815
816    char* buffer = bufferVector.data();
817
818    if (is8Bit()) {
819        const LChar* characters = this->characters8();
820
821        ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
822        ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
823    } else {
824        const UChar* characters = this->characters16();
825
826        if (mode == StrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) {
827            const UChar* charactersEnd = characters + length;
828            char* bufferEnd = buffer + bufferVector.size();
829            while (characters < charactersEnd) {
830                // Use strict conversion to detect unpaired surrogates.
831                ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true);
832                ASSERT(result != targetExhausted);
833                // Conversion fails when there is an unpaired surrogate.
834                // Put replacement character (U+FFFD) instead of the unpaired surrogate.
835                if (result != conversionOK) {
836                    ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
837                    // There should be room left, since one UChar hasn't been converted.
838                    ASSERT((buffer + 3) <= bufferEnd);
839                    putUTF8Triple(buffer, replacementCharacter);
840                    ++characters;
841                }
842            }
843        } else {
844            bool strict = mode == StrictUTF8Conversion;
845            ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
846            ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
847
848            // Only produced from strict conversion.
849            if (result == sourceIllegal) {
850                ASSERT(strict);
851                return CString();
852            }
853
854            // Check for an unconverted high surrogate.
855            if (result == sourceExhausted) {
856                if (strict)
857                    return CString();
858                // This should be one unpaired high surrogate. Treat it the same
859                // was as an unpaired high surrogate would have been handled in
860                // the middle of a string with non-strict conversion - which is
861                // to say, simply encode it to UTF-8.
862                ASSERT((characters + 1) == (this->characters16() + length));
863                ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
864                // There should be room left, since one UChar hasn't been converted.
865                ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
866                putUTF8Triple(buffer, *characters);
867            }
868        }
869    }
870
871    return CString(bufferVector.data(), buffer - bufferVector.data());
872}
873
874String String::make8BitFrom16BitSource(const UChar* source, size_t length)
875{
876    if (!length)
877        return emptyString();
878
879    LChar* destination;
880    String result = String::createUninitialized(length, destination);
881
882    copyLCharsFromUCharSource(destination, source, length);
883
884    return result;
885}
886
887String String::make16BitFrom8BitSource(const LChar* source, size_t length)
888{
889    if (!length)
890        return emptyString16Bit();
891
892    UChar* destination;
893    String result = String::createUninitialized(length, destination);
894
895    StringImpl::copyChars(destination, source, length);
896
897    return result;
898}
899
900String String::fromUTF8(const LChar* stringStart, size_t length)
901{
902    RELEASE_ASSERT(length <= numeric_limits<unsigned>::max());
903
904    if (!stringStart)
905        return String();
906
907    if (!length)
908        return emptyString();
909
910    if (charactersAreAllASCII(stringStart, length))
911        return StringImpl::create(stringStart, length);
912
913    Vector<UChar, 1024> buffer(length);
914    UChar* bufferStart = buffer.data();
915
916    UChar* bufferCurrent = bufferStart;
917    const char* stringCurrent = reinterpret_cast<const char*>(stringStart);
918    if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK)
919        return String();
920
921    unsigned utf16Length = bufferCurrent - bufferStart;
922    ASSERT(utf16Length < length);
923    return StringImpl::create(bufferStart, utf16Length);
924}
925
926String String::fromUTF8(const LChar* string)
927{
928    if (!string)
929        return String();
930    return fromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
931}
932
933String String::fromUTF8(const CString& s)
934{
935    return fromUTF8(s.data());
936}
937
938String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size)
939{
940    String utf8 = fromUTF8(string, size);
941    if (!utf8)
942        return String(string, size);
943    return utf8;
944}
945
946// String Operations
947
948static bool isCharacterAllowedInBase(UChar c, int base)
949{
950    if (c > 0x7F)
951        return false;
952    if (isASCIIDigit(c))
953        return c - '0' < base;
954    if (isASCIIAlpha(c)) {
955        if (base > 36)
956            base = 36;
957        return (c >= 'a' && c < 'a' + base - 10)
958            || (c >= 'A' && c < 'A' + base - 10);
959    }
960    return false;
961}
962
// Parses |length| characters at |data| as an integer of type IntegralType in
// the given |base|.
//
// Accepted form: optional leading whitespace, an optional sign ('-' only for
// signed types, '+' for any type), one or more digits valid for |base|, and
// optional trailing whitespace. Anything else, a null |data|, or a value
// that does not fit makes the parse fail.
//
// On success returns the value and sets *ok (if |ok| is non-null) to true;
// on failure returns 0 and sets *ok to false.
//
// NOTE(review): |base| is used as a divisor below, so callers presumably
// never pass 0 - confirm.
template <typename IntegralType, typename CharType>
static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base)
{
    static const IntegralType integralMax = numeric_limits<IntegralType>::max();
    static const bool isSigned = numeric_limits<IntegralType>::is_signed;
    // Largest accumulated value that can still be multiplied by |base|.
    const IntegralType maxMultiplier = integralMax / base;

    IntegralType value = 0;
    bool isOk = false;
    bool isNegative = false;

    if (!data)
        goto bye;

    // Skip leading whitespace.
    while (length && isSpaceOrNewline(*data)) {
        --length;
        ++data;
    }

    // Optional sign; '-' is only recognized for signed result types.
    if (isSigned && length && *data == '-') {
        --length;
        ++data;
        isNegative = true;
    } else if (length && *data == '+') {
        --length;
        ++data;
    }

    // At least one valid digit is required.
    if (!length || !isCharacterAllowedInBase(*data, base))
        goto bye;

    while (length && isCharacterAllowedInBase(*data, base)) {
        --length;
        IntegralType digitValue;
        CharType c = *data;
        if (isASCIIDigit(c))
            digitValue = c - '0';
        else if (c >= 'a')
            digitValue = c - 'a' + 10;
        else
            digitValue = c - 'A' + 10;

        // Overflow check before accumulating. The "+ isNegative" term lets a
        // negative number have a magnitude one larger than integralMax.
        // NOTE(review): for signed types, accepting that extra magnitude
        // appears to rely on the accumulation below wrapping - confirm.
        if (value > maxMultiplier || (value == maxMultiplier && digitValue > (integralMax % base) + isNegative))
            goto bye;

        value = base * value + digitValue;
        ++data;
    }

    // MSVC warns (C4146) about unary minus applied to an unsigned type when
    // this template is instantiated for unsigned IntegralType.
#if COMPILER(MSVC)
#pragma warning(push, 0)
#pragma warning(disable:4146)
#endif

    if (isNegative)
        value = -value;

#if COMPILER(MSVC)
#pragma warning(pop)
#endif

    // Skip trailing whitespace.
    while (length && isSpaceOrNewline(*data)) {
        --length;
        ++data;
    }

    // Success only if the whole input was consumed.
    if (!length)
        isOk = true;
bye:
    if (ok)
        *ok = isOk;
    return isOk ? value : 0;
}
1038
// Returns the length of the prefix of |data| that looks like a decimal
// integer: any leading whitespace, at most one '+' or '-', then a run of ASCII
// digits. The digit run may be empty, so the result only bounds the prefix; it
// does not guarantee that the prefix is a valid number.
template <typename CharType>
static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length)
{
    size_t position = 0;

    // Leading whitespace.
    while (position != length && isSpaceOrNewline(data[position]))
        ++position;

    // Optional sign.
    if (position != length) {
        const CharType candidate = data[position];
        if (candidate == '+' || candidate == '-')
            ++position;
    }

    // Digits.
    while (position != length && isASCIIDigit(data[position]))
        ++position;

    return position;
}
1062
1063int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base)
1064{
1065    return toIntegralType<int, LChar>(data, length, ok, base);
1066}
1067
1068int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base)
1069{
1070    return toIntegralType<int, UChar>(data, length, ok, base);
1071}
1072
1073unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base)
1074{
1075    return toIntegralType<unsigned, LChar>(data, length, ok, base);
1076}
1077
1078unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base)
1079{
1080    return toIntegralType<unsigned, UChar>(data, length, ok, base);
1081}
1082
1083int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1084{
1085    return toIntegralType<int64_t, LChar>(data, length, ok, base);
1086}
1087
1088int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1089{
1090    return toIntegralType<int64_t, UChar>(data, length, ok, base);
1091}
1092
1093uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1094{
1095    return toIntegralType<uint64_t, LChar>(data, length, ok, base);
1096}
1097
1098uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1099{
1100    return toIntegralType<uint64_t, UChar>(data, length, ok, base);
1101}
1102
1103intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base)
1104{
1105    return toIntegralType<intptr_t, LChar>(data, length, ok, base);
1106}
1107
1108intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base)
1109{
1110    return toIntegralType<intptr_t, UChar>(data, length, ok, base);
1111}
1112
1113int charactersToInt(const LChar* data, size_t length, bool* ok)
1114{
1115    return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1116}
1117
1118int charactersToInt(const UChar* data, size_t length, bool* ok)
1119{
1120    return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10);
1121}
1122
1123unsigned charactersToUInt(const LChar* data, size_t length, bool* ok)
1124{
1125    return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1126}
1127
1128unsigned charactersToUInt(const UChar* data, size_t length, bool* ok)
1129{
1130    return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1131}
1132
1133int64_t charactersToInt64(const LChar* data, size_t length, bool* ok)
1134{
1135    return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1136}
1137
1138int64_t charactersToInt64(const UChar* data, size_t length, bool* ok)
1139{
1140    return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1141}
1142
1143uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok)
1144{
1145    return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1146}
1147
1148uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok)
1149{
1150    return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1151}
1152
1153intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok)
1154{
1155    return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1156}
1157
1158intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok)
1159{
1160    return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1161}
1162
// Selects whether characters left over after a parsed number make the parse
// fail (DisallowTrailingJunk) or are tolerated (AllowTrailingJunk).
enum TrailingJunkPolicy {
    DisallowTrailingJunk,
    AllowTrailingJunk
};
1164
1165template <typename CharType, TrailingJunkPolicy policy>
1166static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength)
1167{
1168    size_t leadingSpacesLength = 0;
1169    while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength]))
1170        ++leadingSpacesLength;
1171
1172    double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength);
1173    if (!parsedLength) {
1174        if (ok)
1175            *ok = false;
1176        return 0.0;
1177    }
1178
1179    parsedLength += leadingSpacesLength;
1180    if (ok)
1181        *ok = policy == AllowTrailingJunk || parsedLength == length;
1182    return number;
1183}
1184
1185double charactersToDouble(const LChar* data, size_t length, bool* ok)
1186{
1187    size_t parsedLength;
1188    return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1189}
1190
1191double charactersToDouble(const UChar* data, size_t length, bool* ok)
1192{
1193    size_t parsedLength;
1194    return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1195}
1196
1197float charactersToFloat(const LChar* data, size_t length, bool* ok)
1198{
1199    // FIXME: This will return ok even when the string fits into a double but not a float.
1200    size_t parsedLength;
1201    return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1202}
1203
1204float charactersToFloat(const UChar* data, size_t length, bool* ok)
1205{
1206    // FIXME: This will return ok even when the string fits into a double but not a float.
1207    size_t parsedLength;
1208    return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1209}
1210
1211float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength)
1212{
1213    // FIXME: This will return ok even when the string fits into a double but not a float.
1214    return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1215}
1216
1217float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength)
1218{
1219    // FIXME: This will return ok even when the string fits into a double but not a float.
1220    return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1221}
1222
1223const String& emptyString()
1224{
1225    DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty()));
1226    return emptyString;
1227}
1228
1229const String& emptyString16Bit()
1230{
1231    DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty16Bit()));
1232    return emptyString;
1233}
1234
1235} // namespace WTF
1236
1237#ifndef NDEBUG
1238// For use in the debugger
1239String* string(const char*);
1240Vector<char> asciiDebug(StringImpl* impl);
1241Vector<char> asciiDebug(String& string);
1242
1243void String::show() const
1244{
1245    dataLogF("%s\n", asciiDebug(impl()).data());
1246}
1247
1248String* string(const char* s)
1249{
1250    // leaks memory!
1251    return new String(s);
1252}
1253
1254Vector<char> asciiDebug(StringImpl* impl)
1255{
1256    if (!impl)
1257        return asciiDebug(String("[null]").impl());
1258
1259    Vector<char> buffer;
1260    for (unsigned i = 0; i < impl->length(); ++i) {
1261        UChar ch = (*impl)[i];
1262        if (isASCIIPrintable(ch)) {
1263            if (ch == '\\')
1264                buffer.append(ch);
1265            buffer.append(ch);
1266        } else {
1267            buffer.append('\\');
1268            buffer.append('u');
1269            appendUnsignedAsHexFixedSize(ch, buffer, 4);
1270        }
1271    }
1272    buffer.append('\0');
1273    return buffer;
1274}
1275
1276Vector<char> asciiDebug(String& string)
1277{
1278    return asciiDebug(string.impl());
1279}
1280
1281#endif
1282