1/*
2 * (C) 1999 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved.
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB.  If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22#include "config.h"
23#include "WTFString.h"
24
25#include "IntegerToStringConversion.h"
26#include <stdarg.h>
27#include "wtf/ASCIICType.h"
28#include "wtf/DataLog.h"
29#include "wtf/HexNumber.h"
30#include "wtf/MathExtras.h"
31#include "wtf/text/CString.h"
32#include "wtf/StringExtras.h"
33#include "wtf/Vector.h"
34#include "wtf/dtoa.h"
35#include "wtf/unicode/CharacterNames.h"
36#include "wtf/unicode/UTF8.h"
37#include "wtf/unicode/Unicode.h"
38
39using namespace std;
40
41namespace WTF {
42
43using namespace Unicode;
44using namespace std;
45
46// Construct a string with UTF-16 data.
47String::String(const UChar* characters, unsigned length)
48    : m_impl(characters ? StringImpl::create(characters, length) : 0)
49{
50}
51
52// Construct a string with UTF-16 data, from a null-terminated source.
53String::String(const UChar* str)
54{
55    if (!str)
56        return;
57    m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str));
58}
59
60// Construct a string with latin1 data.
61String::String(const LChar* characters, unsigned length)
62    : m_impl(characters ? StringImpl::create(characters, length) : 0)
63{
64}
65
66String::String(const char* characters, unsigned length)
67    : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0)
68{
69}
70
71// Construct a string with latin1 data, from a null-terminated source.
72String::String(const LChar* characters)
73    : m_impl(characters ? StringImpl::create(characters) : 0)
74{
75}
76
77String::String(const char* characters)
78    : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0)
79{
80}
81
82void String::append(const String& string)
83{
84    if (string.isEmpty())
85        return;
86    if (!m_impl) {
87        m_impl = string.m_impl;
88        return;
89    }
90
91    // FIXME: This is extremely inefficient. So much so that we might want to take this
92    // out of String's API. We can make it better by optimizing the case where exactly
93    // one String is pointing at this StringImpl, but even then it's going to require a
94    // call to fastMalloc every single time.
95
96    if (m_impl->is8Bit() && string.m_impl->is8Bit()) {
97        LChar* data;
98        RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
99        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
100        memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar));
101        memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar));
102        m_impl = newImpl.release();
103        return;
104    }
105
106    UChar* data;
107    RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
108    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
109
110    if (m_impl->is8Bit())
111        StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
112    else
113        StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
114
115    if (string.impl()->is8Bit())
116        StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length());
117    else
118        StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length());
119
120    m_impl = newImpl.release();
121}
122
123template <typename CharacterType>
124inline void String::appendInternal(CharacterType c)
125{
126    // FIXME: This is extremely inefficient. So much so that we might want to take this
127    // out of String's API. We can make it better by optimizing the case where exactly
128    // one String is pointing at this StringImpl, but even then it's going to require a
129    // call to fastMalloc every single time.
130    if (!m_impl) {
131        m_impl = StringImpl::create(&c, 1);
132        return;
133    }
134
135    UChar* data; // FIXME: We should be able to create an 8 bit string via this code path.
136    RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max());
137    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data);
138    if (m_impl->is8Bit())
139        StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
140    else
141        StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
142    data[m_impl->length()] = c;
143    m_impl = newImpl.release();
144}
145
146void String::append(LChar c)
147{
148    appendInternal(c);
149}
150
151void String::append(UChar c)
152{
153    appendInternal(c);
154}
155
156int codePointCompare(const String& a, const String& b)
157{
158    return codePointCompare(a.impl(), b.impl());
159}
160
161void String::insert(const String& string, unsigned position)
162{
163    if (string.isEmpty()) {
164        if (string.isNull())
165            return;
166        if (isNull())
167            m_impl = string.impl();
168        return;
169    }
170
171    if (string.is8Bit())
172        insert(string.impl()->characters8(), string.length(), position);
173    else
174        insert(string.impl()->characters16(), string.length(), position);
175}
176
177void String::append(const LChar* charactersToAppend, unsigned lengthToAppend)
178{
179    if (!m_impl) {
180        if (!charactersToAppend)
181            return;
182        m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
183        return;
184    }
185
186    if (!lengthToAppend)
187        return;
188
189    ASSERT(charactersToAppend);
190
191    unsigned strLength = m_impl->length();
192
193    if (m_impl->is8Bit()) {
194        RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
195        LChar* data;
196        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
197        StringImpl::copyChars(data, m_impl->characters8(), strLength);
198        StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
199        m_impl = newImpl.release();
200        return;
201    }
202
203    RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
204    UChar* data;
205    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data);
206    StringImpl::copyChars(data, m_impl->characters16(), strLength);
207    StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
208    m_impl = newImpl.release();
209}
210
211void String::append(const UChar* charactersToAppend, unsigned lengthToAppend)
212{
213    if (!m_impl) {
214        if (!charactersToAppend)
215            return;
216        m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
217        return;
218    }
219
220    if (!lengthToAppend)
221        return;
222
223    unsigned strLength = m_impl->length();
224
225    ASSERT(charactersToAppend);
226    RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
227    UChar* data;
228    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
229    if (m_impl->is8Bit())
230        StringImpl::copyChars(data, characters8(), strLength);
231    else
232        StringImpl::copyChars(data, characters16(), strLength);
233    StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
234    m_impl = newImpl.release();
235}
236
237template<typename CharType>
238PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position)
239{
240    if (!lengthToInsert)
241        return impl;
242
243    ASSERT(charactersToInsert);
244    UChar* data; // FIXME: We should be able to create an 8 bit string here.
245    RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length());
246    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data);
247
248    if (impl->is8Bit())
249        StringImpl::copyChars(data, impl->characters8(), position);
250    else
251        StringImpl::copyChars(data, impl->characters16(), position);
252
253    StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert);
254
255    if (impl->is8Bit())
256        StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position);
257    else
258        StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position);
259
260    return newImpl.release();
261}
262
263void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
264{
265    if (position >= length()) {
266        append(charactersToInsert, lengthToInsert);
267        return;
268    }
269    ASSERT(m_impl);
270    m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
271}
272
273void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
274{
275    if (position >= length()) {
276        append(charactersToInsert, lengthToInsert);
277        return;
278    }
279    ASSERT(m_impl);
280    m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
281}
282
283UChar32 String::characterStartingAt(unsigned i) const
284{
285    if (!m_impl || i >= m_impl->length())
286        return 0;
287    return m_impl->characterStartingAt(i);
288}
289
290void String::ensure16Bit()
291{
292    unsigned length = this->length();
293    if (!length || !is8Bit())
294        return;
295    m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl();
296}
297
298void String::truncate(unsigned position)
299{
300    if (position >= length())
301        return;
302    if (m_impl->is8Bit()) {
303        LChar* data;
304        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
305        memcpy(data, m_impl->characters8(), position * sizeof(LChar));
306        m_impl = newImpl.release();
307    } else {
308        UChar* data;
309        RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
310        memcpy(data, m_impl->characters16(), position * sizeof(UChar));
311        m_impl = newImpl.release();
312    }
313}
314
315template <typename CharacterType>
316inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove)
317{
318    CharacterType* data;
319    RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data);
320    memcpy(data, characters, position * sizeof(CharacterType));
321    memcpy(data + position, characters + position + lengthToRemove,
322        (length() - lengthToRemove - position) * sizeof(CharacterType));
323
324    m_impl = newImpl.release();
325}
326
327void String::remove(unsigned position, int lengthToRemove)
328{
329    if (lengthToRemove <= 0)
330        return;
331    if (position >= length())
332        return;
333    if (static_cast<unsigned>(lengthToRemove) > length() - position)
334        lengthToRemove = length() - position;
335
336    if (is8Bit()) {
337        removeInternal(characters8(), position, lengthToRemove);
338
339        return;
340    }
341
342    removeInternal(characters16(), position, lengthToRemove);
343}
344
345String String::substring(unsigned pos, unsigned len) const
346{
347    if (!m_impl)
348        return String();
349    return m_impl->substring(pos, len);
350}
351
352String String::lower() const
353{
354    if (!m_impl)
355        return String();
356    return m_impl->lower();
357}
358
359String String::upper() const
360{
361    if (!m_impl)
362        return String();
363    return m_impl->upper();
364}
365
366String String::stripWhiteSpace() const
367{
368    if (!m_impl)
369        return String();
370    return m_impl->stripWhiteSpace();
371}
372
373String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const
374{
375    if (!m_impl)
376        return String();
377    return m_impl->stripWhiteSpace(isWhiteSpace);
378}
379
380String String::simplifyWhiteSpace() const
381{
382    if (!m_impl)
383        return String();
384    return m_impl->simplifyWhiteSpace();
385}
386
387String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const
388{
389    if (!m_impl)
390        return String();
391    return m_impl->simplifyWhiteSpace(isWhiteSpace);
392}
393
394String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const
395{
396    if (!m_impl)
397        return String();
398    return m_impl->removeCharacters(findMatch);
399}
400
401String String::foldCase() const
402{
403    if (!m_impl)
404        return String();
405    return m_impl->foldCase();
406}
407
408bool String::percentage(int& result) const
409{
410    if (!m_impl || !m_impl->length())
411        return false;
412
413    if ((*m_impl)[m_impl->length() - 1] != '%')
414        return false;
415
416    if (m_impl->is8Bit())
417        result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1);
418    else
419        result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1);
420
421    return true;
422}
423
424Vector<UChar> String::charactersWithNullTermination() const
425{
426    if (!m_impl)
427        return Vector<UChar>();
428
429    Vector<UChar> result;
430    result.reserveInitialCapacity(length() + 1);
431    appendTo(result);
432    result.append(0);
433    return result;
434}
435
436unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const
437{
438    unsigned length = this->length();
439    RELEASE_ASSERT(pos <= length);
440    unsigned numCharacters = std::min(length - pos, maxLength);
441    if (!numCharacters)
442        return 0;
443    if (is8Bit())
444        StringImpl::copyChars(buffer, characters8() + pos, numCharacters);
445    else
446        StringImpl::copyChars(buffer, characters16() + pos, numCharacters);
447    return numCharacters;
448}
449
450String String::format(const char *format, ...)
451{
452    va_list args;
453    va_start(args, format);
454
455    Vector<char, 256> buffer;
456
457    // Do the format once to get the length.
458#if COMPILER(MSVC)
459    int result = _vscprintf(format, args);
460#else
461    char ch;
462    int result = vsnprintf(&ch, 1, format, args);
463    // We need to call va_end() and then va_start() again here, as the
464    // contents of args is undefined after the call to vsnprintf
465    // according to http://man.cx/snprintf(3)
466    //
467    // Not calling va_end/va_start here happens to work on lots of
468    // systems, but fails e.g. on 64bit Linux.
469    va_end(args);
470    va_start(args, format);
471#endif
472
473    if (result == 0)
474        return String("");
475    if (result < 0)
476        return String();
477    unsigned len = result;
478    buffer.grow(len + 1);
479
480    // Now do the formatting again, guaranteed to fit.
481    vsnprintf(buffer.data(), buffer.size(), format, args);
482
483    va_end(args);
484
485    return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len);
486}
487
488String String::number(int number)
489{
490    return numberToStringSigned<String>(number);
491}
492
493String String::number(unsigned int number)
494{
495    return numberToStringUnsigned<String>(number);
496}
497
498String String::number(long number)
499{
500    return numberToStringSigned<String>(number);
501}
502
503String String::number(unsigned long number)
504{
505    return numberToStringUnsigned<String>(number);
506}
507
508String String::number(long long number)
509{
510    return numberToStringSigned<String>(number);
511}
512
513String String::number(unsigned long long number)
514{
515    return numberToStringUnsigned<String>(number);
516}
517
518String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy)
519{
520    NumberToStringBuffer buffer;
521    return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros));
522}
523
524String String::numberToStringECMAScript(double number)
525{
526    NumberToStringBuffer buffer;
527    return String(numberToString(number, buffer));
528}
529
530String String::numberToStringFixedWidth(double number, unsigned decimalPlaces)
531{
532    NumberToStringBuffer buffer;
533    return String(numberToFixedWidthString(number, decimalPlaces, buffer));
534}
535
536int String::toIntStrict(bool* ok, int base) const
537{
538    if (!m_impl) {
539        if (ok)
540            *ok = false;
541        return 0;
542    }
543    return m_impl->toIntStrict(ok, base);
544}
545
546unsigned String::toUIntStrict(bool* ok, int base) const
547{
548    if (!m_impl) {
549        if (ok)
550            *ok = false;
551        return 0;
552    }
553    return m_impl->toUIntStrict(ok, base);
554}
555
556int64_t String::toInt64Strict(bool* ok, int base) const
557{
558    if (!m_impl) {
559        if (ok)
560            *ok = false;
561        return 0;
562    }
563    return m_impl->toInt64Strict(ok, base);
564}
565
566uint64_t String::toUInt64Strict(bool* ok, int base) const
567{
568    if (!m_impl) {
569        if (ok)
570            *ok = false;
571        return 0;
572    }
573    return m_impl->toUInt64Strict(ok, base);
574}
575
576intptr_t String::toIntPtrStrict(bool* ok, int base) const
577{
578    if (!m_impl) {
579        if (ok)
580            *ok = false;
581        return 0;
582    }
583    return m_impl->toIntPtrStrict(ok, base);
584}
585
586int String::toInt(bool* ok) const
587{
588    if (!m_impl) {
589        if (ok)
590            *ok = false;
591        return 0;
592    }
593    return m_impl->toInt(ok);
594}
595
596unsigned String::toUInt(bool* ok) const
597{
598    if (!m_impl) {
599        if (ok)
600            *ok = false;
601        return 0;
602    }
603    return m_impl->toUInt(ok);
604}
605
606int64_t String::toInt64(bool* ok) const
607{
608    if (!m_impl) {
609        if (ok)
610            *ok = false;
611        return 0;
612    }
613    return m_impl->toInt64(ok);
614}
615
616uint64_t String::toUInt64(bool* ok) const
617{
618    if (!m_impl) {
619        if (ok)
620            *ok = false;
621        return 0;
622    }
623    return m_impl->toUInt64(ok);
624}
625
626intptr_t String::toIntPtr(bool* ok) const
627{
628    if (!m_impl) {
629        if (ok)
630            *ok = false;
631        return 0;
632    }
633    return m_impl->toIntPtr(ok);
634}
635
636double String::toDouble(bool* ok) const
637{
638    if (!m_impl) {
639        if (ok)
640            *ok = false;
641        return 0.0;
642    }
643    return m_impl->toDouble(ok);
644}
645
646float String::toFloat(bool* ok) const
647{
648    if (!m_impl) {
649        if (ok)
650            *ok = false;
651        return 0.0f;
652    }
653    return m_impl->toFloat(ok);
654}
655
656String String::isolatedCopy() const
657{
658    if (!m_impl)
659        return String();
660    return m_impl->isolatedCopy();
661}
662
663bool String::isSafeToSendToAnotherThread() const
664{
665    if (!impl())
666        return true;
667    if (impl()->isStatic())
668        return true;
669    // AtomicStrings are not safe to send between threads as ~StringImpl()
670    // will try to remove them from the wrong AtomicStringTable.
671    if (impl()->isAtomic())
672        return false;
673    if (impl()->hasOneRef())
674        return true;
675    return false;
676}
677
678void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const
679{
680    result.clear();
681
682    unsigned startPos = 0;
683    size_t endPos;
684    while ((endPos = find(separator, startPos)) != notFound) {
685        if (allowEmptyEntries || startPos != endPos)
686            result.append(substring(startPos, endPos - startPos));
687        startPos = endPos + separator.length();
688    }
689    if (allowEmptyEntries || startPos != length())
690        result.append(substring(startPos));
691}
692
693void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const
694{
695    result.clear();
696
697    unsigned startPos = 0;
698    size_t endPos;
699    while ((endPos = find(separator, startPos)) != notFound) {
700        if (allowEmptyEntries || startPos != endPos)
701            result.append(substring(startPos, endPos - startPos));
702        startPos = endPos + 1;
703    }
704    if (allowEmptyEntries || startPos != length())
705        result.append(substring(startPos));
706}
707
708CString String::ascii() const
709{
710    // Printable ASCII characters 32..127 and the null character are
711    // preserved, characters outside of this range are converted to '?'.
712
713    unsigned length = this->length();
714    if (!length) {
715        char* characterBuffer;
716        return CString::newUninitialized(length, characterBuffer);
717    }
718
719    if (this->is8Bit()) {
720        const LChar* characters = this->characters8();
721
722        char* characterBuffer;
723        CString result = CString::newUninitialized(length, characterBuffer);
724
725        for (unsigned i = 0; i < length; ++i) {
726            LChar ch = characters[i];
727            characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
728        }
729
730        return result;
731    }
732
733    const UChar* characters = this->characters16();
734
735    char* characterBuffer;
736    CString result = CString::newUninitialized(length, characterBuffer);
737
738    for (unsigned i = 0; i < length; ++i) {
739        UChar ch = characters[i];
740        characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
741    }
742
743    return result;
744}
745
746CString String::latin1() const
747{
748    // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
749    // preserved, characters outside of this range are converted to '?'.
750
751    unsigned length = this->length();
752
753    if (!length)
754        return CString("", 0);
755
756    if (is8Bit())
757        return CString(reinterpret_cast<const char*>(this->characters8()), length);
758
759    const UChar* characters = this->characters16();
760
761    char* characterBuffer;
762    CString result = CString::newUninitialized(length, characterBuffer);
763
764    for (unsigned i = 0; i < length; ++i) {
765        UChar ch = characters[i];
766        characterBuffer[i] = ch > 0xff ? '?' : ch;
767    }
768
769    return result;
770}
771
772// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
773static inline void putUTF8Triple(char*& buffer, UChar ch)
774{
775    ASSERT(ch >= 0x0800);
776    *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
777    *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
778    *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
779}
780
781CString String::utf8(ConversionMode mode) const
782{
783    unsigned length = this->length();
784
785    if (!length)
786        return CString("", 0);
787
788    // Allocate a buffer big enough to hold all the characters
789    // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
790    // Optimization ideas, if we find this function is hot:
791    //  * We could speculatively create a CStringBuffer to contain 'length'
792    //    characters, and resize if necessary (i.e. if the buffer contains
793    //    non-ascii characters). (Alternatively, scan the buffer first for
794    //    ascii characters, so we know this will be sufficient).
795    //  * We could allocate a CStringBuffer with an appropriate size to
796    //    have a good chance of being able to write the string into the
797    //    buffer without reallocing (say, 1.5 x length).
798    if (length > numeric_limits<unsigned>::max() / 3)
799        return CString();
800    Vector<char, 1024> bufferVector(length * 3);
801
802    char* buffer = bufferVector.data();
803
804    if (is8Bit()) {
805        const LChar* characters = this->characters8();
806
807        ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
808        ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
809    } else {
810        const UChar* characters = this->characters16();
811
812        if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) {
813            const UChar* charactersEnd = characters + length;
814            char* bufferEnd = buffer + bufferVector.size();
815            while (characters < charactersEnd) {
816                // Use strict conversion to detect unpaired surrogates.
817                ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true);
818                ASSERT(result != targetExhausted);
819                // Conversion fails when there is an unpaired surrogate.
820                // Put replacement character (U+FFFD) instead of the unpaired surrogate.
821                if (result != conversionOK) {
822                    ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
823                    // There should be room left, since one UChar hasn't been converted.
824                    ASSERT((buffer + 3) <= bufferEnd);
825                    putUTF8Triple(buffer, replacementCharacter);
826                    ++characters;
827                }
828            }
829        } else {
830            bool strict = mode == StrictConversion;
831            ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
832            ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
833
834            // Only produced from strict conversion.
835            if (result == sourceIllegal) {
836                ASSERT(strict);
837                return CString();
838            }
839
840            // Check for an unconverted high surrogate.
841            if (result == sourceExhausted) {
842                if (strict)
843                    return CString();
844                // This should be one unpaired high surrogate. Treat it the same
845                // was as an unpaired high surrogate would have been handled in
846                // the middle of a string with non-strict conversion - which is
847                // to say, simply encode it to UTF-8.
848                ASSERT((characters + 1) == (this->characters16() + length));
849                ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
850                // There should be room left, since one UChar hasn't been converted.
851                ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
852                putUTF8Triple(buffer, *characters);
853            }
854        }
855    }
856
857    return CString(bufferVector.data(), buffer - bufferVector.data());
858}
859
860String String::make8BitFrom16BitSource(const UChar* source, size_t length)
861{
862    if (!length)
863        return String();
864
865    LChar* destination;
866    String result = String::createUninitialized(length, destination);
867
868    copyLCharsFromUCharSource(destination, source, length);
869
870    return result;
871}
872
873String String::make16BitFrom8BitSource(const LChar* source, size_t length)
874{
875    if (!length)
876        return String();
877
878    UChar* destination;
879    String result = String::createUninitialized(length, destination);
880
881    StringImpl::copyChars(destination, source, length);
882
883    return result;
884}
885
886String String::fromUTF8(const LChar* stringStart, size_t length)
887{
888    RELEASE_ASSERT(length <= numeric_limits<unsigned>::max());
889
890    if (!stringStart)
891        return String();
892
893    if (!length)
894        return emptyString();
895
896    if (charactersAreAllASCII(stringStart, length))
897        return StringImpl::create(stringStart, length);
898
899    Vector<UChar, 1024> buffer(length);
900    UChar* bufferStart = buffer.data();
901
902    UChar* bufferCurrent = bufferStart;
903    const char* stringCurrent = reinterpret_cast<const char*>(stringStart);
904    if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK)
905        return String();
906
907    unsigned utf16Length = bufferCurrent - bufferStart;
908    ASSERT(utf16Length < length);
909    return StringImpl::create(bufferStart, utf16Length);
910}
911
912String String::fromUTF8(const LChar* string)
913{
914    if (!string)
915        return String();
916    return fromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
917}
918
919String String::fromUTF8(const CString& s)
920{
921    return fromUTF8(s.data());
922}
923
924String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size)
925{
926    String utf8 = fromUTF8(string, size);
927    if (!utf8)
928        return String(string, size);
929    return utf8;
930}
931
932// String Operations
933
934static bool isCharacterAllowedInBase(UChar c, int base)
935{
936    if (c > 0x7F)
937        return false;
938    if (isASCIIDigit(c))
939        return c - '0' < base;
940    if (isASCIIAlpha(c)) {
941        if (base > 36)
942            base = 36;
943        return (c >= 'a' && c < 'a' + base - 10)
944            || (c >= 'A' && c < 'A' + base - 10);
945    }
946    return false;
947}
948
// Parses all of |data| (|length| characters) as an integer written in |base|.
//
// Accepted syntax: optional leading whitespace, an optional sign ('-' is only
// recognised when IntegralType is signed), at least one digit that is valid in
// |base|, then optional trailing whitespace. Any other character, a null
// |data|, a non-positive |base|, or a value that does not fit in IntegralType
// makes the parse fail.
//
// Returns the parsed value on success and 0 on failure. When |ok| is non-null
// it is set to true on success and to false on failure.
template <typename IntegralType, typename CharType>
static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base)
{
    static const IntegralType integralMax = std::numeric_limits<IntegralType>::max();
    static const IntegralType integralMin = std::numeric_limits<IntegralType>::min();
    static const bool isSigned = std::numeric_limits<IntegralType>::is_signed;

    // Report failure by default; only a complete, in-range parse flips it.
    if (ok)
        *ok = false;

    // No character is a digit in a non-positive base, and rejecting it here
    // keeps the divisions below from dividing by zero.
    if (!data || base <= 0)
        return 0;

    // Largest/smallest accumulator values that can still take one more digit,
    // and the largest digit allowed when the accumulator sits exactly on that
    // limit. Integer division truncates toward zero, so minQuotient <= 0 and
    // integralMin % base <= 0.
    const IntegralType maxQuotient = integralMax / base;
    const IntegralType maxRemainder = integralMax % base;
    const IntegralType minQuotient = integralMin / base;
    const IntegralType minRemainder = 0 - (integralMin % base);

    // Skip leading whitespace.
    while (length && isSpaceOrNewline(*data)) {
        --length;
        ++data;
    }

    bool isNegative = false;
    if (isSigned && length && *data == '-') {
        --length;
        ++data;
        isNegative = true;
    } else if (length && *data == '+') {
        --length;
        ++data;
    }

    // At least one digit is required.
    if (!length || !isCharacterAllowedInBase(*data, base))
        return 0;

    IntegralType value = 0;
    while (length && isCharacterAllowedInBase(*data, base)) {
        const CharType c = *data;
        IntegralType digitValue;
        if (isASCIIDigit(c))
            digitValue = c - '0';
        else if (c >= 'a')
            digitValue = c - 'a' + 10;
        else
            digitValue = c - 'A' + 10;

        if (isNegative) {
            // Negative numbers are accumulated downwards so that the minimum
            // value is reachable without ever overflowing the signed type.
            if (value < minQuotient || (value == minQuotient && digitValue > minRemainder))
                return 0;
            value = value * base - digitValue;
        } else {
            if (value > maxQuotient || (value == maxQuotient && digitValue > maxRemainder))
                return 0;
            value = value * base + digitValue;
        }

        --length;
        ++data;
    }

    // Skip trailing whitespace.
    while (length && isSpaceOrNewline(*data)) {
        --length;
        ++data;
    }

    // Anything left over is junk, which the strict grammar does not allow.
    if (length)
        return 0;

    if (ok)
        *ok = true;
    return value;
}
1024
// Measures the prefix of |data| that looks like a decimal integer: any amount
// of leading whitespace, at most one '+' or '-', then a run of ASCII digits.
// Returns the number of characters in that prefix; the digit run may be empty.
template <typename CharType>
static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length)
{
    size_t position = 0;

    // Leading whitespace.
    while (position != length && isSpaceOrNewline(data[position]))
        ++position;

    // A single optional sign.
    if (position != length) {
        const CharType candidate = data[position];
        if (candidate == '+' || candidate == '-')
            ++position;
    }

    // The digits themselves.
    while (position != length && isASCIIDigit(data[position]))
        ++position;

    return position;
}
1048
1049int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base)
1050{
1051    return toIntegralType<int, LChar>(data, length, ok, base);
1052}
1053
1054int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base)
1055{
1056    return toIntegralType<int, UChar>(data, length, ok, base);
1057}
1058
1059unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base)
1060{
1061    return toIntegralType<unsigned, LChar>(data, length, ok, base);
1062}
1063
1064unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base)
1065{
1066    return toIntegralType<unsigned, UChar>(data, length, ok, base);
1067}
1068
1069int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1070{
1071    return toIntegralType<int64_t, LChar>(data, length, ok, base);
1072}
1073
1074int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1075{
1076    return toIntegralType<int64_t, UChar>(data, length, ok, base);
1077}
1078
1079uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1080{
1081    return toIntegralType<uint64_t, LChar>(data, length, ok, base);
1082}
1083
1084uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1085{
1086    return toIntegralType<uint64_t, UChar>(data, length, ok, base);
1087}
1088
1089intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base)
1090{
1091    return toIntegralType<intptr_t, LChar>(data, length, ok, base);
1092}
1093
1094intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base)
1095{
1096    return toIntegralType<intptr_t, UChar>(data, length, ok, base);
1097}
1098
1099int charactersToInt(const LChar* data, size_t length, bool* ok)
1100{
1101    return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1102}
1103
1104int charactersToInt(const UChar* data, size_t length, bool* ok)
1105{
1106    return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10);
1107}
1108
1109unsigned charactersToUInt(const LChar* data, size_t length, bool* ok)
1110{
1111    return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1112}
1113
1114unsigned charactersToUInt(const UChar* data, size_t length, bool* ok)
1115{
1116    return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1117}
1118
1119int64_t charactersToInt64(const LChar* data, size_t length, bool* ok)
1120{
1121    return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1122}
1123
1124int64_t charactersToInt64(const UChar* data, size_t length, bool* ok)
1125{
1126    return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1127}
1128
1129uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok)
1130{
1131    return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1132}
1133
1134uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok)
1135{
1136    return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1137}
1138
1139intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok)
1140{
1141    return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1142}
1143
1144intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok)
1145{
1146    return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1147}
1148
// Selects whether characters left over after a parsed number count as failure.
enum TrailingJunkPolicy {
    DisallowTrailingJunk, // The number must extend to the end of the input.
    AllowTrailingJunk // Characters after the number are tolerated.
};
1150
1151template <typename CharType, TrailingJunkPolicy policy>
1152static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength)
1153{
1154    size_t leadingSpacesLength = 0;
1155    while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength]))
1156        ++leadingSpacesLength;
1157
1158    double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength);
1159    if (!parsedLength) {
1160        if (ok)
1161            *ok = false;
1162        return 0.0;
1163    }
1164
1165    parsedLength += leadingSpacesLength;
1166    if (ok)
1167        *ok = policy == AllowTrailingJunk || parsedLength == length;
1168    return number;
1169}
1170
1171double charactersToDouble(const LChar* data, size_t length, bool* ok)
1172{
1173    size_t parsedLength;
1174    return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1175}
1176
1177double charactersToDouble(const UChar* data, size_t length, bool* ok)
1178{
1179    size_t parsedLength;
1180    return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1181}
1182
1183float charactersToFloat(const LChar* data, size_t length, bool* ok)
1184{
1185    // FIXME: This will return ok even when the string fits into a double but not a float.
1186    size_t parsedLength;
1187    return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1188}
1189
1190float charactersToFloat(const UChar* data, size_t length, bool* ok)
1191{
1192    // FIXME: This will return ok even when the string fits into a double but not a float.
1193    size_t parsedLength;
1194    return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1195}
1196
1197float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength)
1198{
1199    // FIXME: This will return ok even when the string fits into a double but not a float.
1200    return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1201}
1202
1203float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength)
1204{
1205    // FIXME: This will return ok even when the string fits into a double but not a float.
1206    return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1207}
1208
1209const String& emptyString()
1210{
1211    DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty()));
1212    return emptyString;
1213}
1214
1215} // namespace WTF
1216
1217#ifndef NDEBUG
1218// For use in the debugger
1219String* string(const char*);
1220Vector<char> asciiDebug(StringImpl* impl);
1221Vector<char> asciiDebug(String& string);
1222
1223void String::show() const
1224{
1225    dataLogF("%s\n", asciiDebug(impl()).data());
1226}
1227
1228String* string(const char* s)
1229{
1230    // leaks memory!
1231    return new String(s);
1232}
1233
1234Vector<char> asciiDebug(StringImpl* impl)
1235{
1236    if (!impl)
1237        return asciiDebug(String("[null]").impl());
1238
1239    Vector<char> buffer;
1240    for (unsigned i = 0; i < impl->length(); ++i) {
1241        UChar ch = (*impl)[i];
1242        if (isASCIIPrintable(ch)) {
1243            if (ch == '\\')
1244                buffer.append(ch);
1245            buffer.append(ch);
1246        } else {
1247            buffer.append('\\');
1248            buffer.append('u');
1249            appendUnsignedAsHexFixedSize(ch, buffer, 4);
1250        }
1251    }
1252    buffer.append('\0');
1253    return buffer;
1254}
1255
1256Vector<char> asciiDebug(String& string)
1257{
1258    return asciiDebug(string.impl());
1259}
1260
1261#endif
1262