1/*
2 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 *  Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 *  Copyright (C) 2009 Google Inc. All rights reserved.
6 *
7 *  This library is free software; you can redistribute it and/or
8 *  modify it under the terms of the GNU Library General Public
9 *  License as published by the Free Software Foundation; either
10 *  version 2 of the License, or (at your option) any later version.
11 *
12 *  This library is distributed in the hope that it will be useful,
13 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 *  Library General Public License for more details.
16 *
17 *  You should have received a copy of the GNU Library General Public License
18 *  along with this library; see the file COPYING.LIB.  If not, write to
19 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 *  Boston, MA 02110-1301, USA.
21 *
22 */
23
24#include "config.h"
25#include "UString.h"
26
27#include "JSGlobalObjectFunctions.h"
28#include "Heap.h"
29#include "Identifier.h"
30#include "Operations.h"
31#include <ctype.h>
32#include <limits.h>
33#include <limits>
34#include <stdio.h>
35#include <stdlib.h>
36#include <wtf/ASCIICType.h>
37#include <wtf/Assertions.h>
38#include <wtf/DecimalNumber.h>
39#include <wtf/MathExtras.h>
40#include <wtf/StringExtras.h>
41#include <wtf/Vector.h>
42#include <wtf/unicode/UTF8.h>
43
44#if HAVE(STRINGS_H)
45#include <strings.h>
46#endif
47
48using namespace WTF;
49using namespace WTF::Unicode;
50using namespace std;
51
52namespace JSC {
53
// Declarations only: the definitions of these two constants are not in the
// visible part of this file.
54extern const double NaN;
55extern const double Inf;
56
// Build-time check that a UString object is exactly the size of one pointer.
57COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
58
59// Construct a string with UTF-16 data.
60UString::UString(const UChar* characters, unsigned length)
61    : m_impl(characters ? StringImpl::create(characters, length) : 0)
62{
63}
64
65// Construct a string with UTF-16 data, from a null-terminated source.
66UString::UString(const UChar* characters)
67{
68    if (!characters)
69        return;
70
71    int length = 0;
72    while (characters[length] != UChar(0))
73        ++length;
74
75    m_impl = StringImpl::create(characters, length);
76}
77
78// Construct a string with latin1 data.
79UString::UString(const char* characters, unsigned length)
80    : m_impl(characters ? StringImpl::create(characters, length) : 0)
81{
82}
83
84// Construct a string with latin1 data, from a null-terminated source.
85UString::UString(const char* characters)
86    : m_impl(characters ? StringImpl::create(characters) : 0)
87{
88}
89
90UString UString::number(int i)
91{
92    UChar buf[1 + sizeof(i) * 3];
93    UChar* end = buf + WTF_ARRAY_LENGTH(buf);
94    UChar* p = end;
95
96    if (i == 0)
97        *--p = '0';
98    else if (i == INT_MIN) {
99        char minBuf[1 + sizeof(i) * 3];
100        snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN);
101        return UString(minBuf);
102    } else {
103        bool negative = false;
104        if (i < 0) {
105            negative = true;
106            i = -i;
107        }
108        while (i) {
109            *--p = static_cast<unsigned short>((i % 10) + '0');
110            i /= 10;
111        }
112        if (negative)
113            *--p = '-';
114    }
115
116    return UString(p, static_cast<unsigned>(end - p));
117}
118
119UString UString::number(long long i)
120{
121    UChar buf[1 + sizeof(i) * 3];
122    UChar* end = buf + WTF_ARRAY_LENGTH(buf);
123    UChar* p = end;
124
125    if (i == 0)
126        *--p = '0';
127    else if (i == std::numeric_limits<long long>::min()) {
128        char minBuf[1 + sizeof(i) * 3];
129#if OS(WINDOWS)
130        snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min());
131#else
132        snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min());
133#endif
134        return UString(minBuf);
135    } else {
136        bool negative = false;
137        if (i < 0) {
138            negative = true;
139            i = -i;
140        }
141        while (i) {
142            *--p = static_cast<unsigned short>((i % 10) + '0');
143            i /= 10;
144        }
145        if (negative)
146            *--p = '-';
147    }
148
149    return UString(p, static_cast<unsigned>(end - p));
150}
151
152UString UString::number(unsigned u)
153{
154    UChar buf[sizeof(u) * 3];
155    UChar* end = buf + WTF_ARRAY_LENGTH(buf);
156    UChar* p = end;
157
158    if (u == 0)
159        *--p = '0';
160    else {
161        while (u) {
162            *--p = static_cast<unsigned short>((u % 10) + '0');
163            u /= 10;
164        }
165    }
166
167    return UString(p, static_cast<unsigned>(end - p));
168}
169
170UString UString::number(long l)
171{
172    UChar buf[1 + sizeof(l) * 3];
173    UChar* end = buf + WTF_ARRAY_LENGTH(buf);
174    UChar* p = end;
175
176    if (l == 0)
177        *--p = '0';
178    else if (l == LONG_MIN) {
179        char minBuf[1 + sizeof(l) * 3];
180        snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN);
181        return UString(minBuf);
182    } else {
183        bool negative = false;
184        if (l < 0) {
185            negative = true;
186            l = -l;
187        }
188        while (l) {
189            *--p = static_cast<unsigned short>((l % 10) + '0');
190            l /= 10;
191        }
192        if (negative)
193            *--p = '-';
194    }
195
196    return UString(p, end - p);
197}
198
199UString UString::number(double d)
200{
201    NumberToStringBuffer buffer;
202    unsigned length = numberToString(d, buffer);
203    return UString(buffer, length);
204}
205
206UString UString::substringSharingImpl(unsigned offset, unsigned length) const
207{
208    // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
209
210    unsigned stringLength = this->length();
211    offset = min(offset, stringLength);
212    length = min(length, stringLength - offset);
213
214    if (!offset && length == stringLength)
215        return *this;
216    return UString(StringImpl::create(m_impl, offset, length));
217}
218
219bool operator==(const UString& s1, const char *s2)
220{
221    if (s2 == 0)
222        return s1.isEmpty();
223
224    const UChar* u = s1.characters();
225    const UChar* uend = u + s1.length();
226    while (u != uend && *s2) {
227        if (u[0] != (unsigned char)*s2)
228            return false;
229        s2++;
230        u++;
231    }
232
233    return u == uend && *s2 == 0;
234}
235
236bool operator<(const UString& s1, const UString& s2)
237{
238    const unsigned l1 = s1.length();
239    const unsigned l2 = s2.length();
240    const unsigned lmin = l1 < l2 ? l1 : l2;
241    const UChar* c1 = s1.characters();
242    const UChar* c2 = s2.characters();
243    unsigned l = 0;
244    while (l < lmin && *c1 == *c2) {
245        c1++;
246        c2++;
247        l++;
248    }
249    if (l < lmin)
250        return (c1[0] < c2[0]);
251
252    return (l1 < l2);
253}
254
255bool operator>(const UString& s1, const UString& s2)
256{
257    const unsigned l1 = s1.length();
258    const unsigned l2 = s2.length();
259    const unsigned lmin = l1 < l2 ? l1 : l2;
260    const UChar* c1 = s1.characters();
261    const UChar* c2 = s2.characters();
262    unsigned l = 0;
263    while (l < lmin && *c1 == *c2) {
264        c1++;
265        c2++;
266        l++;
267    }
268    if (l < lmin)
269        return (c1[0] > c2[0]);
270
271    return (l1 > l2);
272}
273
274CString UString::ascii() const
275{
276    // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
277    // preserved, characters outside of this range are converted to '?'.
278
279    unsigned length = this->length();
280    const UChar* characters = this->characters();
281
282    char* characterBuffer;
283    CString result = CString::newUninitialized(length, characterBuffer);
284
285    for (unsigned i = 0; i < length; ++i) {
286        UChar ch = characters[i];
287        characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
288    }
289
290    return result;
291}
292
293CString UString::latin1() const
294{
295    // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
296    // preserved, characters outside of this range are converted to '?'.
297
298    unsigned length = this->length();
299    const UChar* characters = this->characters();
300
301    char* characterBuffer;
302    CString result = CString::newUninitialized(length, characterBuffer);
303
304    for (unsigned i = 0; i < length; ++i) {
305        UChar ch = characters[i];
306        characterBuffer[i] = ch > 0xff ? '?' : ch;
307    }
308
309    return result;
310}
311
312// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
313static inline void putUTF8Triple(char*& buffer, UChar ch)
314{
315    ASSERT(ch >= 0x0800);
316    *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
317    *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
318    *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
319}
320
321CString UString::utf8(bool strict) const
322{
323    unsigned length = this->length();
324    const UChar* characters = this->characters();
325
326    // Allocate a buffer big enough to hold all the characters
327    // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
328    // Optimization ideas, if we find this function is hot:
329    //  * We could speculatively create a CStringBuffer to contain 'length'
330    //    characters, and resize if necessary (i.e. if the buffer contains
331    //    non-ascii characters). (Alternatively, scan the buffer first for
332    //    ascii characters, so we know this will be sufficient).
333    //  * We could allocate a CStringBuffer with an appropriate size to
334    //    have a good chance of being able to write the string into the
335    //    buffer without reallocing (say, 1.5 x length).
336    if (length > numeric_limits<unsigned>::max() / 3)
337        return CString();
338    Vector<char, 1024> bufferVector(length * 3);
339
340    char* buffer = bufferVector.data();
341    ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
342    ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
343
344    // Only produced from strict conversion.
345    if (result == sourceIllegal)
346        return CString();
347
348    // Check for an unconverted high surrogate.
349    if (result == sourceExhausted) {
350        if (strict)
351            return CString();
352        // This should be one unpaired high surrogate. Treat it the same
353        // was as an unpaired high surrogate would have been handled in
354        // the middle of a string with non-strict conversion - which is
355        // to say, simply encode it to UTF-8.
356        ASSERT((characters + 1) == (this->characters() + length));
357        ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
358        // There should be room left, since one UChar hasn't been converted.
359        ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
360        putUTF8Triple(buffer, *characters);
361    }
362
363    return CString(bufferVector.data(), buffer - bufferVector.data());
364}
365
366} // namespace JSC
367