1/*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "wtf/text/TextCodecUTF8.h"
28
29#include "wtf/text/TextCodecASCIIFastPath.h"
30#include "wtf/text/CString.h"
31#include "wtf/text/StringBuffer.h"
32#include "wtf/unicode/CharacterNames.h"
33
34using namespace WTF;
35using namespace WTF::Unicode;
36using namespace std;
37
38namespace WTF {
39
// Sentinel used in this file for "no valid code point": decodeNonASCIISequence()
// returns it for a malformed byte sequence, and the decoding loops compare
// against it to detect errors. It is negative, so it can never equal a code
// point produced from a well-formed sequence.
const int nonCharacter = -1;
41
// Factory function handed to the codec registrar in registerCodecs().
// Both parameters are unnamed and unused; a fresh TextCodecUTF8 is allocated
// and its ownership is transferred to the caller through the returned pointer.
PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
{
    return adoptPtr(new TextCodecUTF8);
}
46
47void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
48{
49    registrar("UTF-8", "UTF-8");
50
51    // Additional aliases that originally were present in the encoding
52    // table in WebKit on Macintosh, and subsequently added by
53    // TextCodecICU. Perhaps we can prove some are not used on the web
54    // and remove them.
55    registrar("unicode11utf8", "UTF-8");
56    registrar("unicode20utf8", "UTF-8");
57    registrar("utf8", "UTF-8");
58    registrar("x-unicode20utf8", "UTF-8");
59
60    // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
61    // and Firefox (24), but not in ICU 4.6.
62    registrar("unicode-1-1-utf-8", "UTF-8");
63}
64
// Registers the factory for this codec: |registrar| is called with the
// canonical name "UTF-8", the create() function, and 0 as the third argument.
void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("UTF-8", create, 0);
}
69
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence introduced
// by |firstByte|, or 0 if |firstByte| cannot start a multi-byte sequence.
//
// 0 is returned for:
//   - ASCII bytes (0x00-0x7F), which callers handle before calling this;
//   - continuation bytes (0x80-0xBF), which are never valid as a first byte;
//   - 0xC0 and 0xC1, which could only introduce overlong encodings of ASCII;
//   - 0xF5-0xFF, which would encode values above U+10FFFF.
//
// Classifying 0x80-0xC1 as invalid here (rather than as 2-byte leads) lets the
// decoder report a stray byte at the end of a chunk immediately instead of
// buffering it as if it were the start of an incomplete sequence.
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    static const uint8_t lengths[256] = {
        // 0x00-0x7F: ASCII.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0x80-0xBF: continuation bytes, invalid as a first byte.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0xC0-0xDF: 0xC0/0xC1 are overlong-only leads; 0xC2-0xDF start 2-byte sequences.
        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0-0xEF: 3-byte sequences.
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0-0xF4: 4-byte sequences; 0xF5-0xFF are never valid.
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    return lengths[firstByte];
}
92
93static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
94{
95    ASSERT(!isASCII(sequence[0]));
96    if (length == 2) {
97        ASSERT(sequence[0] <= 0xDF);
98        if (sequence[0] < 0xC2)
99            return nonCharacter;
100        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
101            return nonCharacter;
102        return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
103    }
104    if (length == 3) {
105        ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
106        switch (sequence[0]) {
107        case 0xE0:
108            if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
109                return nonCharacter;
110            break;
111        case 0xED:
112            if (sequence[1] < 0x80 || sequence[1] > 0x9F)
113                return nonCharacter;
114            break;
115        default:
116            if (sequence[1] < 0x80 || sequence[1] > 0xBF)
117                return nonCharacter;
118        }
119        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
120            return nonCharacter;
121        return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
122    }
123    ASSERT(length == 4);
124    ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
125    switch (sequence[0]) {
126    case 0xF0:
127        if (sequence[1] < 0x90 || sequence[1] > 0xBF)
128            return nonCharacter;
129        break;
130    case 0xF4:
131        if (sequence[1] < 0x80 || sequence[1] > 0x8F)
132            return nonCharacter;
133        break;
134    default:
135        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
136            return nonCharacter;
137    }
138    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
139        return nonCharacter;
140    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
141        return nonCharacter;
142    return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
143}
144
145static inline UChar* appendCharacter(UChar* destination, int character)
146{
147    ASSERT(character != nonCharacter);
148    ASSERT(!U_IS_SURROGATE(character));
149    if (U_IS_BMP(character))
150        *destination++ = character;
151    else {
152        *destination++ = U16_LEAD(character);
153        *destination++ = U16_TRAIL(character);
154    }
155    return destination;
156}
157
158void TextCodecUTF8::consumePartialSequenceByte()
159{
160    --m_partialSequenceSize;
161    memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
162}
163
164void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
165{
166    sawError = true;
167    if (stopOnError)
168        return;
169    // Each error generates a replacement character and consumes one byte.
170    *destination++ = replacementCharacter;
171    consumePartialSequenceByte();
172}
173
174template <>
175bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
176{
177    ASSERT(m_partialSequenceSize);
178    do {
179        if (isASCII(m_partialSequence[0])) {
180            *destination++ = m_partialSequence[0];
181            consumePartialSequenceByte();
182            continue;
183        }
184        int count = nonASCIISequenceLength(m_partialSequence[0]);
185        if (!count)
186            return true;
187
188        if (count > m_partialSequenceSize) {
189            if (count - m_partialSequenceSize > end - source) {
190                if (!flush) {
191                    // The new data is not enough to complete the sequence, so
192                    // add it to the existing partial sequence.
193                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
194                    m_partialSequenceSize += end - source;
195                    return false;
196                }
197                // An incomplete partial sequence at the end is an error, but it will create
198                // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
199                // the error.
200                return true;
201            }
202            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
203            source += count - m_partialSequenceSize;
204            m_partialSequenceSize = count;
205        }
206        int character = decodeNonASCIISequence(m_partialSequence, count);
207        if ((character == nonCharacter) || (character > 0xff))
208            return true;
209
210        m_partialSequenceSize -= count;
211        *destination++ = character;
212    } while (m_partialSequenceSize);
213
214    return false;
215}
216
217template <>
218bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
219{
220    ASSERT(m_partialSequenceSize);
221    do {
222        if (isASCII(m_partialSequence[0])) {
223            *destination++ = m_partialSequence[0];
224            consumePartialSequenceByte();
225            continue;
226        }
227        int count = nonASCIISequenceLength(m_partialSequence[0]);
228        if (!count) {
229            handleError(destination, stopOnError, sawError);
230            if (stopOnError)
231                return false;
232            continue;
233        }
234        if (count > m_partialSequenceSize) {
235            if (count - m_partialSequenceSize > end - source) {
236                if (!flush) {
237                    // The new data is not enough to complete the sequence, so
238                    // add it to the existing partial sequence.
239                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
240                    m_partialSequenceSize += end - source;
241                    return false;
242                }
243                // An incomplete partial sequence at the end is an error.
244                handleError(destination, stopOnError, sawError);
245                if (stopOnError)
246                    return false;
247                continue;
248            }
249            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
250            source += count - m_partialSequenceSize;
251            m_partialSequenceSize = count;
252        }
253        int character = decodeNonASCIISequence(m_partialSequence, count);
254        if (character == nonCharacter) {
255            handleError(destination, stopOnError, sawError);
256            if (stopOnError)
257                return false;
258            continue;
259        }
260
261        m_partialSequenceSize -= count;
262        destination = appendCharacter(destination, character);
263    } while (m_partialSequenceSize);
264
265    return false;
266}
267
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result.
//
// Bytes of a sequence left incomplete by an earlier call (kept in
// m_partialSequence) are processed first. A sequence truncated at the end of
// this input is saved there for the next call; when |flush| is set (it is
// tested as a boolean here) such a leftover is treated as an error instead.
//
// Decoding starts optimistically into an 8-bit (LChar) buffer. At the first
// code point above 0xFF, or the first malformed sequence that must be replaced,
// control jumps to upConvertTo16Bit: the characters produced so far are copied
// into a 16-bit (UChar) buffer and decoding resumes from the same source
// position in the 16-bit loop.
//
// |sawError| is set to true when malformed input is found; it is never reset
// to false here. With |stopOnError| decoding stops at the first malformed
// sequence and the text decoded so far is returned; otherwise every offending
// byte yields one replacementCharacter.
String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    // NOTE(review): the sum is computed from a size_t |length|; confirm that
    // StringBuffer's length parameter can represent it for very large inputs.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    // Upper bound for the word-at-a-time ASCII fast path.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    // 8-bit phase. The outer loop runs again only when flushing with a
    // partial sequence still buffered.
    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                // The buffered sequence cannot be represented in 8 bits;
                // the 16-bit path will process it again.
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Bytes are still buffered: leave the loop and return what has
            // been decoded so far.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    // Copy whole machine words for as long as they contain only ASCII.
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop may have stopped on a non-ASCII byte;
                    // re-dispatch on it.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence is cut off by the end of the input: stash
                    // the bytes seen so far in the partial-sequence buffer.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // A replacement character is needed, which does not fit in
                // 8 bits; |source| still points at the offending byte.
                goto upConvertTo16Bit;
            }
            // Valid code point, but too wide for the 8-bit buffer; |source|
            // has not been advanced, so the 16-bit loop decodes it again.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
    } while (flush && m_partialSequenceSize);

    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

upConvertTo16Bit:
    // 16-bit phase: same structure as above, but errors are replaced in place
    // rather than forcing a change of representation.
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Bytes are still buffered (more input is needed, or decoding
            // stopped on an error): leave the loop.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // Truncated sequence at the end of the input: keep it
                    // for the next call (or for the flush pass of the outer loop).
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination16 = appendCharacter(destination16, character);
        }
    } while (flush && m_partialSequenceSize);

    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}
427
428template<typename CharType>
429CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
430{
431    // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
432    // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
433    // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
434    if (length > numeric_limits<size_t>::max() / 3)
435        CRASH();
436    Vector<uint8_t> bytes(length * 3);
437
438    size_t i = 0;
439    size_t bytesWritten = 0;
440    while (i < length) {
441        UChar32 character;
442        U16_NEXT(characters, i, length, character);
443        // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
444        // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
445        if (0xD800 <= character && character <= 0xDFFF)
446            character = replacementCharacter;
447        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
448    }
449
450    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
451}
452
// Encodes UTF-16 input as UTF-8 by forwarding to encodeCommon(). The
// UnencodableHandling argument is unnamed and ignored.
CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
{
    return encodeCommon(characters, length);
}
457
// Encodes 8-bit (LChar) input as UTF-8 by forwarding to encodeCommon(). The
// UnencodableHandling argument is unnamed and ignored.
CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
{
    return encodeCommon(characters, length);
}
462
463} // namespace WTF
464