/*
 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
252fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
262fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include "config.h"
272fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include "TextCodecUTF8.h"
282fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
2981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch#include "TextCodecASCIIFastPath.h"
302fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include <wtf/text/CString.h>
312fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include <wtf/text/StringBuffer.h>
3281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch#include <wtf/unicode/CharacterNames.h>
332fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
342fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockusing namespace WTF::Unicode;
352fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockusing namespace std;
362fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
372fc2651226baac27029e38c9d6ef883fa32084dbSteve Blocknamespace WebCore {
382fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
// Sentinel used in this file in place of a decoded code point when a byte
// sequence is not valid UTF-8. It is negative, so it cannot collide with a
// real code point.
static const int nonCharacter = -1;
402fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
412fc2651226baac27029e38c9d6ef883fa32084dbSteve BlockPassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
422fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{
432fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    return adoptPtr(new TextCodecUTF8);
442fc2651226baac27029e38c9d6ef883fa32084dbSteve Block}
452fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
462fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockvoid TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
472fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{
482fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    registrar("UTF-8", "UTF-8");
4981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch
5081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // Additional aliases that originally were present in the encoding
5181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // table in WebKit on Macintosh, and subsequently added by
5281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // TextCodecICU. Perhaps we can prove some are not used on the web
5381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // and remove them.
5481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    registrar("unicode11utf8", "UTF-8");
5581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    registrar("unicode20utf8", "UTF-8");
5681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    registrar("utf8", "UTF-8");
5781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    registrar("x-unicode20utf8", "UTF-8");
582fc2651226baac27029e38c9d6ef883fa32084dbSteve Block}
592fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
602fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockvoid TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
612fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{
622fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    registrar("UTF-8", create, 0);
632fc2651226baac27029e38c9d6ef883fa32084dbSteve Block}
642fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
// starts with |firstByte|, or 0 if |firstByte| cannot start a multi-byte
// sequence.
//
// Zero is returned for:
//   - ASCII bytes (0x00-0x7F), which are not multi-byte leads;
//   - continuation bytes (0x80-0xBF);
//   - 0xC0 and 0xC1, which could only introduce overlong 2-byte encodings;
//   - 0xF5-0xFF, which would encode values above U+10FFFF.
//
// Reporting 0 for continuation bytes and 0xC0/0xC1 lets callers flag the error
// as soon as the byte is seen, instead of first buffering it as the start of
// a 2-byte sequence that can never decode successfully.
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    static const uint8_t lengths[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 (continuation)
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 (continuation)
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 (continuation)
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 (continuation)
        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 (0xC0/0xC1 invalid)
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 0xF0 (0xF5+ invalid)
    };
    return lengths[firstByte];
}
872fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
8881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochstatic inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
892fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{
902fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    ASSERT(!isASCII(sequence[0]));
912fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    if (length == 2) {
922fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        ASSERT(sequence[0] <= 0xDF);
932fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        if (sequence[0] < 0xC2)
9481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            return nonCharacter;
952fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
9681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            return nonCharacter;
972fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
982fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    }
992fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    if (length == 3) {
1002fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
1012fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        switch (sequence[0]) {
1022fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        case 0xE0:
1032fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
10481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                return nonCharacter;
1052fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            break;
1062fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        case 0xED:
1072fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            if (sequence[1] < 0x80 || sequence[1] > 0x9F)
10881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                return nonCharacter;
1092fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            break;
1102fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        default:
1112fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            if (sequence[1] < 0x80 || sequence[1] > 0xBF)
11281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                return nonCharacter;
1132fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        }
1142fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
11581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            return nonCharacter;
1162fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
1172fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    }
1182fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    ASSERT(length == 4);
1192fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
1202fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    switch (sequence[0]) {
1212fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    case 0xF0:
1222fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        if (sequence[1] < 0x90 || sequence[1] > 0xBF)
12381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            return nonCharacter;
1242fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        break;
1252fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    case 0xF4:
1262fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        if (sequence[1] < 0x80 || sequence[1] > 0x8F)
12781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            return nonCharacter;
1282fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        break;
1292fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    default:
1302fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
13181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            return nonCharacter;
1322fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    }
1332fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
13481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        return nonCharacter;
1352fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
13681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        return nonCharacter;
1372fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
1382fc2651226baac27029e38c9d6ef883fa32084dbSteve Block}
1392fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
14081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochstatic inline UChar* appendCharacter(UChar* destination, int character)
14181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{
14281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    ASSERT(character != nonCharacter);
14381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    ASSERT(!U_IS_SURROGATE(character));
14481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    if (U_IS_BMP(character))
14581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        *destination++ = character;
14681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    else {
14781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        *destination++ = U16_LEAD(character);
14881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        *destination++ = U16_TRAIL(character);
14981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    }
15081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    return destination;
15181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch}
15281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch
15381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochvoid TextCodecUTF8::consumePartialSequenceByte()
15481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{
15581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    --m_partialSequenceSize;
15681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
15781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch}
15881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch
15981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochvoid TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
16081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{
16181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    sawError = true;
16281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    if (stopOnError)
16381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        return;
16481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // Each error generates a replacement character and consumes one byte.
16581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    *destination++ = replacementCharacter;
16681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    consumePartialSequenceByte();
16781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch}
16881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch
16981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochvoid TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
17081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{
17181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    ASSERT(m_partialSequenceSize);
17281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    do {
17381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        if (isASCII(m_partialSequence[0])) {
17481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            *destination++ = m_partialSequence[0];
17581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            consumePartialSequenceByte();
17681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            continue;
17781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        }
17881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        int count = nonASCIISequenceLength(m_partialSequence[0]);
17981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        if (!count) {
18081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            handleError(destination, stopOnError, sawError);
18181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (stopOnError)
18281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                return;
18381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            continue;
18481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        }
18581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        if (count > m_partialSequenceSize) {
18681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (count - m_partialSequenceSize > end - source) {
18781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                if (!flush) {
18881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    // The new data is not enough to complete the sequence, so
18981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    // add it to the existing partial sequence.
19081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
19181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    m_partialSequenceSize += end - source;
19281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    return;
19381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                }
19481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                // An incomplete partial sequence at the end is an error.
19581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                handleError(destination, stopOnError, sawError);
19681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                if (stopOnError)
19781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    return;
19881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                continue;
19981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            }
20081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
20181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            source += count - m_partialSequenceSize;
20281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            m_partialSequenceSize = count;
20381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        }
20481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        int character = decodeNonASCIISequence(m_partialSequence, count);
20581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        if (character == nonCharacter) {
20681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            handleError(destination, stopOnError, sawError);
20781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (stopOnError)
20881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                return;
20981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            continue;
21081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        }
21181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        m_partialSequenceSize -= count;
21281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        destination = appendCharacter(destination, character);
21381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    } while (m_partialSequenceSize);
21481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch}
21581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch
2162fc2651226baac27029e38c9d6ef883fa32084dbSteve BlockString TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
2172fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{
21881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // Each input byte might turn into a character.
21981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // That includes all bytes in the partial-sequence buffer because
22081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    // each byte in an invalid sequence will turn into a replacement character.
22181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    StringBuffer buffer(m_partialSequenceSize + length);
2222fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
2232fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
2242fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    const uint8_t* end = source + length;
2252fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    const uint8_t* alignedEnd = alignToMachineWord(end);
2262fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    UChar* destination = buffer.characters();
2272fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
22881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    do {
22981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        if (m_partialSequenceSize) {
23081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            // Explicitly copy destination and source pointers to avoid taking pointers to the
23181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            // local variables, which may harm code generation by disabling some optimizations
23281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            // in some compilers.
23381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            UChar* destinationForHandlePartialSequence = destination;
23481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            const uint8_t* sourceForHandlePartialSequence = source;
23581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
23681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            destination = destinationForHandlePartialSequence;
23781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            source = sourceForHandlePartialSequence;
23881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (m_partialSequenceSize)
23981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                break;
2402fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        }
2412fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
24281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch        while (source < end) {
24381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (isASCII(*source)) {
24481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                // Fast path for ASCII. Most UTF-8 text will be ASCII.
24581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                if (isAlignedToMachineWord(source)) {
24681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    while (source < alignedEnd) {
24781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
24881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        if (!isAllASCII(chunk))
2492fc2651226baac27029e38c9d6ef883fa32084dbSteve Block                            break;
25081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        copyASCIIMachineWord(destination, source);
25181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        source += sizeof(MachineWord);
25281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        destination += sizeof(MachineWord);
2532fc2651226baac27029e38c9d6ef883fa32084dbSteve Block                    }
25481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    if (source == end)
25581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        break;
25681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    if (!isASCII(*source))
25781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                        continue;
2582fc2651226baac27029e38c9d6ef883fa32084dbSteve Block                }
25981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                *destination++ = *source++;
26081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                continue;
2612fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            }
26281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            int count = nonASCIISequenceLength(*source);
26381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            int character;
26481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (!count)
26581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                character = nonCharacter;
26681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            else {
26781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                if (count > end - source) {
26881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
26981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    ASSERT(!m_partialSequenceSize);
27081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    m_partialSequenceSize = end - source;
27181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    memcpy(m_partialSequence, source, m_partialSequenceSize);
27281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    source = end;
2732fc2651226baac27029e38c9d6ef883fa32084dbSteve Block                    break;
2742fc2651226baac27029e38c9d6ef883fa32084dbSteve Block                }
27581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                character = decodeNonASCIISequence(source, count);
2762fc2651226baac27029e38c9d6ef883fa32084dbSteve Block            }
27781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            if (character == nonCharacter) {
27881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                sawError = true;
27981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                if (stopOnError)
28081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                    break;
28181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                // Each error generates a replacement character and consumes one byte.
28281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                *destination++ = replacementCharacter;
28381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                ++source;
28481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch                continue;
28581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            }
28681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            source += count;
28781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch            destination = appendCharacter(destination, character);
2882fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        }
28981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch    } while (flush && m_partialSequenceSize);
2902fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
2912fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    buffer.shrink(destination - buffer.characters());
2922fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
2932fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    return String::adopt(buffer);
2942fc2651226baac27029e38c9d6ef883fa32084dbSteve Block}
2952fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
2962fc2651226baac27029e38c9d6ef883fa32084dbSteve BlockCString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
2972fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{
2982fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
2992fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
3002fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
3012fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    if (length > numeric_limits<size_t>::max() / 3)
3022fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        CRASH();
3032fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    Vector<uint8_t> bytes(length * 3);
3042fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
3052fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    size_t i = 0;
3062fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    size_t bytesWritten = 0;
3072fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    while (i < length) {
3082fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        UChar32 character;
3092fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        U16_NEXT(characters, i, length, character);
3102fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
3112fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    }
3122fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
3132fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
3142fc2651226baac27029e38c9d6ef883fa32084dbSteve Block}
3152fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
3162fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} // namespace WebCore
317