/*
 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "config.h"
2781a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include "wtf/text/TextCodecUTF8.h"
285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2981a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include "wtf/text/TextCodecASCIIFastPath.h"
30591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch#include "wtf/text/CString.h"
31591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch#include "wtf/text/StringBuffer.h"
32591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch#include "wtf/unicode/CharacterNames.h"
335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)using namespace WTF;
355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)using namespace WTF::Unicode;
365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)using namespace std;
375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3881a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)namespace WTF {
395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
// Sentinel returned by the sequence decoder when the bytes do not form a
// valid UTF-8 encoded character. Negative, so it can never equal a code point.
static const int nonCharacter = -1;
415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return adoptPtr(new TextCodecUTF8);
455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    registrar("UTF-8", "UTF-8");
505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Additional aliases that originally were present in the encoding
525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // table in WebKit on Macintosh, and subsequently added by
535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // TextCodecICU. Perhaps we can prove some are not used on the web
545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // and remove them.
555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    registrar("unicode11utf8", "UTF-8");
565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    registrar("unicode20utf8", "UTF-8");
575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    registrar("utf8", "UTF-8");
585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    registrar("x-unicode20utf8", "UTF-8");
59bfe3590b1806e3ff18f46ee3af5d4b83078f305aTorne (Richard Coles)
60bfe3590b1806e3ff18f46ee3af5d4b83078f305aTorne (Richard Coles)    // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
61bfe3590b1806e3ff18f46ee3af5d4b83078f305aTorne (Richard Coles)    // and Firefox (24), but not in ICU 4.6.
62bfe3590b1806e3ff18f46ee3af5d4b83078f305aTorne (Richard Coles)    registrar("unicode-1-1-utf-8", "UTF-8");
635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    registrar("UTF-8", create, 0);
685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
// Maps the first byte of a sequence to the number of bytes the sequence is
// expected to occupy:
//   0x00-0x7F -> 0 (ASCII, not a multi-byte sequence)
//   0x80-0xDF -> 2 (0x80-0xC1 are rejected later by the sequence decoder)
//   0xE0-0xEF -> 3
//   0xF0-0xF4 -> 4
//   0xF5-0xFF -> 0 (never valid in UTF-8)
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    // Bytes above 0xF4 share a high nibble with the valid four-byte lead
    // bytes, so they have to be excluded before the nibble lookup.
    if (firstByte > 0xF4)
        return 0;

    // One entry per high nibble of the first byte.
    static const uint8_t lengthByHighNibble[16] = {
        0, 0, 0, 0, 0, 0, 0, 0,
        2, 2, 2, 2, 2, 2, 3, 4
    };
    return lengthByHighNibble[firstByte >> 4];
}
925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(!isASCII(sequence[0]));
965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (length == 2) {
975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        ASSERT(sequence[0] <= 0xDF);
985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (sequence[0] < 0xC2)
995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return nonCharacter;
1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
1015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return nonCharacter;
1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (length == 3) {
1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        switch (sequence[0]) {
1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        case 0xE0:
1085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                return nonCharacter;
1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            break;
1115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        case 0xED:
1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (sequence[1] < 0x80 || sequence[1] > 0x9F)
1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                return nonCharacter;
1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            break;
1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        default:
1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (sequence[1] < 0x80 || sequence[1] > 0xBF)
1175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                return nonCharacter;
1185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
1195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
1205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return nonCharacter;
1215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
1225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(length == 4);
1245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
1255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    switch (sequence[0]) {
1265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    case 0xF0:
1275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (sequence[1] < 0x90 || sequence[1] > 0xBF)
1285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return nonCharacter;
1295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        break;
1305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    case 0xF4:
1315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (sequence[1] < 0x80 || sequence[1] > 0x8F)
1325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return nonCharacter;
1335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        break;
1345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    default:
1355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
1365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return nonCharacter;
1375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
1395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return nonCharacter;
1405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
1415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return nonCharacter;
1425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
1435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)static inline UChar* appendCharacter(UChar* destination, int character)
1465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(character != nonCharacter);
1485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(!U_IS_SURROGATE(character));
1495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (U_IS_BMP(character))
1505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destination++ = character;
1515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    else {
1525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destination++ = U16_LEAD(character);
1535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destination++ = U16_TRAIL(character);
1545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return destination;
1565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void TextCodecUTF8::consumePartialSequenceByte()
1595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    --m_partialSequenceSize;
1615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
1625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
1655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    sawError = true;
1675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (stopOnError)
1685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return;
1695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Each error generates a replacement character and consumes one byte.
1705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    *destination++ = replacementCharacter;
1715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    consumePartialSequenceByte();
1725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)template <>
1755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
1765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(m_partialSequenceSize);
1785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    do {
1795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (isASCII(m_partialSequence[0])) {
1805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destination++ = m_partialSequence[0];
1815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            consumePartialSequenceByte();
1825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            continue;
1835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
1845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int count = nonASCIISequenceLength(m_partialSequence[0]);
1855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (!count)
1865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return true;
1875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (count > m_partialSequenceSize) {
1895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (count - m_partialSequenceSize > end - source) {
1905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (!flush) {
1915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    // The new data is not enough to complete the sequence, so
1925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    // add it to the existing partial sequence.
1935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
1945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    m_partialSequenceSize += end - source;
1955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    return false;
1965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                }
1975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // An incomplete partial sequence at the end is an error, but it will create
1985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
1995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // the error.
2005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                return true;
2015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
2025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
2035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source += count - m_partialSequenceSize;
2045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            m_partialSequenceSize = count;
2055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int character = decodeNonASCIISequence(m_partialSequence, count);
2075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if ((character == nonCharacter) || (character > 0xff))
2085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            return true;
2095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        m_partialSequenceSize -= count;
2115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destination++ = character;
2125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    } while (m_partialSequenceSize);
2135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
2155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
2165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)template <>
2185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
2195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
2205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(m_partialSequenceSize);
2215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    do {
2225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (isASCII(m_partialSequence[0])) {
2235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destination++ = m_partialSequence[0];
2245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            consumePartialSequenceByte();
2255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            continue;
2265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int count = nonASCIISequenceLength(m_partialSequence[0]);
2285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (!count) {
2295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            handleError(destination, stopOnError, sawError);
2305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (stopOnError)
2315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                return false;
2325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            continue;
2335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (count > m_partialSequenceSize) {
2355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (count - m_partialSequenceSize > end - source) {
2365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (!flush) {
2375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    // The new data is not enough to complete the sequence, so
2385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    // add it to the existing partial sequence.
2395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
2405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    m_partialSequenceSize += end - source;
2415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    return false;
2425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                }
2435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // An incomplete partial sequence at the end is an error.
2445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                handleError(destination, stopOnError, sawError);
2455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (stopOnError)
2465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    return false;
2475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                continue;
2485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
2495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
2505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source += count - m_partialSequenceSize;
2515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            m_partialSequenceSize = count;
2525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int character = decodeNonASCIISequence(m_partialSequence, count);
2545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (character == nonCharacter) {
2555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            handleError(destination, stopOnError, sawError);
2565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (stopOnError)
2575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                return false;
2585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            continue;
2595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        m_partialSequenceSize -= count;
2625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        destination = appendCharacter(destination, character);
2635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    } while (m_partialSequenceSize);
2645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
2665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
26702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
268d5428f32f5d1719f774f62e19147104ca245a3abTorne (Richard Coles)String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
2695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
2705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Each input byte might turn into a character.
2715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // That includes all bytes in the partial-sequence buffer because
2725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // each byte in an invalid sequence will turn into a replacement character.
2735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    StringBuffer<LChar> buffer(m_partialSequenceSize + length);
2745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
2765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    const uint8_t* end = source + length;
2775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    const uint8_t* alignedEnd = alignToMachineWord(end);
2785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    LChar* destination = buffer.characters();
2795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    do {
2815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (m_partialSequenceSize) {
2825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // Explicitly copy destination and source pointers to avoid taking pointers to the
2835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // local variables, which may harm code generation by disabling some optimizations
2845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // in some compilers.
2855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            LChar* destinationForHandlePartialSequence = destination;
2865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            const uint8_t* sourceForHandlePartialSequence = source;
2875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
2885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source = sourceForHandlePartialSequence;
2895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                goto upConvertTo16Bit;
2905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
2915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destination = destinationForHandlePartialSequence;
2925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source = sourceForHandlePartialSequence;
2935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (m_partialSequenceSize)
2945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                break;
2955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (source < end) {
2985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (isASCII(*source)) {
2995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // Fast path for ASCII. Most UTF-8 text will be ASCII.
3005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (isAlignedToMachineWord(source)) {
3015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    while (source < alignedEnd) {
3025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
3035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        if (!isAllASCII<LChar>(chunk))
3045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                            break;
3055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        copyASCIIMachineWord(destination, source);
3065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        source += sizeof(MachineWord);
3075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        destination += sizeof(MachineWord);
3085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    }
3095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    if (source == end)
3105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        break;
3115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    if (!isASCII(*source))
3125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        continue;
3135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                }
3145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                *destination++ = *source++;
3155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                continue;
3165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            int count = nonASCIISequenceLength(*source);
3185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            int character;
3195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (!count)
3205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                character = nonCharacter;
3215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            else {
3225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (count > end - source) {
323926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
3245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    ASSERT(!m_partialSequenceSize);
3255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    m_partialSequenceSize = end - source;
3265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    memcpy(m_partialSequence, source, m_partialSequenceSize);
3275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    source = end;
3285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    break;
3295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                }
3305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                character = decodeNonASCIISequence(source, count);
3315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (character == nonCharacter) {
3335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                sawError = true;
3345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (stopOnError)
3355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    break;
33602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
3375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                goto upConvertTo16Bit;
3385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (character > 0xff)
3405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                goto upConvertTo16Bit;
3415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source += count;
3435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destination++ = character;
3445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
3455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    } while (flush && m_partialSequenceSize);
3465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    buffer.shrink(destination - buffer.characters());
3485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return String::adopt(buffer);
3505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)upConvertTo16Bit:
3525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
3535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    UChar* destination16 = buffer16.characters();
3555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Copy the already converted characters
3575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
3585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destination16++ = *converted8++;
3595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    do {
3615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (m_partialSequenceSize) {
3625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // Explicitly copy destination and source pointers to avoid taking pointers to the
3635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // local variables, which may harm code generation by disabling some optimizations
3645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // in some compilers.
3655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            UChar* destinationForHandlePartialSequence = destination16;
3665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            const uint8_t* sourceForHandlePartialSequence = source;
3675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
3685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destination16 = destinationForHandlePartialSequence;
3695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source = sourceForHandlePartialSequence;
3705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (m_partialSequenceSize)
3715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                break;
3725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
37302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
3745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (source < end) {
3755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (isASCII(*source)) {
3765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // Fast path for ASCII. Most UTF-8 text will be ASCII.
3775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (isAlignedToMachineWord(source)) {
3785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    while (source < alignedEnd) {
3795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
3805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        if (!isAllASCII<LChar>(chunk))
3815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                            break;
3825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        copyASCIIMachineWord(destination16, source);
3835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        source += sizeof(MachineWord);
3845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        destination16 += sizeof(MachineWord);
3855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    }
3865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    if (source == end)
3875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        break;
3885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    if (!isASCII(*source))
3895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                        continue;
3905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                }
3915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                *destination16++ = *source++;
3925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                continue;
3935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            int count = nonASCIISequenceLength(*source);
3955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            int character;
3965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (!count)
3975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                character = nonCharacter;
3985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            else {
3995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (count > end - source) {
400926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
4015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    ASSERT(!m_partialSequenceSize);
4025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    m_partialSequenceSize = end - source;
4035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    memcpy(m_partialSequence, source, m_partialSequenceSize);
4045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    source = end;
4055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    break;
4065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                }
4075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                character = decodeNonASCIISequence(source, count);
4085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
4095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (character == nonCharacter) {
4105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                sawError = true;
4115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                if (stopOnError)
4125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                    break;
4135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                // Each error generates a replacement character and consumes one byte.
4145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                *destination16++ = replacementCharacter;
4155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                ++source;
4165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                continue;
4175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
4185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source += count;
4195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destination16 = appendCharacter(destination16, character);
4205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
4215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    } while (flush && m_partialSequenceSize);
42202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
4235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    buffer16.shrink(destination16 - buffer16.characters());
42402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
4255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return String::adopt(buffer16);
4265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
4275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
428591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdochtemplate<typename CharType>
429591b958dee2cf159d33a0b931e6231072eaf38d5Ben MurdochCString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
4305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
4315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
4325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
4335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
4345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (length > numeric_limits<size_t>::max() / 3)
4355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        CRASH();
4365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    Vector<uint8_t> bytes(length * 3);
4375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    size_t i = 0;
4395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    size_t bytesWritten = 0;
4405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (i < length) {
4415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        UChar32 character;
4425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        U16_NEXT(characters, i, length, character);
4438abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)        // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
4448abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)        // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
4458abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)        if (0xD800 <= character && character <= 0xDFFF)
4468abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)            character = replacementCharacter;
4475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
4485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
4495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
4515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
4525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
453591b958dee2cf159d33a0b931e6231072eaf38d5Ben MurdochCString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
454591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch{
455591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch    return encodeCommon(characters, length);
456591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch}
457591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch
458591b958dee2cf159d33a0b931e6231072eaf38d5Ben MurdochCString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
459591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch{
460591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch    return encodeCommon(characters, length);
461591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch}
462591b958dee2cf159d33a0b931e6231072eaf38d5Ben Murdoch
46381a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)} // namespace WTF
464