12fc2651226baac27029e38c9d6ef883fa32084dbSteve Block/* 22fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 32fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * 42fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * Redistribution and use in source and binary forms, with or without 52fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * modification, are permitted provided that the following conditions 62fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * are met: 72fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * 1. Redistributions of source code must retain the above copyright 82fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * notice, this list of conditions and the following disclaimer. 92fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * 2. Redistributions in binary form must reproduce the above copyright 102fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * notice, this list of conditions and the following disclaimer in the 112fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * documentation and/or other materials provided with the distribution. 122fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * 132fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 142fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 152fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 162fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 172fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 182fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 192fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 202fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 212fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 222fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 232fc2651226baac27029e38c9d6ef883fa32084dbSteve Block * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 242fc2651226baac27029e38c9d6ef883fa32084dbSteve Block */ 252fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 262fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include "config.h" 272fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include "TextCodecUTF8.h" 282fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 2981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch#include "TextCodecASCIIFastPath.h" 302fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include <wtf/text/CString.h> 312fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include <wtf/text/StringBuffer.h> 3281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch#include <wtf/unicode/CharacterNames.h> 332fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 342fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockusing namespace WTF::Unicode; 352fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockusing namespace std; 362fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 372fc2651226baac27029e38c9d6ef883fa32084dbSteve Blocknamespace WebCore { 382fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 3981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochconst int nonCharacter = -1; 402fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 412fc2651226baac27029e38c9d6ef883fa32084dbSteve BlockPassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) 422fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 432fc2651226baac27029e38c9d6ef883fa32084dbSteve Block return adoptPtr(new TextCodecUTF8); 442fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 452fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 462fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockvoid TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) 472fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 482fc2651226baac27029e38c9d6ef883fa32084dbSteve Block registrar("UTF-8", "UTF-8"); 4981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 5081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // Additional aliases that originally were present in the encoding 5181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // table in WebKit on Macintosh, and subsequently added by 5281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // TextCodecICU. Perhaps we can prove some are not used on the web 5381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // and remove them. 5481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch registrar("unicode11utf8", "UTF-8"); 5581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch registrar("unicode20utf8", "UTF-8"); 5681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch registrar("utf8", "UTF-8"); 5781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch registrar("x-unicode20utf8", "UTF-8"); 582fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 592fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 602fc2651226baac27029e38c9d6ef883fa32084dbSteve Blockvoid TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) 612fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 622fc2651226baac27029e38c9d6ef883fa32084dbSteve Block registrar("UTF-8", create, 0); 632fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 642fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 6581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochstatic inline int nonASCIISequenceLength(uint8_t firstByte) 662fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 6781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch static const uint8_t lengths[256] = { 6881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 8481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch }; 8581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return lengths[firstByte]; 862fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 872fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 8881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochstatic inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length) 892fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 902fc2651226baac27029e38c9d6ef883fa32084dbSteve Block ASSERT(!isASCII(sequence[0])); 912fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (length == 2) { 922fc2651226baac27029e38c9d6ef883fa32084dbSteve Block ASSERT(sequence[0] <= 0xDF); 932fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[0] < 0xC2) 9481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 952fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0x80 || sequence[1] > 0xBF) 9681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 972fc2651226baac27029e38c9d6ef883fa32084dbSteve Block return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 982fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 992fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (length == 3) { 1002fc2651226baac27029e38c9d6ef883fa32084dbSteve Block ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 1012fc2651226baac27029e38c9d6ef883fa32084dbSteve Block switch (sequence[0]) { 1022fc2651226baac27029e38c9d6ef883fa32084dbSteve Block case 0xE0: 1032fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 10481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1052fc2651226baac27029e38c9d6ef883fa32084dbSteve Block break; 1062fc2651226baac27029e38c9d6ef883fa32084dbSteve Block case 0xED: 1072fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0x80 || sequence[1] > 0x9F) 10881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1092fc2651226baac27029e38c9d6ef883fa32084dbSteve Block break; 1102fc2651226baac27029e38c9d6ef883fa32084dbSteve Block default: 1112fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0x80 || sequence[1] > 0xBF) 11281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1132fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 1142fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[2] < 0x80 || sequence[2] > 0xBF) 11581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1162fc2651226baac27029e38c9d6ef883fa32084dbSteve Block return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; 1172fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 1182fc2651226baac27029e38c9d6ef883fa32084dbSteve Block ASSERT(length == 4); 1192fc2651226baac27029e38c9d6ef883fa32084dbSteve Block ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); 1202fc2651226baac27029e38c9d6ef883fa32084dbSteve Block switch (sequence[0]) { 1212fc2651226baac27029e38c9d6ef883fa32084dbSteve Block case 0xF0: 1222fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0x90 || sequence[1] > 0xBF) 12381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1242fc2651226baac27029e38c9d6ef883fa32084dbSteve Block break; 1252fc2651226baac27029e38c9d6ef883fa32084dbSteve Block case 0xF4: 1262fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0x80 || sequence[1] > 0x8F) 12781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1282fc2651226baac27029e38c9d6ef883fa32084dbSteve Block break; 1292fc2651226baac27029e38c9d6ef883fa32084dbSteve Block default: 1302fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[1] < 0x80 || sequence[1] > 0xBF) 13181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1322fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 1332fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[2] < 0x80 || sequence[2] > 0xBF) 13481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1352fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (sequence[3] < 0x80 || sequence[3] > 0xBF) 13681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return nonCharacter; 1372fc2651226baac27029e38c9d6ef883fa32084dbSteve Block return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; 1382fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 1392fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 14081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochstatic inline UChar* appendCharacter(UChar* destination, int character) 14181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{ 14281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch ASSERT(character != nonCharacter); 14381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch ASSERT(!U_IS_SURROGATE(character)); 14481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (U_IS_BMP(character)) 14581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = character; 14681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch else { 14781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = U16_LEAD(character); 14881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = U16_TRAIL(character); 14981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 15081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return destination; 15181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch} 15281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 15381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochvoid TextCodecUTF8::consumePartialSequenceByte() 15481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{ 15581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch --m_partialSequenceSize; 15681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); 15781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch} 15881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 15981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochvoid TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) 16081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{ 16181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch sawError = true; 16281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (stopOnError) 16381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return; 16481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // Each error generates a replacement character and consumes one byte. 16581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = replacementCharacter; 16681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch consumePartialSequenceByte(); 16781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch} 16881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 16981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdochvoid TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) 17081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch{ 17181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch ASSERT(m_partialSequenceSize); 17281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch do { 17381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (isASCII(m_partialSequence[0])) { 17481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = m_partialSequence[0]; 17581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch consumePartialSequenceByte(); 17681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 17781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 17881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch int count = nonASCIISequenceLength(m_partialSequence[0]); 17981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (!count) { 18081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch handleError(destination, stopOnError, sawError); 18181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (stopOnError) 18281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return; 18381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 18481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 18581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (count > m_partialSequenceSize) { 18681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (count - m_partialSequenceSize > end - source) { 18781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (!flush) { 18881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // The new data is not enough to complete the sequence, so 18981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // add it to the existing partial sequence. 19081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 19181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch m_partialSequenceSize += end - source; 19281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return; 19381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 19481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // An incomplete partial sequence at the end is an error. 19581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch handleError(destination, stopOnError, sawError); 19681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (stopOnError) 19781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return; 19881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 19981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 20081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 20181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch source += count - m_partialSequenceSize; 20281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch m_partialSequenceSize = count; 20381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 20481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch int character = decodeNonASCIISequence(m_partialSequence, count); 20581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (character == nonCharacter) { 20681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch handleError(destination, stopOnError, sawError); 20781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (stopOnError) 20881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch return; 20981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 21081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 21181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch m_partialSequenceSize -= count; 21281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch destination = appendCharacter(destination, character); 21381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } while (m_partialSequenceSize); 21481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch} 21581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch 2162fc2651226baac27029e38c9d6ef883fa32084dbSteve BlockString TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 2172fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 21881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // Each input byte might turn into a character. 21981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // That includes all bytes in the partial-sequence buffer because 22081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // each byte in an invalid sequence will turn into a replacement character. 22181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch StringBuffer buffer(m_partialSequenceSize + length); 2222fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 2232fc2651226baac27029e38c9d6ef883fa32084dbSteve Block const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); 2242fc2651226baac27029e38c9d6ef883fa32084dbSteve Block const uint8_t* end = source + length; 2252fc2651226baac27029e38c9d6ef883fa32084dbSteve Block const uint8_t* alignedEnd = alignToMachineWord(end); 2262fc2651226baac27029e38c9d6ef883fa32084dbSteve Block UChar* destination = buffer.characters(); 2272fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 22881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch do { 22981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (m_partialSequenceSize) { 23081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // Explicitly copy destination and source pointers to avoid taking pointers to the 23181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // local variables, which may harm code generation by disabling some optimizations 23281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // in some compilers. 23381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch UChar* destinationForHandlePartialSequence = destination; 23481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch const uint8_t* sourceForHandlePartialSequence = source; 23581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError); 23681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch destination = destinationForHandlePartialSequence; 23781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch source = sourceForHandlePartialSequence; 23881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (m_partialSequenceSize) 23981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch break; 2402fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 2412fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 24281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch while (source < end) { 24381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (isASCII(*source)) { 24481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // Fast path for ASCII. Most UTF-8 text will be ASCII. 24581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (isAlignedToMachineWord(source)) { 24681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch while (source < alignedEnd) { 24781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 24881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (!isAllASCII(chunk)) 2492fc2651226baac27029e38c9d6ef883fa32084dbSteve Block break; 25081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch copyASCIIMachineWord(destination, source); 25181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch source += sizeof(MachineWord); 25281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch destination += sizeof(MachineWord); 2532fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 25481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (source == end) 25581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch break; 25681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (!isASCII(*source)) 25781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 2582fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 25981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = *source++; 26081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 2612fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 26281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch int count = nonASCIISequenceLength(*source); 26381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch int character; 26481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (!count) 26581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch character = nonCharacter; 26681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch else { 26781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (count > end - source) { 26881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 26981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch ASSERT(!m_partialSequenceSize); 27081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch m_partialSequenceSize = end - source; 27181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch memcpy(m_partialSequence, source, m_partialSequenceSize); 27281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch source = end; 2732fc2651226baac27029e38c9d6ef883fa32084dbSteve Block break; 2742fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 27581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch character = decodeNonASCIISequence(source, count); 2762fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 27781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (character == nonCharacter) { 27881bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch sawError = true; 27981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch if (stopOnError) 28081bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch break; 28181bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch // Each error generates a replacement character and consumes one byte. 28281bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch *destination++ = replacementCharacter; 28381bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch ++source; 28481bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch continue; 28581bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } 28681bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch source += count; 28781bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch destination = appendCharacter(destination, character); 2882fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 28981bc750723a18f21cd17d1b173cd2a4dda9cea6eBen Murdoch } while (flush && m_partialSequenceSize); 2902fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 2912fc2651226baac27029e38c9d6ef883fa32084dbSteve Block buffer.shrink(destination - buffer.characters()); 2922fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 2932fc2651226baac27029e38c9d6ef883fa32084dbSteve Block return String::adopt(buffer); 2942fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 2952fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 2962fc2651226baac27029e38c9d6ef883fa32084dbSteve BlockCString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling) 2972fc2651226baac27029e38c9d6ef883fa32084dbSteve Block{ 2982fc2651226baac27029e38c9d6ef883fa32084dbSteve Block // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. 2992fc2651226baac27029e38c9d6ef883fa32084dbSteve Block // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x). 3002fc2651226baac27029e38c9d6ef883fa32084dbSteve Block // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x). 3012fc2651226baac27029e38c9d6ef883fa32084dbSteve Block if (length > numeric_limits<size_t>::max() / 3) 3022fc2651226baac27029e38c9d6ef883fa32084dbSteve Block CRASH(); 3032fc2651226baac27029e38c9d6ef883fa32084dbSteve Block Vector<uint8_t> bytes(length * 3); 3042fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 3052fc2651226baac27029e38c9d6ef883fa32084dbSteve Block size_t i = 0; 3062fc2651226baac27029e38c9d6ef883fa32084dbSteve Block size_t bytesWritten = 0; 3072fc2651226baac27029e38c9d6ef883fa32084dbSteve Block while (i < length) { 3082fc2651226baac27029e38c9d6ef883fa32084dbSteve Block UChar32 character; 3092fc2651226baac27029e38c9d6ef883fa32084dbSteve Block U16_NEXT(characters, i, length, character); 3102fc2651226baac27029e38c9d6ef883fa32084dbSteve Block U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); 3112fc2651226baac27029e38c9d6ef883fa32084dbSteve Block } 3122fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 3132fc2651226baac27029e38c9d6ef883fa32084dbSteve Block return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); 3142fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} 3152fc2651226baac27029e38c9d6ef883fa32084dbSteve Block 3162fc2651226baac27029e38c9d6ef883fa32084dbSteve Block} // namespace WebCore 317