1/*
2    Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4    Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6    This library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public
8    License as published by the Free Software Foundation; either
9    version 2 of the License, or (at your option) any later version.
10
11    This library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public License
17    along with this library; see the file COPYING.LIB.  If not, write to
18    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19    Boston, MA 02110-1301, USA.
20*/
21
22
23#include "config.h"
24#include "core/loader/TextResourceDecoder.h"
25
26#include "HTMLNames.h"
27#include "core/dom/DOMImplementation.h"
28#include "core/html/parser/HTMLMetaCharsetParser.h"
29#include "core/platform/text/TextEncodingDetector.h"
30#include "wtf/StringExtras.h"
31#include "wtf/text/TextCodec.h"
32#include "wtf/text/TextEncoding.h"
33#include "wtf/text/TextEncodingRegistry.h"
34
35using namespace WTF;
36
37namespace WebCore {
38
39using namespace HTMLNames;
40
// Returns true when the two bytes starting at |p| are exactly b0, b1.
// Bytes are compared in order and the comparison stops at the first mismatch.
// No bounds checking is done here: the caller must make sure p[0..1] is readable.
static inline bool bytesEqual(const char* p, char b0, char b1)
{
    if (p[0] != b0)
        return false;
    return p[1] == b1;
}
45
// Returns true when the three bytes starting at |p| are exactly b0, b1, b2.
// Bytes are compared in order and the comparison stops at the first mismatch.
// No bounds checking is done here: the caller must make sure p[0..2] is readable.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2)
{
    const char expected[3] = { b0, b1, b2 };
    for (int k = 0; k < 3; ++k) {
        if (p[k] != expected[k])
            return false;
    }
    return true;
}
50
// Returns true when the five bytes starting at |p| are exactly b0..b4.
// Bytes are compared in order and the comparison stops at the first mismatch.
// No bounds checking is done here: the caller must make sure p[0..4] is readable.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    const char expected[5] = { b0, b1, b2, b3, b4 };
    for (int k = 0; k < 5; ++k) {
        if (p[k] != expected[k])
            return false;
    }
    return true;
}
55
// Returns true when the six bytes starting at |p| are exactly b0..b5.
// Bytes are compared in order and the comparison stops at the first mismatch.
// No bounds checking is done here: the caller must make sure p[0..5] is readable.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    const char expected[6] = { b0, b1, b2, b3, b4, b5 };
    for (int k = 0; k < 6; ++k) {
        if (p[k] != expected[k])
            return false;
    }
    return true;
}
60
// Returns true when the eight bytes starting at |p| are exactly b0..b7.
// Bytes are compared in order and the comparison stops at the first mismatch.
// No bounds checking is done here: the caller must make sure p[0..7] is readable.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    const char expected[8] = { b0, b1, b2, b3, b4, b5, b6, b7 };
    for (int k = 0; k < 8; ++k) {
        if (p[k] != expected[k])
            return false;
    }
    return true;
}
65
// Returns true when the ten bytes starting at |p| are exactly b0..b9.
// Bytes are compared in order and the comparison stops at the first mismatch.
// No bounds checking is done here: the caller must make sure p[0..9] is readable.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    const char expected[10] = { b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 };
    for (int k = 0; k < 10; ++k) {
        if (p[k] != expected[k])
            return false;
    }
    return true;
}
70
71// You might think we should put these find functions elsewhere, perhaps with the
72// similar functions that operate on UChar, but arguably only the decoder has
73// a reason to process strings of char rather than UChar.
74
// Searches the first |subjectLength| bytes of |subject| for the NUL-terminated
// string |target| and returns the offset of the first occurrence, or -1 when
// there is none. |subject| does not need to be NUL-terminated. An empty
// |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (subjectLength < targetLength)
        return -1;

    // Last offset at which the whole target still fits inside the subject.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < targetLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == targetLength)
            return start;
    }
    return -1;
}
93
94static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)
95{
96    Vector<char, 64> buffer(length + 1);
97    memcpy(buffer.data(), encodingName, length);
98    buffer[length] = '\0';
99    return buffer.data();
100}
101
// Helper for guessing which Japanese encoding a byte sequence uses.
// judge() performs the classification; sjisMap is a per-byte lookup table whose
// entries are tested bitwise by ISkanji() (bit value 1) and ISkana() (bit value 2).
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
    static enum Type judge(const char* str, int length);
    static const int ESC = 0x1b;
    static const unsigned char sjisMap[256];

    // Nonzero when |code| is below 0x100 and its table entry has bit value 1 set.
    static int ISkanji(int code)
    {
        return code < 0x100 ? (sjisMap[code & 0xff] & 1) : 0;
    }

    // Nonzero when |code| is below 0x100 and its table entry has bit value 2 set.
    static int ISkana(int code)
    {
        return code < 0x100 ? (sjisMap[code & 0xff] & 2) : 0;
    }
};
121
// Per-byte classification table, indexed by byte value (16 entries per row,
// so row N covers bytes 0xN0-0xNF):
//   1 -> 0x81-0x9f and 0xe0-0xfc (tested by ISkanji())
//   2 -> 0xa1-0xdf               (tested by ISkana())
//   0 -> every other byte, including 0x80, 0xa0 and 0xfd-0xff
const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
140
141/*
142 * EUC-JP is
143 *     [0xa1 - 0xfe][0xa1 - 0xfe]
144 *     0x8e[0xa1 - 0xfe](SS2)
145 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
146 *
147 * Shift_Jis is
148 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
149 *
150 * Shift_Jis Hankaku Kana is
151 *     [0xa1 - 0xdf]
152 */
153
154/*
155 * KanjiCode::judge() is based on judge_jcode() from jvim
156 *     http://hp.vector.co.jp/authors/VA003457/vim/
157 *
158 * Special Thanks to Kenichi Tsuchida
159 */
160
161enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
162{
163    enum Type code;
164    int i;
165    int bfr = false;            /* Kana Moji */
166    int bfk = 0;                /* EUC Kana */
167    int sjis = 0;
168    int euc = 0;
169
170    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
171
172    code = ASCII;
173
174    i = 0;
175    while (i < size) {
176        if (ptr[i] == ESC && (size - i >= 3)) {
177            if (bytesEqual(str + i + 1, '$', 'B')
178                    || bytesEqual(str + i + 1, '(', 'B')
179                    || bytesEqual(str + i + 1, '$', '@')
180                    || bytesEqual(str + i + 1, '(', 'J')) {
181                code = JIS;
182                goto breakBreak;
183            }
184            if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')', 'I')) {
185                code = JIS;
186                i += 3;
187            } else {
188                i++;
189            }
190            bfr = false;
191            bfk = 0;
192        } else {
193            if (ptr[i] < 0x20) {
194                bfr = false;
195                bfk = 0;
196                /* ?? check kudokuten ?? && ?? hiragana ?? */
197                if ((i >= 2) && (ptr[i - 2] == 0x81)
198                        && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
199                    code = SJIS;
200                    sjis += 100;        /* kudokuten */
201                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
202                        && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
203                    code = EUC;
204                    euc += 100;         /* kudokuten */
205                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
206                    sjis += 40;         /* hiragana */
207                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
208                    euc += 40;          /* hiragana */
209                }
210            } else {
211                /* ?? check hiragana or katana ?? */
212                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
213                    sjis++;     /* hiragana */
214                } else if ((size - i > 1) && (ptr[i] == 0x83)
215                         && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
216                    sjis++;     /* katakana */
217                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
218                    euc++;      /* hiragana */
219                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
220                    euc++;      /* katakana */
221                }
222                if (bfr) {
223                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
224                        code = SJIS;
225                        goto breakBreak;
226                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
227                        code = SJIS;
228                        goto breakBreak;
229                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
230                        code = EUC;
231                        goto breakBreak;
232                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
233                        code = EUC;
234                        goto breakBreak;
235                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
236                        code = SJIS;
237                        goto breakBreak;
238                    } else if (ptr[i] <= 0x7f) {
239                        code = SJIS;
240                        goto breakBreak;
241                    } else {
242                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
243                            euc++;      /* sjis hankaku kana kigo */
244                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
245                            ;           /* sjis hankaku kana */
246                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
247                            euc++;
248                        } else if (0x8e == ptr[i]) {
249                            euc++;
250                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
251                            sjis++;
252                        }
253                        bfr = false;
254                        bfk = 0;
255                    }
256                } else if (0x8e == ptr[i]) {
257                    if (size - i <= 1) {
258                        ;
259                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
260                        /* EUC KANA or SJIS KANJI */
261                        if (bfk == 1) {
262                            euc += 100;
263                        }
264                        bfk++;
265                        i++;
266                    } else {
267                        /* SJIS only */
268                        code = SJIS;
269                        goto breakBreak;
270                    }
271                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
272                    /* SJIS only */
273                    code = SJIS;
274                    if ((size - i >= 1)
275                            && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
276                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
277                        goto breakBreak;
278                    }
279                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
280                    /* EUC only */
281                    code = EUC;
282                    if ((size - i >= 1)
283                            && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
284                        goto breakBreak;
285                    }
286                } else if (ptr[i] <= 0x7f) {
287                    ;
288                } else {
289                    bfr = true;
290                    bfk = 0;
291                }
292            }
293            i++;
294        }
295    }
296    if (code == ASCII) {
297        if (sjis > euc) {
298            code = SJIS;
299        } else if (sjis < euc) {
300            code = EUC;
301        }
302    }
303breakBreak:
304    return (code);
305}
306
307TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
308{
309    if (equalIgnoringCase(mimeType, "text/css"))
310        return CSS;
311    if (equalIgnoringCase(mimeType, "text/html"))
312        return HTML;
313    if (DOMImplementation::isXMLMIMEType(mimeType))
314        return XML;
315    return PlainText;
316}
317
318const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const WTF::TextEncoding& specifiedDefaultEncoding)
319{
320    // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
321    // for text/xml. This matches Firefox.
322    if (contentType == XML)
323        return UTF8Encoding();
324    if (!specifiedDefaultEncoding.isValid())
325        return Latin1Encoding();
326    return specifiedDefaultEncoding;
327}
328
// Creates a decoder for a resource of the given MIME type.
// The content type is derived from |mimeType| and the starting encoding from
// defaultEncoding(); the encoding source starts out as DefaultEncoding and all
// "checked for ..." flags start out false, so every check is still pending.
// NOTE(review): m_encoding's initializer reads m_contentType, so m_contentType
// must be declared before m_encoding in the class - confirm against the header.
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    : m_contentType(determineContentType(mimeType))
    , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
    , m_source(DefaultEncoding)
    , m_hintEncoding(0)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForXMLCharset(false)
    , m_checkedForMetaCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
    , m_usesEncodingDetector(usesEncodingDetector)
{
}
343
// Empty body: no explicit teardown is performed here.
TextResourceDecoder::~TextResourceDecoder()
{
}
347
348void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
349{
350    // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
351    if (!encoding.isValid())
352        return;
353
354    // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
355    // treat x-user-defined as windows-1252 (bug 18270)
356    if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
357        m_encoding = "windows-1252";
358    else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
359        m_encoding = encoding.closestByteBasedEquivalent();
360    else
361        m_encoding = encoding;
362
363    m_codec.clear();
364    m_source = source;
365}
366
367// Returns the position of the encoding string.
368static int findXMLEncoding(const char* str, int len, int& encodingLength)
369{
370    int pos = find(str, len, "encoding");
371    if (pos == -1)
372        return -1;
373    pos += 8;
374
375    // Skip spaces and stray control characters.
376    while (pos < len && str[pos] <= ' ')
377        ++pos;
378
379    // Skip equals sign.
380    if (pos >= len || str[pos] != '=')
381        return -1;
382    ++pos;
383
384    // Skip spaces and stray control characters.
385    while (pos < len && str[pos] <= ' ')
386        ++pos;
387
388    // Skip quotation mark.
389    if (pos >= len)
390        return - 1;
391    char quoteMark = str[pos];
392    if (quoteMark != '"' && quoteMark != '\'')
393        return -1;
394    ++pos;
395
396    // Find the trailing quotation mark.
397    int end = pos;
398    while (end < len && str[end] != quoteMark)
399        ++end;
400    if (end >= len)
401        return -1;
402
403    encodingLength = end - pos;
404    return pos;
405}
406
// Advances |pos| over a run of tabs and spaces, never moving past |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| did not end up equal to |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != '\t' && *pos != ' ')
            break;
    }
    return !(pos == dataEnd);
}
414
415size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
416{
417    // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
418    // We let it override even a user-chosen encoding.
419    ASSERT(!m_checkedForBOM);
420
421    size_t lengthOfBOM = 0;
422
423    size_t bufferLength = m_buffer.size();
424
425    size_t buf1Len = bufferLength;
426    size_t buf2Len = len;
427    const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
428    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
429    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
430    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
431    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
432    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
433
434    // Check for the BOM.
435    if (c1 == 0xFF && c2 == 0xFE) {
436        if (c3 != 0 || c4 != 0) {
437            setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
438            lengthOfBOM = 2;
439        } else {
440            setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
441            lengthOfBOM = 4;
442        }
443    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
444        setEncoding(UTF8Encoding(), AutoDetectedEncoding);
445        lengthOfBOM = 3;
446    } else if (c1 == 0xFE && c2 == 0xFF) {
447        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
448        lengthOfBOM = 2;
449    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
450        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
451        lengthOfBOM = 4;
452    }
453
454    if (lengthOfBOM || bufferLength + len >= 4)
455        m_checkedForBOM = true;
456
457    return lengthOfBOM;
458}
459
460bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
461{
462    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
463        m_checkedForCSSCharset = true;
464        return true;
465    }
466
467    size_t oldSize = m_buffer.size();
468    m_buffer.grow(oldSize + len);
469    memcpy(m_buffer.data() + oldSize, data, len);
470
471    movedDataToBuffer = true;
472
473    if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13
474        return false;
475
476    const char* dataStart = m_buffer.data();
477    const char* dataEnd = dataStart + m_buffer.size();
478
479    if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {
480        dataStart += 10;
481        const char* pos = dataStart;
482
483        while (pos < dataEnd && *pos != '"')
484            ++pos;
485        if (pos == dataEnd)
486            return false;
487
488        int encodingNameLength = pos - dataStart;
489
490        ++pos;
491
492        if (*pos == ';')
493            setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
494    }
495
496    m_checkedForCSSCharset = true;
497    return true;
498}
499
500bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)
501{
502    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
503        m_checkedForXMLCharset = true;
504        return true;
505    }
506
507    // This is not completely efficient, since the function might go
508    // through the HTML head several times.
509
510    size_t oldSize = m_buffer.size();
511    m_buffer.grow(oldSize + len);
512    memcpy(m_buffer.data() + oldSize, data, len);
513
514    movedDataToBuffer = true;
515
516    const char* ptr = m_buffer.data();
517    const char* pEnd = ptr + m_buffer.size();
518
519    // Is there enough data available to check for XML declaration?
520    if (m_buffer.size() < 8)
521        return false;
522
523    // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
524    // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
525    if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {
526        const char* xmlDeclarationEnd = ptr;
527        while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
528            ++xmlDeclarationEnd;
529        if (xmlDeclarationEnd == pEnd)
530            return false;
531        // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
532        int len = 0;
533        int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
534        if (pos != -1)
535            setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
536        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
537    } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))
538        setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
539    else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))
540        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
541    else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))
542        setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
543    else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))
544        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
545
546    m_checkedForXMLCharset = true;
547    return true;
548}
549
550void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
551{
552    if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) {
553        m_checkedForMetaCharset = true;
554        return;
555    }
556
557    if (!m_charsetParser)
558        m_charsetParser = HTMLMetaCharsetParser::create();
559
560    if (!m_charsetParser->checkForMetaCharset(data, length))
561        return;
562
563    setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
564    m_charsetParser.clear();
565    m_checkedForMetaCharset = true;
566    return;
567}
568
569void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
570{
571    switch (KanjiCode::judge(data, len)) {
572        case KanjiCode::JIS:
573            setEncoding("ISO-2022-JP", EncodingFromContentSniffing);
574            break;
575        case KanjiCode::EUC:
576            setEncoding("EUC-JP", EncodingFromContentSniffing);
577            break;
578        case KanjiCode::SJIS:
579            setEncoding("Shift_JIS", EncodingFromContentSniffing);
580            break;
581        case KanjiCode::ASCII:
582        case KanjiCode::UTF16:
583        case KanjiCode::UTF8:
584            break;
585    }
586}
587
588// We use the encoding detector in two cases:
589//   1. Encoding detector is turned ON and no other encoding source is
590//      available (that is, it's DefaultEncoding).
591//   2. Encoding detector is turned ON and the encoding is set to
592//      the encoding of the parent frame, which is also auto-detected.
593//   Note that condition #2 is NOT satisfied unless parent-child frame
594//   relationship is compliant to the same-origin policy. If they're from
595//   different domains, |m_source| would not be set to EncodingFromParentFrame
596//   in the first place.
597bool TextResourceDecoder::shouldAutoDetect() const
598{
599    // Just checking m_hintEncoding suffices here because it's only set
600    // in setHintEncoding when the source is AutoDetectedEncoding.
601    return m_usesEncodingDetector
602        && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
603}
604
605String TextResourceDecoder::decode(const char* data, size_t len)
606{
607    size_t lengthOfBOM = 0;
608    if (!m_checkedForBOM)
609        lengthOfBOM = checkForBOM(data, len);
610
611    bool movedDataToBuffer = false;
612
613    if (m_contentType == CSS && !m_checkedForCSSCharset)
614        if (!checkForCSSCharset(data, len, movedDataToBuffer))
615            return emptyString();
616
617    if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLCharset)
618        if (!checkForXMLCharset(data, len, movedDataToBuffer))
619            return emptyString();
620
621    // FIXME: It would be more efficient to move this logic below checkForMetaCharset because
622    //        checkForMetaCharset can overrule these detections.
623    if (shouldAutoDetect()) {
624        if (m_encoding.isJapanese())
625            detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
626        else {
627            WTF::TextEncoding detectedEncoding;
628            if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
629                setEncoding(detectedEncoding, EncodingFromContentSniffing);
630        }
631    }
632
633    ASSERT(m_encoding.isValid());
634
635    const char* dataForDecode = data + lengthOfBOM;
636    size_t lengthForDecode = len - lengthOfBOM;
637
638    if (!m_buffer.isEmpty()) {
639        if (!movedDataToBuffer) {
640            size_t oldSize = m_buffer.size();
641            m_buffer.grow(oldSize + len);
642            memcpy(m_buffer.data() + oldSize, data, len);
643        }
644
645        dataForDecode = m_buffer.data() + lengthOfBOM;
646        lengthForDecode = m_buffer.size() - lengthOfBOM;
647    }
648
649    if (m_contentType == HTML && !m_checkedForMetaCharset)
650        checkForMetaCharset(dataForDecode, lengthForDecode);
651
652    if (!m_codec)
653        m_codec = newTextCodec(m_encoding);
654
655    String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
656
657    m_buffer.clear();
658    return result;
659}
660
661String TextResourceDecoder::flush()
662{
663   // If we can not identify the encoding even after a document is completely
664   // loaded, we need to detect the encoding if other conditions for
665   // autodetection is satisfied.
666    if (m_buffer.size() && shouldAutoDetect()
667        && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
668        WTF::TextEncoding detectedEncoding;
669        if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
670            setEncoding(detectedEncoding, EncodingFromContentSniffing);
671    }
672
673    if (!m_codec)
674        m_codec = newTextCodec(m_encoding);
675
676    String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
677    m_buffer.clear();
678    m_codec.clear();
679    m_checkedForBOM = false; // Skip BOM again when re-decoding.
680    return result;
681}
682
683}
684