1/*
2    Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4    Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6    This library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public
8    License as published by the Free Software Foundation; either
9    version 2 of the License, or (at your option) any later version.
10
11    This library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public License
17    along with this library; see the file COPYING.LIB.  If not, write to
18    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19    Boston, MA 02110-1301, USA.
20*/
21
22
23#include "config.h"
24#include "TextResourceDecoder.h"
25
26#include "DOMImplementation.h"
27#include "HTMLMetaCharsetParser.h"
28#include "HTMLNames.h"
29#include "TextCodec.h"
30#include "TextEncoding.h"
31#include "TextEncodingDetector.h"
32#include "TextEncodingRegistry.h"
33#include <wtf/ASCIICType.h>
34#include <wtf/StringExtras.h>
35
36using namespace WTF;
37
38namespace WebCore {
39
40using namespace HTMLNames;
41
42// You might think we should put these find functions elsewhere, perhaps with the
43// similar functions that operate on UChar, but arguably only the decoder has
44// a reason to process strings of char rather than UChar.
45
// Locates the first occurrence of the NUL-terminated |target| inside the first
// |subjectLength| bytes of |subject|. Returns the zero-based offset of the match,
// or -1 when |target| does not occur (including when it is longer than the subject).
// An empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (subjectLength < targetLength)
        return -1;

    // Last offset at which a full-length match could still begin.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < targetLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == targetLength)
            return start;
    }
    return -1;
}
64
65static TextEncoding findTextEncoding(const char* encodingName, int length)
66{
67    Vector<char, 64> buffer(length + 1);
68    memcpy(buffer.data(), encodingName, length);
69    buffer[length] = '\0';
70    return buffer.data();
71}
72
// Heuristic guesser for the Japanese encoding of a raw byte buffer.
// All members are static; the class is used purely as a namespace.
class KanjiCode {
public:
    // Possible verdicts. judge() itself only ever returns ASCII, JIS, EUC or SJIS.
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Inspects |length| bytes starting at |str| and returns the most likely encoding.
    // Returns ASCII when nothing in the data points at a Japanese encoding.
    static enum Type judge(const char* str, int length);

    static const int ESC = 0x1b;

    // Per-byte classification table: bit 0 (value 1) is set for 0x81-0x9f and
    // 0xe0-0xfc, bit 1 (value 2) is set for 0xa1-0xdf.
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is a byte flagged with bit 0 in sjisMap.
    static int ISkanji(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 1;
    }

    // Non-zero when |code| is a byte flagged with bit 1 in sjisMap.
    static int ISkana(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 2;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_JIS is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_JIS Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

// Walks the buffer once. Some byte patterns decide the result immediately
// (jump to breakBreak); others only add to the |sjis| / |euc| scores, which are
// compared at the end if no decisive pattern was seen.
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
{
    enum Type code;
    int i;
    int bfr = false;            /* previous byte was an undecided high byte */
    int bfk = 0;                /* count of consecutive 0x8e + kana pairs */
    int sjis = 0;               /* evidence score for Shift_JIS */
    int euc = 0;                /* evidence score for EUC-JP */

    // Work on unsigned bytes so that comparisons against 0x80+ values behave.
    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

    code = ASCII;

    i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            // Three-byte escape sequences as used by ISO-2022-JP.
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
            || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                code = JIS;
                goto breakBreak;
            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                    || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                code = JIS;
                goto breakBreak;
            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                // Not decisive on its own: remember JIS but keep scanning.
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                // Control character: look back at the two bytes that preceded it.
                bfr = false;
                bfk = 0;
                /* ?? check kudokuten ?? && ?? hiragana ?? */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                        && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                        && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* ?? check hiragana or katakana ?? */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;     /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                         && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;     /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* katakana */
                }
                if (bfr) {
                    // The previous byte was undecided; judge it together with this one.
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if (ptr[i] <= 0x7f) {
                        code = SJIS;
                        goto breakBreak;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            // Second such pair in a row counts as strong EUC evidence.
                            euc += 100;
                        }
                        bfk++;
                        i++;
                    } else {
                        /* SJIS only */
                        code = SJIS;
                        goto breakBreak;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    // Only look at the next byte when there is one; reading
                    // ptr[i + 1] on the final byte would run past the buffer.
                    if ((size - i > 1)
                            && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        goto breakBreak;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    // Same bounds rule as above: never peek past the last byte.
                    if ((size - i > 1)
                            && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        goto breakBreak;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    // Remaining high bytes are ambiguous; decide on the next byte.
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }
    // Nothing decisive was found: fall back to whichever score is higher.
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
breakBreak:
    return (code);
}
281
282TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
283{
284    if (equalIgnoringCase(mimeType, "text/css"))
285        return CSS;
286    if (equalIgnoringCase(mimeType, "text/html"))
287        return HTML;
288    if (DOMImplementation::isXMLMIMEType(mimeType))
289        return XML;
290    return PlainText;
291}
292
293const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
294{
295    // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
296    // for text/xml. This matches Firefox.
297    if (contentType == XML)
298        return UTF8Encoding();
299    if (!specifiedDefaultEncoding.isValid())
300        return Latin1Encoding();
301    return specifiedDefaultEncoding;
302}
303
// Sets up a decoder for a resource with the given MIME type.
// The content type is derived from |mimeType|, and the starting encoding is then
// chosen by defaultEncoding() from that content type and |specifiedDefaultEncoding|.
// All "checked" flags start out false and the encoding source starts as DefaultEncoding.
// NOTE(review): m_encoding's initializer reads m_contentType, so this relies on
// m_contentType being declared before m_encoding in the class — confirm in the header.
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    : m_contentType(determineContentType(mimeType))
    , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
    , m_source(DefaultEncoding)
    , m_hintEncoding(0)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForHeadCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
    , m_usesEncodingDetector(usesEncodingDetector)
{
}
317
// The destructor body is intentionally empty; no explicit cleanup is performed here.
TextResourceDecoder::~TextResourceDecoder()
{
}
321
322void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
323{
324    // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
325    if (!encoding.isValid())
326        return;
327
328    // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
329    // treat x-user-defined as windows-1252 (bug 18270)
330    if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
331        m_encoding = "windows-1252";
332    else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
333        m_encoding = encoding.closestByteBasedEquivalent();
334    else
335        m_encoding = encoding;
336
337    m_codec.clear();
338    m_source = source;
339}
340
341// Returns the position of the encoding string.
342static int findXMLEncoding(const char* str, int len, int& encodingLength)
343{
344    int pos = find(str, len, "encoding");
345    if (pos == -1)
346        return -1;
347    pos += 8;
348
349    // Skip spaces and stray control characters.
350    while (pos < len && str[pos] <= ' ')
351        ++pos;
352
353    // Skip equals sign.
354    if (pos >= len || str[pos] != '=')
355        return -1;
356    ++pos;
357
358    // Skip spaces and stray control characters.
359    while (pos < len && str[pos] <= ' ')
360        ++pos;
361
362    // Skip quotation mark.
363    if (pos >= len)
364        return - 1;
365    char quoteMark = str[pos];
366    if (quoteMark != '"' && quoteMark != '\'')
367        return -1;
368    ++pos;
369
370    // Find the trailing quotation mark.
371    int end = pos;
372    while (end < len && str[end] != quoteMark)
373        ++end;
374    if (end >= len)
375        return -1;
376
377    encodingLength = end - pos;
378    return pos;
379}
380
// Advances |pos| past any run of spaces and tabs, never moving beyond |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| stopped before |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        const char current = *pos;
        if (current != ' ' && current != '\t')
            break;
    }
    return pos != dataEnd;
}
388
389size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
390{
391    // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
392    // We let it override even a user-chosen encoding.
393    ASSERT(!m_checkedForBOM);
394
395    size_t lengthOfBOM = 0;
396
397    size_t bufferLength = m_buffer.size();
398
399    size_t buf1Len = bufferLength;
400    size_t buf2Len = len;
401    const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
402    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
403    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
404    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
405    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
406    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
407
408    // Check for the BOM.
409    if (c1 == 0xFF && c2 == 0xFE) {
410        if (c3 != 0 || c4 != 0) {
411            setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
412            lengthOfBOM = 2;
413        } else {
414            setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
415            lengthOfBOM = 4;
416        }
417    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
418        setEncoding(UTF8Encoding(), AutoDetectedEncoding);
419        lengthOfBOM = 3;
420    } else if (c1 == 0xFE && c2 == 0xFF) {
421        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
422        lengthOfBOM = 2;
423    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
424        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
425        lengthOfBOM = 4;
426    }
427
428    if (lengthOfBOM || bufferLength + len >= 4)
429        m_checkedForBOM = true;
430
431    return lengthOfBOM;
432}
433
434bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
435{
436    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
437        m_checkedForCSSCharset = true;
438        return true;
439    }
440
441    size_t oldSize = m_buffer.size();
442    m_buffer.grow(oldSize + len);
443    memcpy(m_buffer.data() + oldSize, data, len);
444
445    movedDataToBuffer = true;
446
447    if (m_buffer.size() > 8) { // strlen("@charset") == 8
448        const char* dataStart = m_buffer.data();
449        const char* dataEnd = dataStart + m_buffer.size();
450
451        if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
452            dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
453
454            dataStart += 8;
455            const char* pos = dataStart;
456            if (!skipWhitespace(pos, dataEnd))
457                return false;
458
459            if (*pos == '"' || *pos == '\'') {
460                char quotationMark = *pos;
461                ++pos;
462                dataStart = pos;
463
464                while (pos < dataEnd && *pos != quotationMark)
465                    ++pos;
466                if (pos == dataEnd)
467                    return false;
468
469                int encodingNameLength = pos - dataStart;
470
471                ++pos;
472                if (!skipWhitespace(pos, dataEnd))
473                    return false;
474
475                if (*pos == ';')
476                    setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
477            }
478        }
479        m_checkedForCSSCharset = true;
480        return true;
481    }
482    return false;
483}
484
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
// On entry |ptr| points just past the comment opener. On return it points just
// past the comment terminator if one was found; otherwise it is left where the
// scan stopped, close to |pEnd|. An empty range leaves |ptr| untouched.
static inline void skipComment(const char*& ptr, const char* pEnd)
{
    if (ptr == pEnd)
        return;

    const char* cursor = ptr;

    // Allow <!-->; other browsers do.
    if (*cursor == '>') {
        ptr = cursor + 1;
        return;
    }

    for (; cursor + 2 < pEnd; ++cursor) {
        // Both terminators start with two dashes.
        if (cursor[0] != '-' || cursor[1] != '-')
            continue;
        // This is the real end of comment, "-->".
        if (cursor[2] == '>') {
            cursor += 3;
            break;
        }
        // This is the incorrect end of comment that other browsers allow, "--!>".
        if (cursor + 3 < pEnd && cursor[2] == '!' && cursor[3] == '>') {
            cursor += 4;
            break;
        }
    }
    ptr = cursor;
}
514
515bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
516{
517    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
518        m_checkedForHeadCharset = true;
519        return true;
520    }
521
522    // This is not completely efficient, since the function might go
523    // through the HTML head several times.
524
525    size_t oldSize = m_buffer.size();
526    m_buffer.grow(oldSize + len);
527    memcpy(m_buffer.data() + oldSize, data, len);
528
529    movedDataToBuffer = true;
530
531    // Continue with checking for an HTML meta tag if we were already doing so.
532    if (m_charsetParser)
533        return checkForMetaCharset(data, len);
534
535    const char* ptr = m_buffer.data();
536    const char* pEnd = ptr + m_buffer.size();
537
538    // Is there enough data available to check for XML declaration?
539    if (m_buffer.size() < 8)
540        return false;
541
542    // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
543    // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
544    if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
545        const char* xmlDeclarationEnd = ptr;
546        while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
547            ++xmlDeclarationEnd;
548        if (xmlDeclarationEnd == pEnd)
549            return false;
550        // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
551        int len = 0;
552        int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
553        if (pos != -1)
554            setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
555        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
556    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
557        setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
558        return true;
559    } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
560        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
561        return true;
562    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
563        setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
564        return true;
565    } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
566        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
567        return true;
568    }
569
570    // The HTTP-EQUIV meta has no effect on XHTML.
571    if (m_contentType == XML)
572        return true;
573
574    m_charsetParser = HTMLMetaCharsetParser::create();
575    return checkForMetaCharset(data, len);
576}
577
578bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
579{
580    if (!m_charsetParser->checkForMetaCharset(data, length))
581        return false;
582
583    setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
584    m_charsetParser.clear();
585    m_checkedForHeadCharset = true;
586    return true;
587}
588
589void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
590{
591    switch (KanjiCode::judge(data, len)) {
592        case KanjiCode::JIS:
593            setEncoding("ISO-2022-JP", AutoDetectedEncoding);
594            break;
595        case KanjiCode::EUC:
596            setEncoding("EUC-JP", AutoDetectedEncoding);
597            break;
598        case KanjiCode::SJIS:
599            setEncoding("Shift_JIS", AutoDetectedEncoding);
600            break;
601        case KanjiCode::ASCII:
602        case KanjiCode::UTF16:
603        case KanjiCode::UTF8:
604            break;
605    }
606}
607
608// We use the encoding detector in two cases:
609//   1. Encoding detector is turned ON and no other encoding source is
610//      available (that is, it's DefaultEncoding).
611//   2. Encoding detector is turned ON and the encoding is set to
612//      the encoding of the parent frame, which is also auto-detected.
613//   Note that condition #2 is NOT satisfied unless parent-child frame
614//   relationship is compliant to the same-origin policy. If they're from
615//   different domains, |m_source| would not be set to EncodingFromParentFrame
616//   in the first place.
617bool TextResourceDecoder::shouldAutoDetect() const
618{
619    // Just checking m_hintEncoding suffices here because it's only set
620    // in setHintEncoding when the source is AutoDetectedEncoding.
621    return m_usesEncodingDetector
622        && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
623}
624
625String TextResourceDecoder::decode(const char* data, size_t len)
626{
627    size_t lengthOfBOM = 0;
628    if (!m_checkedForBOM)
629        lengthOfBOM = checkForBOM(data, len);
630
631    bool movedDataToBuffer = false;
632
633    if (m_contentType == CSS && !m_checkedForCSSCharset)
634        if (!checkForCSSCharset(data, len, movedDataToBuffer))
635            return "";
636
637    if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
638        if (!checkForHeadCharset(data, len, movedDataToBuffer))
639            return "";
640
641    // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
642    if (shouldAutoDetect()) {
643        if (m_encoding.isJapanese())
644            detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
645        else {
646            TextEncoding detectedEncoding;
647            if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
648                setEncoding(detectedEncoding, AutoDetectedEncoding);
649        }
650    }
651
652    ASSERT(m_encoding.isValid());
653
654    if (!m_codec)
655        m_codec = newTextCodec(m_encoding);
656
657    if (m_buffer.isEmpty())
658        return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
659
660    if (!movedDataToBuffer) {
661        size_t oldSize = m_buffer.size();
662        m_buffer.grow(oldSize + len);
663        memcpy(m_buffer.data() + oldSize, data, len);
664    }
665
666    String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
667    m_buffer.clear();
668    return result;
669}
670
671String TextResourceDecoder::flush()
672{
673   // If we can not identify the encoding even after a document is completely
674   // loaded, we need to detect the encoding if other conditions for
675   // autodetection is satisfied.
676    if (m_buffer.size() && shouldAutoDetect()
677        && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
678         TextEncoding detectedEncoding;
679         if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
680                                m_hintEncoding, &detectedEncoding))
681             setEncoding(detectedEncoding, AutoDetectedEncoding);
682    }
683
684    if (!m_codec)
685        m_codec = newTextCodec(m_encoding);
686
687    String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
688    m_buffer.clear();
689    m_codec.clear();
690    m_checkedForBOM = false; // Skip BOM again when re-decoding.
691    return result;
692}
693
694}
695