1/*
2    Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4    Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6    This library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public
8    License as published by the Free Software Foundation; either
9    version 2 of the License, or (at your option) any later version.
10
11    This library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public License
17    along with this library; see the file COPYING.LIB.  If not, write to
18    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19    Boston, MA 02110-1301, USA.
20*/
21
22
23#include "config.h"
24#include "TextResourceDecoder.h"
25
26#include "DOMImplementation.h"
27#include "HTMLNames.h"
28#include "TextCodec.h"
29#include "TextEncoding.h"
30#include "TextEncodingDetector.h"
31#include "TextEncodingRegistry.h"
32#include <wtf/ASCIICType.h>
33#include <wtf/StringExtras.h>
34
35using namespace WTF;
36
37namespace WebCore {
38
39using namespace HTMLNames;
40
41// You might think we should put these find functions elsewhere, perhaps with the
42// similar functions that operate on UChar, but arguably only the decoder has
43// a reason to process strings of char rather than UChar.
44
// Byte-wise substring search over a buffer that need not be NUL-terminated.
// Returns the offset of the first occurrence of the NUL-terminated |target|
// within the first |subjectLength| bytes of |subject|, or -1 if there is none.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (subjectLength < targetLength)
        return -1;

    // Last offset at which the whole target still fits inside the subject.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < targetLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == targetLength)
            return start;
    }
    return -1;
}
63
64static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target)
65{
66    size_t targetLength = strlen(target);
67    if (targetLength > subjectLength)
68        return -1;
69#ifndef NDEBUG
70    for (size_t i = 0; i < targetLength; ++i)
71        ASSERT(isASCIILower(target[i]));
72#endif
73    for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
74        bool match = true;
75        for (size_t j = 0; j < targetLength; ++j) {
76            if (toASCIILower(subject[i + j]) != target[j]) {
77                match = false;
78                break;
79            }
80        }
81        if (match)
82            return i;
83    }
84    return -1;
85}
86
87static TextEncoding findTextEncoding(const char* encodingName, int length)
88{
89    Vector<char, 64> buffer(length + 1);
90    memcpy(buffer.data(), encodingName, length);
91    buffer[length] = '\0';
92    return buffer.data();
93}
94
// Heuristic guesser for the legacy Japanese encodings (ISO-2022-JP, EUC-JP
// and Shift_JIS), working directly on raw bytes.
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Guesses the encoding of the first |length| bytes of |str|.
    // Only ASCII, JIS, EUC or SJIS are ever returned; ASCII means
    // "no evidence for any Japanese encoding was found".
    static enum Type judge(const char* str, int length);

    static const int ESC = 0x1b;

    // Per-byte classification table used by ISkanji() and ISkana():
    // bit 0 (value 1) is set for 0x81-0x9f and 0xe0-0xfc,
    // bit 1 (value 2) is set for 0xa1-0xdf.
    static const unsigned char sjisMap[256];

    // Non-zero if |code| is a byte flagged with bit 0 in sjisMap.
    static int ISkanji(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 1;
    }

    // Non-zero if |code| is a byte flagged with bit 1 in sjisMap.
    static int ISkana(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 2;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_JIS is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_JIS Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

// Scans |size| bytes of |str| and returns the most likely encoding.
//
// Unambiguous evidence (an ISO-2022-JP escape sequence, or a byte pair that is
// only valid in one of Shift_JIS / EUC-JP) ends the scan immediately. Otherwise
// the function accumulates the |sjis| and |euc| scores from ambiguous evidence
// (punctuation, hiragana, katakana) and, if nothing decisive was seen, returns
// whichever scored higher; a tie leaves the result as ASCII.
//
// Every look-ahead (ptr[i + 1], ptr[i + 2]) and look-behind (ptr[i - 1],
// ptr[i - 2]) is guarded so that only bytes inside [0, size) are read.
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
{
    enum Type code;
    int i;
    int bfr = false;            /* Kana Moji */
    int bfk = 0;                /* EUC Kana */
    int sjis = 0;
    int euc = 0;

    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

    code = ASCII;

    i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
            || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                code = JIS;
                goto breakBreak;
            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                    || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                code = JIS;
                goto breakBreak;
            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                bfr = false;
                bfk = 0;
                /* ?? check kudokuten ?? && ?? hiragana ?? */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                        && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                        && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* ?? check hiragana or katakana ?? */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;     /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                         && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;     /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* katakana */
                }
                if (bfr) {
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if (ptr[i] <= 0x7f) {
                        code = SJIS;
                        goto breakBreak;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            euc += 100;
                        }
                        bfk++;
                        i++;
                    } else {
                        /* SJIS only */
                        code = SJIS;
                        goto breakBreak;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    // Only peek at the trail byte if there is one; a lead byte
                    // at the very end of the buffer has no ptr[i + 1].
                    if ((size - i > 1)
                            && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        goto breakBreak;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    // Same end-of-buffer guard as in the SJIS-only case above.
                    if ((size - i > 1)
                            && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        goto breakBreak;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }
    // Nothing decisive was seen: fall back to the accumulated scores.
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
breakBreak:
    return (code);
}
303
304TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
305{
306    if (equalIgnoringCase(mimeType, "text/css"))
307        return CSS;
308    if (equalIgnoringCase(mimeType, "text/html"))
309        return HTML;
310    if (DOMImplementation::isXMLMIMEType(mimeType))
311        return XML;
312    return PlainText;
313}
314
315const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
316{
317    // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
318    // for text/xml. This matches Firefox.
319    if (contentType == XML)
320        return UTF8Encoding();
321    if (!specifiedDefaultEncoding.isValid())
322        return Latin1Encoding();
323    return specifiedDefaultEncoding;
324}
325
// Creates a decoder for a resource with the given MIME type.
// The content type is derived from |mimeType|, the starting encoding comes from
// defaultEncoding(), the encoding source starts out as DefaultEncoding, and all
// "already checked" flags start out false so that the first decode() call
// performs the BOM / charset checks.
// NOTE(review): the m_encoding initializer reads m_contentType; confirm that the
// class declares m_contentType before m_encoding so it is initialized first.
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    : m_contentType(determineContentType(mimeType))
    , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
    , m_source(DefaultEncoding)
    , m_hintEncoding(0)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForHeadCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
    , m_usesEncodingDetector(usesEncodingDetector)
{
}
339
// Intentionally empty: the destructor performs no explicit cleanup of its own.
TextResourceDecoder::~TextResourceDecoder()
{
}
343
344void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
345{
346    // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
347    if (!encoding.isValid())
348        return;
349
350    // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
351    // treat x-user-defined as windows-1252 (bug 18270)
352    if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
353        m_encoding = "windows-1252";
354    else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
355        m_encoding = encoding.closestByteBasedEquivalent();
356    else
357        m_encoding = encoding;
358
359    m_codec.clear();
360    m_source = source;
361}
362
363// Returns the position of the encoding string.
364static int findXMLEncoding(const char* str, int len, int& encodingLength)
365{
366    int pos = find(str, len, "encoding");
367    if (pos == -1)
368        return -1;
369    pos += 8;
370
371    // Skip spaces and stray control characters.
372    while (pos < len && str[pos] <= ' ')
373        ++pos;
374
375    // Skip equals sign.
376    if (pos >= len || str[pos] != '=')
377        return -1;
378    ++pos;
379
380    // Skip spaces and stray control characters.
381    while (pos < len && str[pos] <= ' ')
382        ++pos;
383
384    // Skip quotation mark.
385    if (pos >= len)
386        return - 1;
387    char quoteMark = str[pos];
388    if (quoteMark != '"' && quoteMark != '\'')
389        return -1;
390    ++pos;
391
392    // Find the trailing quotation mark.
393    int end = pos;
394    while (end < len && str[end] != quoteMark)
395        ++end;
396    if (end >= len)
397        return -1;
398
399    encodingLength = end - pos;
400    return pos;
401}
402
// Advances |pos| past any run of tabs and spaces, never going beyond |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| did not end up at |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != '\t' && *pos != ' ')
            break;
    }
    return pos != dataEnd;
}
410
411size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
412{
413    // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
414    // We let it override even a user-chosen encoding.
415    ASSERT(!m_checkedForBOM);
416
417    size_t lengthOfBOM = 0;
418
419    size_t bufferLength = m_buffer.size();
420
421    size_t buf1Len = bufferLength;
422    size_t buf2Len = len;
423    const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
424    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
425    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
426    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
427    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
428    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
429
430    // Check for the BOM.
431    if (c1 == 0xFF && c2 == 0xFE) {
432        if (c3 != 0 || c4 != 0) {
433            setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
434            lengthOfBOM = 2;
435        } else {
436            setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
437            lengthOfBOM = 4;
438        }
439    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
440        setEncoding(UTF8Encoding(), AutoDetectedEncoding);
441        lengthOfBOM = 3;
442    } else if (c1 == 0xFE && c2 == 0xFF) {
443        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
444        lengthOfBOM = 2;
445    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
446        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
447        lengthOfBOM = 4;
448    }
449
450    if (lengthOfBOM || bufferLength + len >= 4)
451        m_checkedForBOM = true;
452
453    return lengthOfBOM;
454}
455
456bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
457{
458    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
459        m_checkedForCSSCharset = true;
460        return true;
461    }
462
463    size_t oldSize = m_buffer.size();
464    m_buffer.grow(oldSize + len);
465    memcpy(m_buffer.data() + oldSize, data, len);
466
467    movedDataToBuffer = true;
468
469    if (m_buffer.size() > 8) { // strlen("@charset") == 8
470        const char* dataStart = m_buffer.data();
471        const char* dataEnd = dataStart + m_buffer.size();
472
473        if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
474            dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
475
476            dataStart += 8;
477            const char* pos = dataStart;
478            if (!skipWhitespace(pos, dataEnd))
479                return false;
480
481            if (*pos == '"' || *pos == '\'') {
482                char quotationMark = *pos;
483                ++pos;
484                dataStart = pos;
485
486                while (pos < dataEnd && *pos != quotationMark)
487                    ++pos;
488                if (pos == dataEnd)
489                    return false;
490
491                int encodingNameLength = pos - dataStart + 1;
492
493                ++pos;
494                if (!skipWhitespace(pos, dataEnd))
495                    return false;
496
497                if (*pos == ';')
498                    setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
499            }
500        }
501        m_checkedForCSSCharset = true;
502        return true;
503    }
504    return false;
505}
506
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry |ptr| points just past the "<!--" that opened the comment. On return
// it points just past the comment terminator, or near |pEnd| (at most two bytes
// short of it) when no terminator was found in the available data.
static inline void skipComment(const char*& ptr, const char* pEnd)
{
    if (ptr == pEnd)
        return;

    const char* cursor = ptr;

    // Allow <!-->; other browsers do.
    if (*cursor == '>') {
        ptr = cursor + 1;
        return;
    }

    for (; cursor + 2 < pEnd; ++cursor) {
        // Both accepted terminators start with "--".
        if (cursor[0] != '-' || cursor[1] != '-')
            continue;

        // This is the real end of comment, "-->".
        if (cursor[2] == '>') {
            cursor += 3;
            break;
        }

        // This is the incorrect end of comment that other browsers allow, "--!>".
        if (cursor + 3 < pEnd && cursor[2] == '!' && cursor[3] == '>') {
            cursor += 4;
            break;
        }
    }
    ptr = cursor;
}
536
// Minimum number of buffered input bytes that checkForHeadCharset() scans for a
// charset declaration before it is allowed to give up because the <head>
// section has ended.
const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over.
538
// Scans the start of an HTML or XML resource for an encoding declaration: an
// XML declaration with an encoding pseudo-attribute, a UTF-16/UTF-32 encoded
// "<?x" / "<?" prefix, or (for non-XML content) a <meta> tag containing
// charset=... .
//
// Unless an encoding has already been chosen from a stronger source, |data| is
// appended to m_buffer and |movedDataToBuffer| is set to true; the scan always
// restarts from the beginning of m_buffer. Returns true when the caller can go
// ahead and decode, and false when more data is needed before deciding.
bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
{
    // An encoding from any source other than the default or the parent frame
    // takes precedence over in-document declarations.
    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
        m_checkedForHeadCharset = true;
        return true;
    }

    // This is not completely efficient, since the function might go
    // through the HTML head several times.

    size_t oldSize = m_buffer.size();
    m_buffer.grow(oldSize + len);
    memcpy(m_buffer.data() + oldSize, data, len);

    movedDataToBuffer = true;

    const char* ptr = m_buffer.data();
    const char* pEnd = ptr + m_buffer.size();

    // Is there enough data available to check for XML declaration?
    if (m_buffer.size() < 8)
        return false;

    // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
    // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
    if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
        const char* xmlDeclarationEnd = ptr;
        while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
            ++xmlDeclarationEnd;
        // The declaration is not complete yet; wait for more data.
        if (xmlDeclarationEnd == pEnd)
            return false;
        // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
        int len;
        int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
        if (pos != -1)
            setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
        // "<?x" stored as little-endian 16-bit units.
        setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
        return true;
    } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
        // "<?x" stored as big-endian 16-bit units.
        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
        return true;
    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
        // "<?" stored as little-endian 32-bit units.
        setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
        return true;
    } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
        // "<?" stored as big-endian 32-bit units.
        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
        return true;
    }

    // we still don't have an encoding, and are in the head
    // the following tags are allowed in <head>:
    // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE

    // We stop scanning when a tag that is not permitted in <head>
    // is seen, rather when </head> is seen, because that more closely
    // matches behavior in other browsers; more details in
    // <http://bugs.webkit.org/show_bug.cgi?id=3590>.

    // Additionally, we ignore things that looks like tags in <title>, <script> and <noscript>; see
    // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
    // and <http://bugs.webkit.org/show_bug.cgi?id=12389>.

    // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>,
    // we don't bail out until we've checked at least bytesToCheckUnconditionally bytes of input.

    AtomicStringImpl* enclosingTagName = 0;
    bool inHeadSection = true; // Becomes false when </head> or any tag not allowed in head is encountered.

    // the HTTP-EQUIV meta has no effect on XHTML
    if (m_contentType == XML)
        return true;

    while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
        if (*ptr == '<') {
            bool end = false; // True when this is a closing tag ("</...").
            ptr++;

            // Handle comments.
            if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
                ptr += 3;
                skipComment(ptr, pEnd);
                if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
                    // Some pages that test bandwidth from within the browser do it by having
                    // huge comments and measuring the time they take to load. Repeatedly scanning
                    // these comments can take a lot of CPU time.
                    m_checkedForHeadCharset = true;
                    return true;
                }
                continue;
            }

            if (*ptr == '/') {
                ++ptr;
                end = true;
            }

            // Grab the tag name, but mostly ignore namespaces.
            // The name is lowercased and truncated to 19 characters; a ':' discards
            // what was collected so far, so only the part after the last colon is kept.
            bool sawNamespace = false;
            char tagBuffer[20];
            int len = 0;
            while (len < 19) {
                if (ptr == pEnd)
                    return false;
                char c = *ptr;
                if (c == ':') {
                    len = 0;
                    sawNamespace = true;
                    ptr++;
                    continue;
                }
                if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
                    ;
                else if (c >= 'A' && c <= 'Z')
                    c += 'a' - 'A';
                else
                    break;
                tagBuffer[len++] = c;
                ptr++;
            }
            tagBuffer[len] = 0;
            AtomicString tag(tagBuffer);

            // Track whether we are inside <title>, <script> or <noscript>; only the
            // matching closing tag leaves that state.
            if (enclosingTagName) {
                if (end && tag.impl() == enclosingTagName)
                    enclosingTagName = 0;
            } else {
                if (tag == titleTag)
                    enclosingTagName = titleTag.localName().impl();
                else if (tag == scriptTag)
                    enclosingTagName = scriptTag.localName().impl();
                else if (tag == noscriptTag)
                    enclosingTagName = noscriptTag.localName().impl();
            }

            // Find where the opening tag ends.
            // A '>' inside a quoted attribute value does not end the tag. Running
            // out of data before the tag is complete means more input is needed.
            const char* tagContentStart = ptr;
            if (!end) {
                while (ptr != pEnd && *ptr != '>') {
                    if (*ptr == '\'' || *ptr == '"') {
                        char quoteMark = *ptr;
                        ++ptr;
                        while (ptr != pEnd && *ptr != quoteMark)
                            ++ptr;
                        if (ptr == pEnd)
                            return false;
                    }
                    ++ptr;
                }
                if (ptr == pEnd)
                    return false;
                ++ptr;
            }

            if (!end && tag == metaTag && !sawNamespace) {
                // Search the tag's attribute text for "charset", followed by optional
                // whitespace and '=', and try whatever value follows as an encoding name.
                const char* str = tagContentStart;
                int length = ptr - tagContentStart;
                int pos = 0;
                while (pos < length) {
                    int charsetPos = findIgnoringCase(str + pos, length - pos, "charset");
                    if (charsetPos == -1)
                        break;
                    pos += charsetPos + 7;
                    // skip whitespace
                    while (pos < length && str[pos] <= ' ')
                        pos++;
                    if (pos == length)
                        break;
                    if (str[pos++] != '=')
                        continue;
                    // Skip whitespace, extra '=' and opening quotation marks before the value.
                    while ((pos < length) &&
                            (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
                        pos++;

                    // end ?
                    if (pos == length)
                        break;
                    int end = pos;
                    while (end < length &&
                           str[end] != ' ' && str[end] != '"' && str[end] != '\'' &&
                           str[end] != ';' && str[end] != '>')
                        end++;
                    setEncoding(findTextEncoding(str + pos, end - pos), EncodingFromMetaTag);
                    // setEncoding() ignores invalid encodings, so m_source only changes
                    // when the name was recognized.
                    if (m_source == EncodingFromMetaTag)
                        return true;

                    if (end >= length || str[end] == '/' || str[end] == '>')
                        break;

                    pos = end + 1;
                }
            } else {
                // Any other tag that is not allowed in <head> (and is not hidden inside
                // <title>/<script>/<noscript>) ends the head section.
                if (!enclosingTagName && tag != scriptTag && tag != noscriptTag && tag != styleTag
                    && tag != linkTag && tag != metaTag && tag != objectTag && tag != titleTag && tag != baseTag
                    && (end || tag != htmlTag) && (end || tag != headTag) && isASCIIAlpha(tagBuffer[0])) {
                    inHeadSection = false;
                }

                if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
                    m_checkedForHeadCharset = true;
                    return true;
                }
            }
        } else
            ++ptr;
    }
    // Ran out of buffered data without reaching a decision; ask for more.
    return false;
}
748
749void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
750{
751    switch (KanjiCode::judge(data, len)) {
752        case KanjiCode::JIS:
753            setEncoding("ISO-2022-JP", AutoDetectedEncoding);
754            break;
755        case KanjiCode::EUC:
756            setEncoding("EUC-JP", AutoDetectedEncoding);
757            break;
758        case KanjiCode::SJIS:
759            setEncoding("Shift_JIS", AutoDetectedEncoding);
760            break;
761        case KanjiCode::ASCII:
762        case KanjiCode::UTF16:
763        case KanjiCode::UTF8:
764            break;
765    }
766}
767
768// We use the encoding detector in two cases:
769//   1. Encoding detector is turned ON and no other encoding source is
770//      available (that is, it's DefaultEncoding).
771//   2. Encoding detector is turned ON and the encoding is set to
772//      the encoding of the parent frame, which is also auto-detected.
773//   Note that condition #2 is NOT satisfied unless parent-child frame
774//   relationship is compliant to the same-origin policy. If they're from
775//   different domains, |m_source| would not be set to EncodingFromParentFrame
776//   in the first place.
777bool TextResourceDecoder::shouldAutoDetect() const
778{
779    // Just checking m_hintEncoding suffices here because it's only set
780    // in setHintEncoding when the source is AutoDetectedEncoding.
781    return m_usesEncodingDetector
782        && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
783}
784
785String TextResourceDecoder::decode(const char* data, size_t len)
786{
787    size_t lengthOfBOM = 0;
788    if (!m_checkedForBOM)
789        lengthOfBOM = checkForBOM(data, len);
790
791    bool movedDataToBuffer = false;
792
793    if (m_contentType == CSS && !m_checkedForCSSCharset)
794        if (!checkForCSSCharset(data, len, movedDataToBuffer))
795            return "";
796
797    if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
798        if (!checkForHeadCharset(data, len, movedDataToBuffer))
799            return "";
800
801    // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
802    if (shouldAutoDetect()) {
803        if (m_encoding.isJapanese())
804            detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
805        else {
806            TextEncoding detectedEncoding;
807            if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
808                setEncoding(detectedEncoding, AutoDetectedEncoding);
809        }
810    }
811
812    ASSERT(m_encoding.isValid());
813
814    if (!m_codec)
815        m_codec.set(newTextCodec(m_encoding).release());
816
817    if (m_buffer.isEmpty())
818        return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
819
820    if (!movedDataToBuffer) {
821        size_t oldSize = m_buffer.size();
822        m_buffer.grow(oldSize + len);
823        memcpy(m_buffer.data() + oldSize, data, len);
824    }
825
826    String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
827    m_buffer.clear();
828    return result;
829}
830
831String TextResourceDecoder::flush()
832{
833   // If we can not identify the encoding even after a document is completely
834   // loaded, we need to detect the encoding if other conditions for
835   // autodetection is satisfied.
836    if (m_buffer.size() && shouldAutoDetect()
837        && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
838         TextEncoding detectedEncoding;
839         if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
840                                m_hintEncoding, &detectedEncoding))
841             setEncoding(detectedEncoding, AutoDetectedEncoding);
842    }
843
844    if (!m_codec)
845        m_codec.set(newTextCodec(m_encoding).release());
846
847    String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
848    m_buffer.clear();
849    m_codec.clear();
850    m_checkedForBOM = false; // Skip BOM again when re-decoding.
851    return result;
852}
853
854}
855