1/* 2 * Copyright (C) 2010 Google Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#include "config.h" 27#include "core/html/parser/HTMLMetaCharsetParser.h" 28 29#include "core/HTMLNames.h" 30#include "core/html/parser/HTMLParserIdioms.h" 31#include "core/html/parser/HTMLParserOptions.h" 32#include "core/html/parser/HTMLTokenizer.h" 33#include "wtf/text/TextEncodingRegistry.h" 34#include "wtf/text/WTFString.h" 35 36using namespace WTF; 37 38namespace blink { 39 40using namespace HTMLNames; 41 42HTMLMetaCharsetParser::HTMLMetaCharsetParser() 43 : m_tokenizer(HTMLTokenizer::create(HTMLParserOptions(0))) 44 , m_assumedCodec(newTextCodec(Latin1Encoding())) 45 , m_inHeadSection(true) 46 , m_doneChecking(false) 47{ 48} 49 50HTMLMetaCharsetParser::~HTMLMetaCharsetParser() 51{ 52} 53 54bool HTMLMetaCharsetParser::processMeta() 55{ 56 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); 57 HTMLAttributeList attributes; 58 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin(); iter != tokenAttributes.end(); ++iter) { 59 String attributeName = attemptStaticStringCreation(iter->name, Likely8Bit); 60 String attributeValue = StringImpl::create8BitIfPossible(iter->value); 61 attributes.append(std::make_pair(attributeName, attributeValue)); 62 } 63 64 m_encoding = encodingFromMetaAttributes(attributes); 65 return m_encoding.isValid(); 66} 67 68static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. 69 70bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) 71{ 72 if (m_doneChecking) 73 return true; 74 75 ASSERT(!m_encoding.isValid()); 76 77 // We still don't have an encoding, and are in the head. 78 // The following tags are allowed in <head>: 79 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE 80 81 // We stop scanning when a tag that is not permitted in <head> 82 // is seen, rather when </head> is seen, because that more closely 83 // matches behavior in other browsers; more details in 84 // <http://bugs.webkit.org/show_bug.cgi?id=3590>. 85 86 // Additionally, we ignore things that looks like tags in <title>, <script> 87 // and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>, 88 // <http://bugs.webkit.org/show_bug.cgi?id=12165> and 89 // <http://bugs.webkit.org/show_bug.cgi?id=12389>. 90 91 // Since many sites have charset declarations after <body> or other tags 92 // that are disallowed in <head>, we don't bail out until we've checked at 93 // least bytesToCheckUnconditionally bytes of input. 94 95 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); 96 97 while (m_tokenizer->nextToken(m_input, m_token)) { 98 bool end = m_token.type() == HTMLToken::EndTag; 99 if (end || m_token.type() == HTMLToken::StartTag) { 100 String tagName = attemptStaticStringCreation(m_token.name(), Likely8Bit); 101 if (!end) { 102 m_tokenizer->updateStateFor(tagName); 103 if (threadSafeMatch(tagName, metaTag) && processMeta()) { 104 m_doneChecking = true; 105 return true; 106 } 107 } 108 109 if (!threadSafeMatch(tagName, scriptTag) && !threadSafeMatch(tagName, noscriptTag) 110 && !threadSafeMatch(tagName, styleTag) && !threadSafeMatch(tagName, linkTag) 111 && !threadSafeMatch(tagName, metaTag) && !threadSafeMatch(tagName, objectTag) 112 && !threadSafeMatch(tagName, titleTag) && !threadSafeMatch(tagName, baseTag) 113 && (end || !threadSafeMatch(tagName, htmlTag)) && (end || !threadSafeMatch(tagName, headTag))) { 114 m_inHeadSection = false; 115 } 116 } 117 118 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) { 119 m_doneChecking = true; 120 return true; 121 } 122 123 m_token.clear(); 124 } 125 126 return false; 127} 128 129} 130