106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen/* 206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * Copyright (C) 2008 Apple Inc. All Rights Reserved. 306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * Copyright (C) 2010 Google, Inc. All Rights Reserved. 406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * 506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * Redistribution and use in source and binary forms, with or without 606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * modification, are permitted provided that the following conditions 706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * are met: 806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * 1. Redistributions of source code must retain the above copyright 906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * notice, this list of conditions and the following disclaimer. 1006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * 2. Redistributions in binary form must reproduce the above copyright 1106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * notice, this list of conditions and the following disclaimer in the 1206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * documentation and/or other materials provided with the distribution. 1306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * 1406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 1506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 1706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 1806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 2006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 2106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 2206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 2306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 2406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 2506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen */ 2606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 2706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen#ifndef HTMLTokenizer_h 2806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen#define HTMLTokenizer_h 2906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 3006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen#include "SegmentedString.h" 3106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen#include <wtf/Noncopyable.h> 32e8b154fd68f9b33be40a3590e58347f353835f5cSteve Block#include <wtf/PassOwnPtr.h> 3306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen#include <wtf/Vector.h> 34f486d19d62f1bc33246748b14b14a9dfa617b57fIain Merrick#include <wtf/text/AtomicString.h> 3506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 3606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsennamespace WebCore { 3706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 38dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdochclass Element; 3968513a70bcd92384395513322f1b801e7bf9c729Steve Blockclass Frame; 4006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsenclass HTMLToken; 4106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 42ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdochclass HTMLTokenizer { 43ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdoch WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED; 4406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsenpublic: 4506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen enum State { 4606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DataState, 4706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CharacterReferenceInDataState, 4806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RCDATAState, 4906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CharacterReferenceInRCDATAState, 5006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RAWTEXTState, 5106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataState, 5206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen PLAINTEXTState, 5306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen TagOpenState, 5406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen EndTagOpenState, 5506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen TagNameState, 5606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RCDATALessThanSignState, 5706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RCDATAEndTagOpenState, 5806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RCDATAEndTagNameState, 5906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RAWTEXTLessThanSignState, 6006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RAWTEXTEndTagOpenState, 6106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen RAWTEXTEndTagNameState, 6206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataLessThanSignState, 6306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEndTagOpenState, 6406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEndTagNameState, 6506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapeStartState, 6606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapeStartDashState, 6706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapedState, 6806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapedDashState, 6906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapedDashDashState, 7006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapedLessThanSignState, 7106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapedEndTagOpenState, 7206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataEscapedEndTagNameState, 7306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataDoubleEscapeStartState, 7406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataDoubleEscapedState, 7506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataDoubleEscapedDashState, 7606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataDoubleEscapedDashDashState, 7706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataDoubleEscapedLessThanSignState, 7806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ScriptDataDoubleEscapeEndState, 7906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BeforeAttributeNameState, 8006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AttributeNameState, 8106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterAttributeNameState, 8206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BeforeAttributeValueState, 8306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AttributeValueDoubleQuotedState, 8406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AttributeValueSingleQuotedState, 8506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AttributeValueUnquotedState, 8606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CharacterReferenceInAttributeValueState, 8706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterAttributeValueQuotedState, 8806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen SelfClosingStartTagState, 8906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BogusCommentState, 90ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block // The ContinueBogusCommentState is not in the HTML5 spec, but we use 91ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block // it internally to keep track of whether we've started the bogus 92ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block // comment token yet. 93ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block ContinueBogusCommentState, 9406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen MarkupDeclarationOpenState, 9506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CommentStartState, 9606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CommentStartDashState, 9706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CommentState, 9806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CommentEndDashState, 9906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CommentEndState, 10006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CommentEndBangState, 10106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DOCTYPEState, 10206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BeforeDOCTYPENameState, 10306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DOCTYPENameState, 10406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterDOCTYPENameState, 10506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterDOCTYPEPublicKeywordState, 10606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BeforeDOCTYPEPublicIdentifierState, 10706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DOCTYPEPublicIdentifierDoubleQuotedState, 10806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DOCTYPEPublicIdentifierSingleQuotedState, 10906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterDOCTYPEPublicIdentifierState, 11006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BetweenDOCTYPEPublicAndSystemIdentifiersState, 11106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterDOCTYPESystemKeywordState, 11206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BeforeDOCTYPESystemIdentifierState, 11306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DOCTYPESystemIdentifierDoubleQuotedState, 11406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen DOCTYPESystemIdentifierSingleQuotedState, 11506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen AfterDOCTYPESystemIdentifierState, 11606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen BogusDOCTYPEState, 11706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen CDATASectionState, 1185ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen // These CDATA states are not in the HTML5 spec, but we use them internally. 1195ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen CDATASectionRightSquareBracketState, 1205ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen CDATASectionDoubleRightSquareBracketState, 12106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen }; 12206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 12368513a70bcd92384395513322f1b801e7bf9c729Steve Block static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); } 12406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen ~HTMLTokenizer(); 12506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 12606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen void reset(); 12706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 12868513a70bcd92384395513322f1b801e7bf9c729Steve Block // This function returns true if it emits a token. Otherwise, callers 12906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // must provide the same (in progress) token on the next call (unless 13006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // they call reset() first). 13106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen bool nextToken(SegmentedString&, HTMLToken&); 13206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 13306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen int lineNumber() const { return m_lineNumber; } 13406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior. 13506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 13606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen State state() const { return m_state; } 13706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen void setState(State state) { m_state = state; } 13806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 13968513a70bcd92384395513322f1b801e7bf9c729Steve Block // Updates the tokenizer's state according to the given tag name. This is 14068513a70bcd92384395513322f1b801e7bf9c729Steve Block // an approximation of how the tree builder would update the tokenizer's 14168513a70bcd92384395513322f1b801e7bf9c729Steve Block // state. This method is useful for approximating HTML tokenization. To 14268513a70bcd92384395513322f1b801e7bf9c729Steve Block // get exactly the correct tokenization, you need the real tree builder. 14368513a70bcd92384395513322f1b801e7bf9c729Steve Block // 14468513a70bcd92384395513322f1b801e7bf9c729Steve Block // The main failures in the approximation are as follows: 14568513a70bcd92384395513322f1b801e7bf9c729Steve Block // 14668513a70bcd92384395513322f1b801e7bf9c729Steve Block // * The first set of character tokens emitted for a <pre> element might 14768513a70bcd92384395513322f1b801e7bf9c729Steve Block // contain an extra leading newline. 14868513a70bcd92384395513322f1b801e7bf9c729Steve Block // * The replacement of U+0000 with U+FFFD will not be sensitive to the 14968513a70bcd92384395513322f1b801e7bf9c729Steve Block // tree builder's insertion mode. 15068513a70bcd92384395513322f1b801e7bf9c729Steve Block // * CDATA sections in foreign content will be tokenized as bogus comments 15168513a70bcd92384395513322f1b801e7bf9c729Steve Block // instead of as character tokens. 15268513a70bcd92384395513322f1b801e7bf9c729Steve Block // 15368513a70bcd92384395513322f1b801e7bf9c729Steve Block void updateStateFor(const AtomicString& tagName, Frame*); 15468513a70bcd92384395513322f1b801e7bf9c729Steve Block 15506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // Hack to skip leading newline in <pre>/<listing> for authoring ease. 15606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody 157dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; } 158dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch 159dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; } 160dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; } 161dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch 1625ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen bool shouldAllowCDATA() const { return m_shouldAllowCDATA; } 1635ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; } 1645ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen 165dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch bool shouldSkipNullCharacters() const 166dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch { 167dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch return !m_forceNullCharacterReplacement 168dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch && (m_state == DataState 169dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch || m_state == RCDATAState 170dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch || m_state == RAWTEXTState 171dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch || m_state == PLAINTEXTState); 172dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch } 17306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 17406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsenprivate: 17506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 176ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdoch class InputStreamPreprocessor { 177ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdoch WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor); 17806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen public: 179dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch InputStreamPreprocessor(HTMLTokenizer* tokenizer) 180dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch : m_tokenizer(tokenizer) 181dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch , m_nextInputCharacter('\0') 18206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen , m_skipNextNewLine(false) 18306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen { 18406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen } 18506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 18606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen UChar nextInputCharacter() const { return m_nextInputCharacter; } 18706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 18806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // Returns whether we succeeded in peeking at the next character. 18906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // The only way we can fail to peek is if there are no more 19006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // characters in |source| (after collapsing \r\n, etc). 1910617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber) 19206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen { 193dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch PeekAgain: 19406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_nextInputCharacter = *source; 1950617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen 1960617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen // Every branch in this function is expensive, so we have a 1970617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen // fast-reject branch for characters that don't require special 19868513a70bcd92384395513322f1b801e7bf9c729Steve Block // handling. Please run the parser benchmark whenever you touch 19968513a70bcd92384395513322f1b801e7bf9c729Steve Block // this function. It's very hot. 2000617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen static const UChar specialCharacterMask = '\n' | '\r' | '\0'; 2010617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen if (m_nextInputCharacter & ~specialCharacterMask) { 2020617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen m_skipNextNewLine = false; 2030617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen return true; 2040617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen } 2050617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen 20606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { 20706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_skipNextNewLine = false; 20806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen source.advancePastNewline(lineNumber); 20906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen if (source.isEmpty()) 21006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen return false; 21106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_nextInputCharacter = *source; 21206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen } 21306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen if (m_nextInputCharacter == '\r') { 21406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_nextInputCharacter = '\n'; 21506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_skipNextNewLine = true; 21606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen } else { 21706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_skipNextNewLine = false; 21806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // FIXME: The spec indicates that the surrogate pair range as well as 21906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // a number of specific character values are parse errors and should be replaced 22006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // by the replacement character. We suspect this is a problem with the spec as doing 22106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // that filtering breaks surrogate pair handling and causes us not to match Minefield. 222dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) { 223dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch if (m_tokenizer->shouldSkipNullCharacters()) { 224dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch source.advancePastNonNewline(); 225dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch if (source.isEmpty()) 226dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch return false; 227dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch goto PeekAgain; 228dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch } 22906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen m_nextInputCharacter = 0xFFFD; 230dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch } 23106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen } 23206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen return true; 23306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen } 23406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 23506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // Returns whether there are more characters in |source| after advancing. 23606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen bool advance(SegmentedString& source, int& lineNumber) 23706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen { 23806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen source.advance(lineNumber); 23906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen if (source.isEmpty()) 24006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen return false; 24106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen return peek(source, lineNumber); 24206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen } 24306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 244ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block static const UChar endOfFileMarker; 245ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block 24606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen private: 247ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const 248ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block { 249ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block return source.isClosed() && source.length() == 1; 250ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block } 251ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block 252dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch HTMLTokenizer* m_tokenizer; 253dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch 25406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character 25506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen UChar m_nextInputCharacter; 25606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen bool m_skipNextNewLine; 25706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen }; 25806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 25968513a70bcd92384395513322f1b801e7bf9c729Steve Block HTMLTokenizer(bool usePreHTML5ParserQuirks); 260e8b154fd68f9b33be40a3590e58347f353835f5cSteve Block 261ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline bool processEntity(SegmentedString&); 262ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block 263ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline void parseError(); 264ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline void bufferCharacter(UChar); 265ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline void bufferCodePoint(unsigned); 26606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 267ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline bool emitAndResumeIn(SegmentedString&, State); 268ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline bool emitAndReconsumeIn(SegmentedString&, State); 269ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline bool emitEndOfFile(SegmentedString&); 270ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline bool flushEmitAndResumeIn(SegmentedString&, State); 27106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 272ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block // Return whether we need to emit a character token before dealing with 273ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block // the buffered end tag. 274ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline bool flushBufferedEndTag(SegmentedString&); 27506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen inline bool temporaryBufferIs(const String&); 27606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 27706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // Sometimes we speculatively consume input characters and we don't 27868513a70bcd92384395513322f1b801e7bf9c729Steve Block // know whether they represent end tags or RCDATA, etc. These 27906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // functions help manage these state. 28006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen inline void addToPossibleEndTag(UChar cc); 281ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block inline void saveEndTagNameIfNeeded(); 28206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen inline bool isAppropriateEndTag(); 28306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 284f486d19d62f1bc33246748b14b14a9dfa617b57fIain Merrick inline bool haveBufferedCharacterToken(); 28506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 28606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen State m_state; 28706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 28806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen Vector<UChar, 32> m_appropriateEndTagName; 28906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 29068513a70bcd92384395513322f1b801e7bf9c729Steve Block // m_token is owned by the caller. If nextToken is not on the stack, 29106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // this member might be pointing to unallocated memory. 29206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen HTMLToken* m_token; 29306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen int m_lineNumber; 29406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 29506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen bool m_skipLeadingNewLineForListing; 296dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch bool m_forceNullCharacterReplacement; 2975ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen bool m_shouldAllowCDATA; 29806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 29906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer 30006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen Vector<UChar, 32> m_temporaryBuffer; 30106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 30206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // We occationally want to emit both a character token and an end tag 30368513a70bcd92384395513322f1b801e7bf9c729Steve Block // token (e.g., when lexing script). We buffer the name of the end tag 30406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // token here so we remember it next time we re-enter the tokenizer. 30506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen Vector<UChar, 32> m_bufferedEndTagName; 30606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 30706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character 30806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen UChar m_additionalAllowedCharacter; 30906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 31006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 31106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen InputStreamPreprocessor m_inputStreamPreprocessor; 31268513a70bcd92384395513322f1b801e7bf9c729Steve Block 31368513a70bcd92384395513322f1b801e7bf9c729Steve Block bool m_usePreHTML5ParserQuirks; 31406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen}; 31506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 31606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen} 31706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen 31806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen#endif 319