/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef HTMLTokenizer_h
#define HTMLTokenizer_h

#include "SegmentedString.h"
#include <wtf/Noncopyable.h>
#include <wtf/PassOwnPtr.h>
#include <wtf/Vector.h>
#include <wtf/text/AtomicString.h>

namespace WebCore {

class Element;
class Frame;
class HTMLToken;

42ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdochclass HTMLTokenizer {
43ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdoch    WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED;
4406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsenpublic:
4506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    enum State {
4606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DataState,
4706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CharacterReferenceInDataState,
4806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RCDATAState,
4906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CharacterReferenceInRCDATAState,
5006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RAWTEXTState,
5106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataState,
5206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        PLAINTEXTState,
5306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        TagOpenState,
5406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        EndTagOpenState,
5506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        TagNameState,
5606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RCDATALessThanSignState,
5706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RCDATAEndTagOpenState,
5806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RCDATAEndTagNameState,
5906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RAWTEXTLessThanSignState,
6006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RAWTEXTEndTagOpenState,
6106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        RAWTEXTEndTagNameState,
6206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataLessThanSignState,
6306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEndTagOpenState,
6406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEndTagNameState,
6506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapeStartState,
6606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapeStartDashState,
6706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapedState,
6806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapedDashState,
6906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapedDashDashState,
7006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapedLessThanSignState,
7106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapedEndTagOpenState,
7206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataEscapedEndTagNameState,
7306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataDoubleEscapeStartState,
7406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataDoubleEscapedState,
7506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataDoubleEscapedDashState,
7606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataDoubleEscapedDashDashState,
7706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataDoubleEscapedLessThanSignState,
7806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        ScriptDataDoubleEscapeEndState,
7906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BeforeAttributeNameState,
8006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AttributeNameState,
8106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterAttributeNameState,
8206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BeforeAttributeValueState,
8306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AttributeValueDoubleQuotedState,
8406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AttributeValueSingleQuotedState,
8506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AttributeValueUnquotedState,
8606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CharacterReferenceInAttributeValueState,
8706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterAttributeValueQuotedState,
8806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        SelfClosingStartTagState,
8906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BogusCommentState,
90ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
91ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        // it internally to keep track of whether we've started the bogus
92ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        // comment token yet.
93ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        ContinueBogusCommentState,
9406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        MarkupDeclarationOpenState,
9506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CommentStartState,
9606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CommentStartDashState,
9706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CommentState,
9806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CommentEndDashState,
9906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CommentEndState,
10006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CommentEndBangState,
10106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DOCTYPEState,
10206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BeforeDOCTYPENameState,
10306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DOCTYPENameState,
10406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterDOCTYPENameState,
10506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterDOCTYPEPublicKeywordState,
10606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BeforeDOCTYPEPublicIdentifierState,
10706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DOCTYPEPublicIdentifierDoubleQuotedState,
10806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DOCTYPEPublicIdentifierSingleQuotedState,
10906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterDOCTYPEPublicIdentifierState,
11006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BetweenDOCTYPEPublicAndSystemIdentifiersState,
11106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterDOCTYPESystemKeywordState,
11206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BeforeDOCTYPESystemIdentifierState,
11306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DOCTYPESystemIdentifierDoubleQuotedState,
11406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        DOCTYPESystemIdentifierSingleQuotedState,
11506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        AfterDOCTYPESystemIdentifierState,
11606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        BogusDOCTYPEState,
11706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        CDATASectionState,
1185ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen        // These CDATA states are not in the HTML5 spec, but we use them internally.
1195ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen        CDATASectionRightSquareBracketState,
1205ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen        CDATASectionDoubleRightSquareBracketState,
12106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    };
12206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
12368513a70bcd92384395513322f1b801e7bf9c729Steve Block    static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
12406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    ~HTMLTokenizer();
12506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
12606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    void reset();
12706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
12868513a70bcd92384395513322f1b801e7bf9c729Steve Block    // This function returns true if it emits a token. Otherwise, callers
12906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // must provide the same (in progress) token on the next call (unless
13006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // they call reset() first).
13106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    bool nextToken(SegmentedString&, HTMLToken&);
13206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
13306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    int lineNumber() const { return m_lineNumber; }
13406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior.
13506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
13606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    State state() const { return m_state; }
13706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    void setState(State state) { m_state = state; }
13806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
13968513a70bcd92384395513322f1b801e7bf9c729Steve Block    // Updates the tokenizer's state according to the given tag name. This is
14068513a70bcd92384395513322f1b801e7bf9c729Steve Block    // an approximation of how the tree builder would update the tokenizer's
14168513a70bcd92384395513322f1b801e7bf9c729Steve Block    // state. This method is useful for approximating HTML tokenization. To
14268513a70bcd92384395513322f1b801e7bf9c729Steve Block    // get exactly the correct tokenization, you need the real tree builder.
14368513a70bcd92384395513322f1b801e7bf9c729Steve Block    //
14468513a70bcd92384395513322f1b801e7bf9c729Steve Block    // The main failures in the approximation are as follows:
14568513a70bcd92384395513322f1b801e7bf9c729Steve Block    //
14668513a70bcd92384395513322f1b801e7bf9c729Steve Block    //  * The first set of character tokens emitted for a <pre> element might
14768513a70bcd92384395513322f1b801e7bf9c729Steve Block    //    contain an extra leading newline.
14868513a70bcd92384395513322f1b801e7bf9c729Steve Block    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
14968513a70bcd92384395513322f1b801e7bf9c729Steve Block    //    tree builder's insertion mode.
15068513a70bcd92384395513322f1b801e7bf9c729Steve Block    //  * CDATA sections in foreign content will be tokenized as bogus comments
15168513a70bcd92384395513322f1b801e7bf9c729Steve Block    //    instead of as character tokens.
15268513a70bcd92384395513322f1b801e7bf9c729Steve Block    //
15368513a70bcd92384395513322f1b801e7bf9c729Steve Block    void updateStateFor(const AtomicString& tagName, Frame*);
15468513a70bcd92384395513322f1b801e7bf9c729Steve Block
15506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // Hack to skip leading newline in <pre>/<listing> for authoring ease.
15606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
157dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
158dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch
159dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
160dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
161dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch
1625ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
1635ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
1645ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen
165dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    bool shouldSkipNullCharacters() const
166dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    {
167dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch        return !m_forceNullCharacterReplacement
168dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch            && (m_state == DataState
169dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                || m_state == RCDATAState
170dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                || m_state == RAWTEXTState
171dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                || m_state == PLAINTEXTState);
172dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    }
17306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
17406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsenprivate:
17506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
176ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdoch    class InputStreamPreprocessor {
177ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddbBen Murdoch        WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
17806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    public:
179dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch        InputStreamPreprocessor(HTMLTokenizer* tokenizer)
180dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch            : m_tokenizer(tokenizer)
181dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch            , m_nextInputCharacter('\0')
18206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            , m_skipNextNewLine(false)
18306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        {
18406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        }
18506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
18606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        UChar nextInputCharacter() const { return m_nextInputCharacter; }
18706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
18806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        // Returns whether we succeeded in peeking at the next character.
18906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        // The only way we can fail to peek is if there are no more
19006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        // characters in |source| (after collapsing \r\n, etc).
1910617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen        ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
19206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        {
193dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch        PeekAgain:
19406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            m_nextInputCharacter = *source;
1950617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen
1960617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen            // Every branch in this function is expensive, so we have a
1970617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen            // fast-reject branch for characters that don't require special
19868513a70bcd92384395513322f1b801e7bf9c729Steve Block            // handling. Please run the parser benchmark whenever you touch
19968513a70bcd92384395513322f1b801e7bf9c729Steve Block            // this function. It's very hot.
2000617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen            static const UChar specialCharacterMask = '\n' | '\r' | '\0';
2010617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen            if (m_nextInputCharacter & ~specialCharacterMask) {
2020617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen                m_skipNextNewLine = false;
2030617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen                return true;
2040617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen            }
2050617145a89917ae7735fe1c9538688ab9a577df5Kristian Monsen
20606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
20706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                m_skipNextNewLine = false;
20806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                source.advancePastNewline(lineNumber);
20906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                if (source.isEmpty())
21006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                    return false;
21106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                m_nextInputCharacter = *source;
21206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            }
21306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            if (m_nextInputCharacter == '\r') {
21406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                m_nextInputCharacter = '\n';
21506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                m_skipNextNewLine = true;
21606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            } else {
21706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                m_skipNextNewLine = false;
21806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                // FIXME: The spec indicates that the surrogate pair range as well as
21906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                // a number of specific character values are parse errors and should be replaced
22006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                // by the replacement character. We suspect this is a problem with the spec as doing
22106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                // that filtering breaks surrogate pair handling and causes us not to match Minefield.
222dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
223dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                    if (m_tokenizer->shouldSkipNullCharacters()) {
224dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                        source.advancePastNonNewline();
225dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                        if (source.isEmpty())
226dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                            return false;
227dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                        goto PeekAgain;
228dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                    }
22906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                    m_nextInputCharacter = 0xFFFD;
230dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch                }
23106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            }
23206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            return true;
23306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        }
23406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
23506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        // Returns whether there are more characters in |source| after advancing.
23606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        bool advance(SegmentedString& source, int& lineNumber)
23706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        {
23806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            source.advance(lineNumber);
23906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            if (source.isEmpty())
24006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen                return false;
24106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen            return peek(source, lineNumber);
24206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        }
24306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
244ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        static const UChar endOfFileMarker;
245ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block
24606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    private:
247ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
248ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        {
249ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block            return source.isClosed() && source.length() == 1;
250ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block        }
251ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block
252dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch        HTMLTokenizer* m_tokenizer;
253dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch
25406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
25506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        UChar m_nextInputCharacter;
25606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen        bool m_skipNextNewLine;
25706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    };
25806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
25968513a70bcd92384395513322f1b801e7bf9c729Steve Block    HTMLTokenizer(bool usePreHTML5ParserQuirks);
260e8b154fd68f9b33be40a3590e58347f353835f5cSteve Block
261ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline bool processEntity(SegmentedString&);
262ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block
263ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline void parseError();
264ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline void bufferCharacter(UChar);
265ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline void bufferCodePoint(unsigned);
26606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
267ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline bool emitAndResumeIn(SegmentedString&, State);
268ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline bool emitAndReconsumeIn(SegmentedString&, State);
269ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline bool emitEndOfFile(SegmentedString&);
270ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline bool flushEmitAndResumeIn(SegmentedString&, State);
27106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
272ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    // Return whether we need to emit a character token before dealing with
273ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    // the buffered end tag.
274ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline bool flushBufferedEndTag(SegmentedString&);
27506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    inline bool temporaryBufferIs(const String&);
27606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
27706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // Sometimes we speculatively consume input characters and we don't
27868513a70bcd92384395513322f1b801e7bf9c729Steve Block    // know whether they represent end tags or RCDATA, etc. These
27906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // functions help manage these state.
28006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    inline void addToPossibleEndTag(UChar cc);
281ca9cb53ed1119a3fd98fafa0972ffeb56dee1c24Steve Block    inline void saveEndTagNameIfNeeded();
28206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    inline bool isAppropriateEndTag();
28306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
284f486d19d62f1bc33246748b14b14a9dfa617b57fIain Merrick    inline bool haveBufferedCharacterToken();
28506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
28606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    State m_state;
28706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
28806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    Vector<UChar, 32> m_appropriateEndTagName;
28906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
29068513a70bcd92384395513322f1b801e7bf9c729Steve Block    // m_token is owned by the caller. If nextToken is not on the stack,
29106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // this member might be pointing to unallocated memory.
29206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    HTMLToken* m_token;
29306ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    int m_lineNumber;
29406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
29506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    bool m_skipLeadingNewLineForListing;
296dd8bb3de4f353a81954234999f1fea748aee2ea9Ben Murdoch    bool m_forceNullCharacterReplacement;
2975ddde30071f639962dd557c453f2ad01f8f0fd00Kristian Monsen    bool m_shouldAllowCDATA;
29806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
29906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
30006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    Vector<UChar, 32> m_temporaryBuffer;
30106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
30206ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // We occationally want to emit both a character token and an end tag
30368513a70bcd92384395513322f1b801e7bf9c729Steve Block    // token (e.g., when lexing script). We buffer the name of the end tag
30406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // token here so we remember it next time we re-enter the tokenizer.
30506ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    Vector<UChar, 32> m_bufferedEndTagName;
30606ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
30706ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
30806ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    UChar m_additionalAllowedCharacter;
30906ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen
31006ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
31106ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen    InputStreamPreprocessor m_inputStreamPreprocessor;
31268513a70bcd92384395513322f1b801e7bf9c729Steve Block
31368513a70bcd92384395513322f1b801e7bf9c729Steve Block    bool m_usePreHTML5ParserQuirks;
31406ea8e899e48f1f2f396b70e63fae369f2f23232Kristian Monsen};

}

#endif