/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
26
27#ifndef HTMLTokenizer_h
28#define HTMLTokenizer_h
29
30#include "SegmentedString.h"
31#include <wtf/Noncopyable.h>
32#include <wtf/PassOwnPtr.h>
33#include <wtf/Vector.h>
34#include <wtf/text/AtomicString.h>
35
36namespace WebCore {
37
38class Element;
39class Frame;
40class HTMLToken;
41
42class HTMLTokenizer {
43    WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED;
44public:
45    enum State {
46        DataState,
47        CharacterReferenceInDataState,
48        RCDATAState,
49        CharacterReferenceInRCDATAState,
50        RAWTEXTState,
51        ScriptDataState,
52        PLAINTEXTState,
53        TagOpenState,
54        EndTagOpenState,
55        TagNameState,
56        RCDATALessThanSignState,
57        RCDATAEndTagOpenState,
58        RCDATAEndTagNameState,
59        RAWTEXTLessThanSignState,
60        RAWTEXTEndTagOpenState,
61        RAWTEXTEndTagNameState,
62        ScriptDataLessThanSignState,
63        ScriptDataEndTagOpenState,
64        ScriptDataEndTagNameState,
65        ScriptDataEscapeStartState,
66        ScriptDataEscapeStartDashState,
67        ScriptDataEscapedState,
68        ScriptDataEscapedDashState,
69        ScriptDataEscapedDashDashState,
70        ScriptDataEscapedLessThanSignState,
71        ScriptDataEscapedEndTagOpenState,
72        ScriptDataEscapedEndTagNameState,
73        ScriptDataDoubleEscapeStartState,
74        ScriptDataDoubleEscapedState,
75        ScriptDataDoubleEscapedDashState,
76        ScriptDataDoubleEscapedDashDashState,
77        ScriptDataDoubleEscapedLessThanSignState,
78        ScriptDataDoubleEscapeEndState,
79        BeforeAttributeNameState,
80        AttributeNameState,
81        AfterAttributeNameState,
82        BeforeAttributeValueState,
83        AttributeValueDoubleQuotedState,
84        AttributeValueSingleQuotedState,
85        AttributeValueUnquotedState,
86        CharacterReferenceInAttributeValueState,
87        AfterAttributeValueQuotedState,
88        SelfClosingStartTagState,
89        BogusCommentState,
90        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
91        // it internally to keep track of whether we've started the bogus
92        // comment token yet.
93        ContinueBogusCommentState,
94        MarkupDeclarationOpenState,
95        CommentStartState,
96        CommentStartDashState,
97        CommentState,
98        CommentEndDashState,
99        CommentEndState,
100        CommentEndBangState,
101        DOCTYPEState,
102        BeforeDOCTYPENameState,
103        DOCTYPENameState,
104        AfterDOCTYPENameState,
105        AfterDOCTYPEPublicKeywordState,
106        BeforeDOCTYPEPublicIdentifierState,
107        DOCTYPEPublicIdentifierDoubleQuotedState,
108        DOCTYPEPublicIdentifierSingleQuotedState,
109        AfterDOCTYPEPublicIdentifierState,
110        BetweenDOCTYPEPublicAndSystemIdentifiersState,
111        AfterDOCTYPESystemKeywordState,
112        BeforeDOCTYPESystemIdentifierState,
113        DOCTYPESystemIdentifierDoubleQuotedState,
114        DOCTYPESystemIdentifierSingleQuotedState,
115        AfterDOCTYPESystemIdentifierState,
116        BogusDOCTYPEState,
117        CDATASectionState,
118        // These CDATA states are not in the HTML5 spec, but we use them internally.
119        CDATASectionRightSquareBracketState,
120        CDATASectionDoubleRightSquareBracketState,
121    };
122
123    static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
124    ~HTMLTokenizer();
125
126    void reset();
127
128    // This function returns true if it emits a token. Otherwise, callers
129    // must provide the same (in progress) token on the next call (unless
130    // they call reset() first).
131    bool nextToken(SegmentedString&, HTMLToken&);
132
133    int lineNumber() const { return m_lineNumber; }
134    int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior.
135
136    State state() const { return m_state; }
137    void setState(State state) { m_state = state; }
138
139    // Updates the tokenizer's state according to the given tag name. This is
140    // an approximation of how the tree builder would update the tokenizer's
141    // state. This method is useful for approximating HTML tokenization. To
142    // get exactly the correct tokenization, you need the real tree builder.
143    //
144    // The main failures in the approximation are as follows:
145    //
146    //  * The first set of character tokens emitted for a <pre> element might
147    //    contain an extra leading newline.
148    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
149    //    tree builder's insertion mode.
150    //  * CDATA sections in foreign content will be tokenized as bogus comments
151    //    instead of as character tokens.
152    //
153    void updateStateFor(const AtomicString& tagName, Frame*);
154
155    // Hack to skip leading newline in <pre>/<listing> for authoring ease.
156    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
157    void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
158
159    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
160    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
161
162    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
163    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
164
165    bool shouldSkipNullCharacters() const
166    {
167        return !m_forceNullCharacterReplacement
168            && (m_state == DataState
169                || m_state == RCDATAState
170                || m_state == RAWTEXTState
171                || m_state == PLAINTEXTState);
172    }
173
174private:
175    // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
176    class InputStreamPreprocessor {
177        WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
178    public:
179        InputStreamPreprocessor(HTMLTokenizer* tokenizer)
180            : m_tokenizer(tokenizer)
181            , m_nextInputCharacter('\0')
182            , m_skipNextNewLine(false)
183        {
184        }
185
186        UChar nextInputCharacter() const { return m_nextInputCharacter; }
187
188        // Returns whether we succeeded in peeking at the next character.
189        // The only way we can fail to peek is if there are no more
190        // characters in |source| (after collapsing \r\n, etc).
191        ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
192        {
193        PeekAgain:
194            m_nextInputCharacter = *source;
195
196            // Every branch in this function is expensive, so we have a
197            // fast-reject branch for characters that don't require special
198            // handling. Please run the parser benchmark whenever you touch
199            // this function. It's very hot.
200            static const UChar specialCharacterMask = '\n' | '\r' | '\0';
201            if (m_nextInputCharacter & ~specialCharacterMask) {
202                m_skipNextNewLine = false;
203                return true;
204            }
205
206            if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
207                m_skipNextNewLine = false;
208                source.advancePastNewline(lineNumber);
209                if (source.isEmpty())
210                    return false;
211                m_nextInputCharacter = *source;
212            }
213            if (m_nextInputCharacter == '\r') {
214                m_nextInputCharacter = '\n';
215                m_skipNextNewLine = true;
216            } else {
217                m_skipNextNewLine = false;
218                // FIXME: The spec indicates that the surrogate pair range as well as
219                // a number of specific character values are parse errors and should be replaced
220                // by the replacement character. We suspect this is a problem with the spec as doing
221                // that filtering breaks surrogate pair handling and causes us not to match Minefield.
222                if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
223                    if (m_tokenizer->shouldSkipNullCharacters()) {
224                        source.advancePastNonNewline();
225                        if (source.isEmpty())
226                            return false;
227                        goto PeekAgain;
228                    }
229                    m_nextInputCharacter = 0xFFFD;
230                }
231            }
232            return true;
233        }
234
235        // Returns whether there are more characters in |source| after advancing.
236        bool advance(SegmentedString& source, int& lineNumber)
237        {
238            source.advance(lineNumber);
239            if (source.isEmpty())
240                return false;
241            return peek(source, lineNumber);
242        }
243
244        static const UChar endOfFileMarker;
245
246    private:
247        bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
248        {
249            return source.isClosed() && source.length() == 1;
250        }
251
252        HTMLTokenizer* m_tokenizer;
253
254        // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
255        UChar m_nextInputCharacter;
256        bool m_skipNextNewLine;
257    };
258
259    HTMLTokenizer(bool usePreHTML5ParserQuirks);
260
261    inline bool processEntity(SegmentedString&);
262
263    inline void parseError();
264    inline void bufferCharacter(UChar);
265    inline void bufferCodePoint(unsigned);
266
267    inline bool emitAndResumeIn(SegmentedString&, State);
268    inline bool emitAndReconsumeIn(SegmentedString&, State);
269    inline bool emitEndOfFile(SegmentedString&);
270    inline bool flushEmitAndResumeIn(SegmentedString&, State);
271
272    // Return whether we need to emit a character token before dealing with
273    // the buffered end tag.
274    inline bool flushBufferedEndTag(SegmentedString&);
275    inline bool temporaryBufferIs(const String&);
276
277    // Sometimes we speculatively consume input characters and we don't
278    // know whether they represent end tags or RCDATA, etc. These
279    // functions help manage these state.
280    inline void addToPossibleEndTag(UChar cc);
281    inline void saveEndTagNameIfNeeded();
282    inline bool isAppropriateEndTag();
283
284    inline bool haveBufferedCharacterToken();
285
286    State m_state;
287
288    Vector<UChar, 32> m_appropriateEndTagName;
289
290    // m_token is owned by the caller. If nextToken is not on the stack,
291    // this member might be pointing to unallocated memory.
292    HTMLToken* m_token;
293    int m_lineNumber;
294
295    bool m_skipLeadingNewLineForListing;
296    bool m_forceNullCharacterReplacement;
297    bool m_shouldAllowCDATA;
298
299    // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
300    Vector<UChar, 32> m_temporaryBuffer;
301
302    // We occationally want to emit both a character token and an end tag
303    // token (e.g., when lexing script). We buffer the name of the end tag
304    // token here so we remember it next time we re-enter the tokenizer.
305    Vector<UChar, 32> m_bufferedEndTagName;
306
307    // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
308    UChar m_additionalAllowedCharacter;
309
310    // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
311    InputStreamPreprocessor m_inputStreamPreprocessor;
312
313    bool m_usePreHTML5ParserQuirks;
314};
315
316}
317
318#endif
319