1/*
2    Copyright (C) 1997 Martin Jones (mjones@kde.org)
3              (C) 1997 Torben Weis (weis@kde.org)
4              (C) 1998 Waldo Bastian (bastian@kde.org)
5              (C) 2001 Dirk Mueller (mueller@kde.org)
6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
7
8    This library is free software; you can redistribute it and/or
9    modify it under the terms of the GNU Library General Public
10    License as published by the Free Software Foundation; either
11    version 2 of the License, or (at your option) any later version.
12
13    This library is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    Library General Public License for more details.
17
18    You should have received a copy of the GNU Library General Public License
19    along with this library; see the file COPYING.LIB.  If not, write to
20    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21    Boston, MA 02110-1301, USA.
22*/
23
24#ifndef HTMLTokenizer_h
25#define HTMLTokenizer_h
26
27#include "CachedResourceClient.h"
28#include "CachedResourceHandle.h"
29#include "NamedMappedAttrMap.h"
30#include "MappedAttributeEntry.h"
31#include "SegmentedString.h"
32#include "Timer.h"
33#include "Tokenizer.h"
34#include <wtf/Deque.h>
35#include <wtf/OwnPtr.h>
36#include <wtf/Vector.h>
37
38namespace WebCore {
39
40class CachedScript;
41class DocumentFragment;
42class Document;
43class HTMLDocument;
44class HTMLScriptElement;
45class HTMLViewSourceDocument;
46class FrameView;
47class HTMLParser;
48class Node;
49class PreloadScanner;
50class ScriptSourceCode;
51
/**
 * @internal
 * Represents one HTML tag: the tag name (tagName) plus the list of
 * attributes (attrs). Can also represent text, in which case the
 * text member holds the characters.
 */
58struct Token {
59    Token()
60        : beginTag(true)
61        , selfClosingTag(false)
62        , brokenXMLStyle(false)
63        , m_sourceInfo(0)
64    { }
65    ~Token() { }
66
67    void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
68
69    bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
70    bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
71
72    void reset()
73    {
74        attrs = 0;
75        text = 0;
76        tagName = nullAtom;
77        beginTag = true;
78        selfClosingTag = false;
79        brokenXMLStyle = false;
80        if (m_sourceInfo)
81            m_sourceInfo->clear();
82    }
83
84    void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
85
86    RefPtr<NamedMappedAttrMap> attrs;
87    RefPtr<StringImpl> text;
88    AtomicString tagName;
89    bool beginTag;
90    bool selfClosingTag;
91    bool brokenXMLStyle;
92    OwnPtr<Vector<UChar> > m_sourceInfo;
93};
94
// States stored in DoctypeToken::m_state. The numeric values are spelled out
// here; they are the same values the compiler assigns implicitly (0..10 in
// declaration order). DoctypeBegin is the state DoctypeToken::reset() selects.
enum DoctypeState {
    DoctypeBegin = 0,
    DoctypeBeforeName = 1,
    DoctypeName = 2,
    DoctypeAfterName = 3,
    DoctypeBeforePublicID = 4,
    DoctypePublicID = 5,
    DoctypeAfterPublicID = 6,
    DoctypeBeforeSystemID = 7,
    DoctypeSystemID = 8,
    DoctypeAfterSystemID = 9,
    DoctypeBogus = 10
};
108
109class DoctypeToken {
110public:
111    DoctypeToken() {}
112
113    void reset()
114    {
115        m_name.clear();
116        m_publicID.clear();
117        m_systemID.clear();
118        m_state = DoctypeBegin;
119        m_source.clear();
120    }
121
122    DoctypeState state() { return m_state; }
123    void setState(DoctypeState s) { m_state = s; }
124
125    Vector<UChar> m_name;
126    Vector<UChar> m_publicID;
127    Vector<UChar> m_systemID;
128    DoctypeState m_state;
129
130    Vector<UChar> m_source;
131};
132
133//-----------------------------------------------------------------------------
134
135class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
136public:
137    HTMLTokenizer(HTMLDocument*, bool reportErrors);
138    HTMLTokenizer(HTMLViewSourceDocument*);
139    HTMLTokenizer(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
140    virtual ~HTMLTokenizer();
141
142    virtual void write(const SegmentedString&, bool appendData);
143    virtual void finish();
144    virtual void setForceSynchronous(bool force);
145    virtual bool isWaitingForScripts() const;
146    virtual void stopParsing();
147    virtual bool processingData() const;
148    virtual int executingScript() const { return m_executingScript; }
149
150    virtual int lineNumber() const { return m_lineNumber; }
151    virtual int columnNumber() const { return 1; }
152
153    bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
154
155    virtual void executeScriptsWaitingForStylesheets();
156
157    virtual bool isHTMLTokenizer() const { return true; }
158    HTMLParser* htmlParser() const { return m_parser.get(); }
159
160private:
161    class State;
162
163    // Where we are in parsing a tag
164    void begin();
165    void end();
166
167    void reset();
168
169    PassRefPtr<Node> processToken();
170    void processDoctypeToken();
171
172    State processListing(SegmentedString, State);
173    State parseComment(SegmentedString&, State);
174    State parseDoctype(SegmentedString&, State);
175    State parseServer(SegmentedString&, State);
176    State parseText(SegmentedString&, State);
177    State parseNonHTMLText(SegmentedString&, State);
178    State parseTag(SegmentedString&, State);
179    State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
180    State parseProcessingInstruction(SegmentedString&, State);
181    State scriptHandler(State);
182    State scriptExecution(const ScriptSourceCode&, State);
183    void setSrc(const SegmentedString&);
184
185    // check if we have enough space in the buffer.
186    // if not enlarge it
187    inline void checkBuffer(int len = 10)
188    {
189        if ((m_dest - m_buffer) > m_bufferSize - len)
190            enlargeBuffer(len);
191    }
192
193    inline void checkScriptBuffer(int len = 10)
194    {
195        if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
196            enlargeScriptBuffer(len);
197    }
198
199    void enlargeBuffer(int len);
200    void enlargeScriptBuffer(int len);
201
202    bool continueProcessing(int& processedCount, double startTime, State&);
203    void timerFired(Timer<HTMLTokenizer>*);
204    void allDataProcessed();
205
206    // from CachedResourceClient
207    void notifyFinished(CachedResource*);
208
209    void executeExternalScriptsIfReady();
210    void executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*);
211    bool continueExecutingExternalScripts(double startTime);
212
213    // Internal buffers
214    ///////////////////
215    UChar* m_buffer;
216    int m_bufferSize;
217    UChar* m_dest;
218
219    Token m_currentToken;
220
221    // This buffer holds the raw characters we've seen between the beginning of
222    // the attribute name and the first character of the attribute value.
223    Vector<UChar, 32> m_rawAttributeBeforeValue;
224
225    // Tokenizer flags
226    //////////////////
227    // are we in quotes within a html tag
228    enum { NoQuote, SingleQuote, DoubleQuote } tquote;
229
230    // Are we in a &... character entity description?
231    enum EntityState {
232        NoEntity = 0,
233        SearchEntity = 1,
234        NumericSearch = 2,
235        Hexadecimal = 3,
236        Decimal = 4,
237        EntityName = 5,
238        SearchSemicolon = 6
239    };
240    unsigned EntityUnicodeValue;
241
242    enum TagState {
243        NoTag = 0,
244        TagName = 1,
245        SearchAttribute = 2,
246        AttributeName = 3,
247        SearchEqual = 4,
248        SearchValue = 5,
249        QuotedValue = 6,
250        Value = 7,
251        SearchEnd = 8
252    };
253
254    class State {
255    public:
256        State() : m_bits(0) { }
257
258        TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
259        void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
260        EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
261        void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
262
263        bool inScript() const { return testBit(InScript); }
264        void setInScript(bool v) { setBit(InScript, v); }
265        bool inStyle() const { return testBit(InStyle); }
266        void setInStyle(bool v) { setBit(InStyle, v); }
267        bool inXmp() const { return testBit(InXmp); }
268        void setInXmp(bool v) { setBit(InXmp, v); }
269        bool inTitle() const { return testBit(InTitle); }
270        void setInTitle(bool v) { setBit(InTitle, v); }
271        bool inIFrame() const { return testBit(InIFrame); }
272        void setInIFrame(bool v) { setBit(InIFrame, v); }
273        bool inPlainText() const { return testBit(InPlainText); }
274        void setInPlainText(bool v) { setBit(InPlainText, v); }
275        bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
276        void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
277        bool inComment() const { return testBit(InComment); }
278        void setInComment(bool v) { setBit(InComment, v); }
279        bool inDoctype() const { return testBit(InDoctype); }
280        void setInDoctype(bool v) { setBit(InDoctype, v); }
281        bool inTextArea() const { return testBit(InTextArea); }
282        void setInTextArea(bool v) { setBit(InTextArea, v); }
283        bool escaped() const { return testBit(Escaped); }
284        void setEscaped(bool v) { setBit(Escaped, v); }
285        bool inServer() const { return testBit(InServer); }
286        void setInServer(bool v) { setBit(InServer, v); }
287        bool skipLF() const { return testBit(SkipLF); }
288        void setSkipLF(bool v) { setBit(SkipLF, v); }
289        bool startTag() const { return testBit(StartTag); }
290        void setStartTag(bool v) { setBit(StartTag, v); }
291        bool discardLF() const { return testBit(DiscardLF); }
292        void setDiscardLF(bool v) { setBit(DiscardLF, v); }
293        bool allowYield() const { return testBit(AllowYield); }
294        void setAllowYield(bool v) { setBit(AllowYield, v); }
295        bool loadingExtScript() const { return testBit(LoadingExtScript); }
296        void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
297        bool forceSynchronous() const { return testBit(ForceSynchronous); }
298        void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
299
300        bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
301        bool hasTagState() const { return m_bits & TagMask; }
302        bool hasEntityState() const { return m_bits & EntityMask; }
303
304        bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
305
306    private:
307        static const int EntityShift = 4;
308        enum StateBits {
309            TagMask = (1 << 4) - 1,
310            EntityMask = (1 << 7) - (1 << 4),
311            InScript = 1 << 7,
312            InStyle = 1 << 8,
313            // Bit 9 unused
314            InXmp = 1 << 10,
315            InTitle = 1 << 11,
316            InPlainText = 1 << 12,
317            InProcessingInstruction = 1 << 13,
318            InComment = 1 << 14,
319            InTextArea = 1 << 15,
320            Escaped = 1 << 16,
321            InServer = 1 << 17,
322            SkipLF = 1 << 18,
323            StartTag = 1 << 19,
324            DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
325            AllowYield = 1 << 21,
326            LoadingExtScript = 1 << 22,
327            ForceSynchronous = 1 << 23,
328            InIFrame = 1 << 24,
329            InDoctype = 1 << 25
330        };
331
332        void setBit(StateBits bit, bool value)
333        {
334            if (value)
335                m_bits |= bit;
336            else
337                m_bits &= ~bit;
338        }
339        bool testBit(StateBits bit) const { return m_bits & bit; }
340
341        unsigned m_bits;
342    };
343
344    State m_state;
345
346    DoctypeToken m_doctypeToken;
347    int m_doctypeSearchCount;
348    int m_doctypeSecondarySearchCount;
349
350    bool m_brokenServer;
351
352    // Name of an attribute that we just scanned.
353    AtomicString m_attrName;
354
355    // Used to store the code of a scripting sequence
356    UChar* m_scriptCode;
357    // Size of the script sequenze stored in @ref #scriptCode
358    int m_scriptCodeSize;
359    // Maximal size that can be stored in @ref #scriptCode
360    int m_scriptCodeCapacity;
361    // resync point of script code size
362    int m_scriptCodeResync;
363
364    // Stores characters if we are scanning for a string like "</script>"
365    UChar searchBuffer[10];
366
367    // Counts where we are in the string we are scanning for
368    int searchCount;
369    // the stopper string
370    const char* m_searchStopper;
371    int m_searchStopperLength;
372
373    // if no more data is coming, just parse what we have (including ext scripts that
374    // may be still downloading) and finish
375    bool m_noMoreData;
376    // URL to get source code of script from
377    String m_scriptTagSrcAttrValue;
378    String m_scriptTagCharsetAttrValue;
379    // the HTML code we will parse after the external script we are waiting for has loaded
380    SegmentedString m_pendingSrc;
381
382    // the HTML code we will parse after this particular script has
383    // loaded, but before all pending HTML
384    SegmentedString* m_currentPrependingSrc;
385
386    // true if we are executing a script while parsing a document. This causes the parsing of
387    // the output of the script to be postponed until after the script has finished executing
388    int m_executingScript;
389    Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
390    RefPtr<HTMLScriptElement> m_scriptNode;
391
392    bool m_requestingScript;
393    bool m_hasScriptsWaitingForStylesheets;
394
395    // if we found one broken comment, there are most likely others as well
396    // store a flag to get rid of the O(n^2) behaviour in such a case.
397    bool m_brokenComments;
398    // current line number
399    int m_lineNumber;
400    int m_currentScriptTagStartLineNumber;
401    int m_currentTagStartLineNumber;
402
403    double m_tokenizerTimeDelay;
404    int m_tokenizerChunkSize;
405
406    // The timer for continued processing.
407    Timer<HTMLTokenizer> m_timer;
408
409    // The timer for continued executing external scripts.
410    Timer<HTMLTokenizer> m_externalScriptsTimer;
411
412// This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
413// So any fixed number might be too small, but rather than rewriting all usage of this buffer
414// we'll just make it large enough to handle all imaginable cases.
415#define CBUFLEN 1024
416    UChar m_cBuffer[CBUFLEN + 2];
417    unsigned int m_cBufferPos;
418
419    SegmentedString m_src;
420    Document* m_doc;
421    OwnPtr<HTMLParser> m_parser;
422    bool m_inWrite;
423    bool m_fragment;
424    FragmentScriptingPermission m_scriptingPermission;
425
426    OwnPtr<PreloadScanner> m_preloadScanner;
427};
428
// Declared here, defined elsewhere.
// NOTE(review): presumably parses the given markup string into the supplied
// DocumentFragment, with script handling governed by the permission argument
// (defaults to FragmentScriptingAllowed). Confirm against the definition.
void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);

// Declared here, defined elsewhere.
// NOTE(review): presumably maps an entity name to its character; the result
// for an unknown name is not visible from this header. Confirm.
UChar decodeNamedEntity(const char*);
432
433} // namespace WebCore
434
435#endif // HTMLTokenizer_h
436