HTMLToken.h revision cad810f21b803229eb11403f9209855525a25d57
1/*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef HTMLToken_h
27#define HTMLToken_h
28
29#include "NamedNodeMap.h"
30#include <wtf/Noncopyable.h>
31#include <wtf/PassOwnPtr.h>
32#include <wtf/Vector.h>
33
34namespace WebCore {
35
36class HTMLToken : public Noncopyable {
37public:
38    enum Type {
39        Uninitialized,
40        DOCTYPE,
41        StartTag,
42        EndTag,
43        Comment,
44        Character,
45        EndOfFile,
46    };
47
48    class Range {
49    public:
50        int m_start;
51        int m_end;
52    };
53
54    class Attribute {
55    public:
56        Range m_nameRange;
57        Range m_valueRange;
58        WTF::Vector<UChar, 32> m_name;
59        WTF::Vector<UChar, 32> m_value;
60    };
61
62    typedef WTF::Vector<Attribute, 10> AttributeList;
63    typedef WTF::Vector<UChar, 1024> DataVector;
64
65    HTMLToken() { clear(); }
66
67    void clear(int startIndex = 0)
68    {
69        m_type = Uninitialized;
70        m_range.m_start = startIndex;
71        m_range.m_end = startIndex;
72        m_data.clear();
73    }
74
75    int startIndex() const { return m_range.m_start; }
76    int endIndex() const { return m_range.m_end; }
77
78    void end(int endIndex)
79    {
80        m_range.m_end = endIndex;
81    }
82
83    void makeEndOfFile()
84    {
85        ASSERT(m_type == Uninitialized);
86        m_type = EndOfFile;
87    }
88
89    void beginStartTag(UChar character)
90    {
91        ASSERT(character);
92        ASSERT(m_type == Uninitialized);
93        m_type = StartTag;
94        m_selfClosing = false;
95        m_currentAttribute = 0;
96        m_attributes.clear();
97
98        m_data.append(character);
99    }
100
101    template<typename T>
102    void beginEndTag(T characters)
103    {
104        ASSERT(m_type == Uninitialized);
105        m_type = EndTag;
106        m_selfClosing = false;
107        m_currentAttribute = 0;
108        m_attributes.clear();
109
110        m_data.append(characters);
111    }
112
113    // Starting a character token works slightly differently than starting
114    // other types of tokens because we want to save a per-character branch.
115    void ensureIsCharacterToken()
116    {
117        ASSERT(m_type == Uninitialized || m_type == Character);
118        m_type = Character;
119    }
120
121    void beginComment()
122    {
123        ASSERT(m_type == Uninitialized);
124        m_type = Comment;
125    }
126
127    void beginDOCTYPE()
128    {
129        ASSERT(m_type == Uninitialized);
130        m_type = DOCTYPE;
131        m_doctypeData = adoptPtr(new DoctypeData());
132    }
133
134    void beginDOCTYPE(UChar character)
135    {
136        ASSERT(character);
137        beginDOCTYPE();
138        m_data.append(character);
139    }
140
141    void appendToName(UChar character)
142    {
143        ASSERT(character);
144        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
145        m_data.append(character);
146    }
147
148    template<typename T>
149    void appendToCharacter(T characters)
150    {
151        ASSERT(m_type == Character);
152        m_data.append(characters);
153    }
154
155    void appendToComment(UChar character)
156    {
157        ASSERT(character);
158        ASSERT(m_type == Comment);
159        m_data.append(character);
160    }
161
162    void addNewAttribute()
163    {
164        ASSERT(m_type == StartTag || m_type == EndTag);
165        m_attributes.grow(m_attributes.size() + 1);
166        m_currentAttribute = &m_attributes.last();
167#ifndef NDEBUG
168        m_currentAttribute->m_nameRange.m_start = 0;
169        m_currentAttribute->m_nameRange.m_end = 0;
170        m_currentAttribute->m_valueRange.m_start = 0;
171        m_currentAttribute->m_valueRange.m_end = 0;
172#endif
173    }
174
175    void beginAttributeName(int index)
176    {
177        m_currentAttribute->m_nameRange.m_start = index;
178    }
179
180    void endAttributeName(int index)
181    {
182        m_currentAttribute->m_nameRange.m_end = index;
183        m_currentAttribute->m_valueRange.m_start = index;
184        m_currentAttribute->m_valueRange.m_end = index;
185    }
186
187    void beginAttributeValue(int index)
188    {
189        m_currentAttribute->m_valueRange.m_start = index;
190#ifndef NDEBUG
191        m_currentAttribute->m_valueRange.m_end = 0;
192#endif
193    }
194
195    void endAttributeValue(int index)
196    {
197        m_currentAttribute->m_valueRange.m_end = index;
198    }
199
200    void appendToAttributeName(UChar character)
201    {
202        ASSERT(character);
203        ASSERT(m_type == StartTag || m_type == EndTag);
204        ASSERT(m_currentAttribute->m_nameRange.m_start);
205        m_currentAttribute->m_name.append(character);
206    }
207
208    void appendToAttributeValue(UChar character)
209    {
210        ASSERT(character);
211        ASSERT(m_type == StartTag || m_type == EndTag);
212        ASSERT(m_currentAttribute->m_valueRange.m_start);
213        m_currentAttribute->m_value.append(character);
214    }
215
216    Type type() const { return m_type; }
217
218    bool selfClosing() const
219    {
220        ASSERT(m_type == StartTag || m_type == EndTag);
221        return m_selfClosing;
222    }
223
224    void setSelfClosing()
225    {
226        ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
227        m_selfClosing = true;
228    }
229
230    const AttributeList& attributes() const
231    {
232        ASSERT(m_type == StartTag || m_type == EndTag);
233        return m_attributes;
234    }
235
236    const DataVector& name() const
237    {
238        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
239        return m_data;
240    }
241
242    const DataVector& characters() const
243    {
244        ASSERT(m_type == Character);
245        return m_data;
246    }
247
248    const DataVector& comment() const
249    {
250        ASSERT(m_type == Comment);
251        return m_data;
252    }
253
254    // FIXME: Distinguish between a missing public identifer and an empty one.
255    const WTF::Vector<UChar>& publicIdentifier() const
256    {
257        ASSERT(m_type == DOCTYPE);
258        return m_doctypeData->m_publicIdentifier;
259    }
260
261    // FIXME: Distinguish between a missing system identifer and an empty one.
262    const WTF::Vector<UChar>& systemIdentifier() const
263    {
264        ASSERT(m_type == DOCTYPE);
265        return m_doctypeData->m_systemIdentifier;
266    }
267
268    void setPublicIdentifierToEmptyString()
269    {
270        ASSERT(m_type == DOCTYPE);
271        m_doctypeData->m_hasPublicIdentifier = true;
272        m_doctypeData->m_publicIdentifier.clear();
273    }
274
275    void setSystemIdentifierToEmptyString()
276    {
277        ASSERT(m_type == DOCTYPE);
278        m_doctypeData->m_hasSystemIdentifier = true;
279        m_doctypeData->m_systemIdentifier.clear();
280    }
281
282    bool forceQuirks() const
283    {
284        ASSERT(m_type == DOCTYPE);
285        return m_doctypeData->m_forceQuirks;
286    }
287
288    void setForceQuirks()
289    {
290        ASSERT(m_type == DOCTYPE);
291        m_doctypeData->m_forceQuirks = true;
292    }
293
294    void appendToPublicIdentifier(UChar character)
295    {
296        ASSERT(character);
297        ASSERT(m_type == DOCTYPE);
298        ASSERT(m_doctypeData->m_hasPublicIdentifier);
299        m_doctypeData->m_publicIdentifier.append(character);
300    }
301
302    void appendToSystemIdentifier(UChar character)
303    {
304        ASSERT(character);
305        ASSERT(m_type == DOCTYPE);
306        ASSERT(m_doctypeData->m_hasSystemIdentifier);
307        m_doctypeData->m_systemIdentifier.append(character);
308    }
309
310private:
311    // FIXME: I'm not sure what the final relationship between HTMLToken and
312    // AtomicHTMLToken will be.  I'm marking this a friend for now, but we'll
313    // want to end up with a cleaner interface between the two classes.
314    friend class AtomicHTMLToken;
315
316    class DoctypeData : public Noncopyable {
317    public:
318        DoctypeData()
319            : m_hasPublicIdentifier(false)
320            , m_hasSystemIdentifier(false)
321            , m_forceQuirks(false)
322        {
323        }
324
325        bool m_hasPublicIdentifier;
326        bool m_hasSystemIdentifier;
327        bool m_forceQuirks;
328        WTF::Vector<UChar> m_publicIdentifier;
329        WTF::Vector<UChar> m_systemIdentifier;
330    };
331
332    Type m_type;
333
334    // Which characters from the input stream are represented by this token.
335    Range m_range;
336
337    // "name" for DOCTYPE, StartTag, and EndTag
338    // "characters" for Character
339    // "data" for Comment
340    DataVector m_data;
341
342    // For DOCTYPE
343    OwnPtr<DoctypeData> m_doctypeData;
344
345    // For StartTag and EndTag
346    bool m_selfClosing;
347    AttributeList m_attributes;
348
349    // A pointer into m_attributes used during lexing.
350    Attribute* m_currentAttribute;
351};
352
// FIXME: This class should eventually be named HTMLToken once we move the
// existing HTMLToken to be internal to the HTMLTokenizer.
355class AtomicHTMLToken : public Noncopyable {
356public:
357    AtomicHTMLToken(HTMLToken& token)
358        : m_type(token.type())
359    {
360        switch (m_type) {
361        case HTMLToken::Uninitialized:
362            ASSERT_NOT_REACHED();
363            break;
364        case HTMLToken::DOCTYPE:
365            m_name = AtomicString(token.name().data(), token.name().size());
366            m_doctypeData = token.m_doctypeData.release();
367            break;
368        case HTMLToken::EndOfFile:
369            break;
370        case HTMLToken::StartTag:
371        case HTMLToken::EndTag: {
372            m_selfClosing = token.selfClosing();
373            m_name = AtomicString(token.name().data(), token.name().size());
374            const HTMLToken::AttributeList& attributes = token.attributes();
375            for (HTMLToken::AttributeList::const_iterator iter = attributes.begin();
376                 iter != attributes.end(); ++iter) {
377                if (!iter->m_name.isEmpty()) {
378                    String name(iter->m_name.data(), iter->m_name.size());
379                    String value(iter->m_value.data(), iter->m_value.size());
380                    ASSERT(iter->m_nameRange.m_start);
381                    ASSERT(iter->m_nameRange.m_end);
382                    ASSERT(iter->m_valueRange.m_start);
383                    ASSERT(iter->m_valueRange.m_end);
384                    RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value);
385                    if (!m_attributes) {
386                        m_attributes = NamedNodeMap::create();
387                        // Reserving capacity here improves the parser
388                        // benchmark.  It might be worth experimenting with
389                        // the constant to see where the optimal point is.
390                        m_attributes->reserveInitialCapacity(10);
391                    }
392                    m_attributes->insertAttribute(mappedAttribute.release(), false);
393                }
394            }
395            break;
396        }
397        case HTMLToken::Comment:
398            m_data = String(token.comment().data(), token.comment().size());
399            break;
400        case HTMLToken::Character:
401            m_externalCharacters = &token.characters();
402            break;
403        }
404    }
405
406    AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
407        : m_type(type)
408        , m_name(name)
409        , m_attributes(attributes)
410    {
411        ASSERT(usesName());
412    }
413
414    HTMLToken::Type type() const { return m_type; }
415
416    const AtomicString& name() const
417    {
418        ASSERT(usesName());
419        return m_name;
420    }
421
422    void setName(const AtomicString& name)
423    {
424        ASSERT(usesName());
425        m_name = name;
426    }
427
428    bool selfClosing() const
429    {
430        ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
431        return m_selfClosing;
432    }
433
434    Attribute* getAttributeItem(const QualifiedName& attributeName)
435    {
436        ASSERT(usesAttributes());
437        if (!m_attributes)
438            return 0;
439        return m_attributes->getAttributeItem(attributeName);
440    }
441
442    NamedNodeMap* attributes() const
443    {
444        ASSERT(usesAttributes());
445        return m_attributes.get();
446    }
447
448    PassRefPtr<NamedNodeMap> takeAtributes()
449    {
450        ASSERT(usesAttributes());
451        return m_attributes.release();
452    }
453
454    const HTMLToken::DataVector& characters() const
455    {
456        ASSERT(m_type == HTMLToken::Character);
457        return *m_externalCharacters;
458    }
459
460    const String& comment() const
461    {
462        ASSERT(m_type == HTMLToken::Comment);
463        return m_data;
464    }
465
466    // FIXME: Distinguish between a missing public identifer and an empty one.
467    WTF::Vector<UChar>& publicIdentifier() const
468    {
469        ASSERT(m_type == HTMLToken::DOCTYPE);
470        return m_doctypeData->m_publicIdentifier;
471    }
472
473    // FIXME: Distinguish between a missing system identifer and an empty one.
474    WTF::Vector<UChar>& systemIdentifier() const
475    {
476        ASSERT(m_type == HTMLToken::DOCTYPE);
477        return m_doctypeData->m_systemIdentifier;
478    }
479
480    bool forceQuirks() const
481    {
482        ASSERT(m_type == HTMLToken::DOCTYPE);
483        return m_doctypeData->m_forceQuirks;
484    }
485
486private:
487    HTMLToken::Type m_type;
488
489    bool usesName() const
490    {
491        return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
492    }
493
494    bool usesAttributes() const
495    {
496        return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
497    }
498
499    // "name" for DOCTYPE, StartTag, and EndTag
500    AtomicString m_name;
501
502    // "data" for Comment
503    String m_data;
504
505    // "characters" for Character
506    //
507    // We don't want to copy the the characters out of the HTMLToken, so we
508    // keep a pointer to its buffer instead.  This buffer is owned by the
509    // HTMLToken and causes a lifetime dependence between these objects.
510    //
511    // FIXME: Add a mechanism for "internalizing" the characters when the
512    //        HTMLToken is destructed.
513    const HTMLToken::DataVector* m_externalCharacters;
514
515    // For DOCTYPE
516    OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
517
518    // For StartTag and EndTag
519    bool m_selfClosing;
520
521    RefPtr<NamedNodeMap> m_attributes;
522};
523
524}
525
526#endif
527