HTMLToken.h revision ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddb
1/*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef HTMLToken_h
27#define HTMLToken_h
28
29#include "NamedNodeMap.h"
30#include <wtf/PassOwnPtr.h>
31#include <wtf/Vector.h>
32
33namespace WebCore {
34
35class HTMLToken {
36    WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
37public:
38    enum Type {
39        Uninitialized,
40        DOCTYPE,
41        StartTag,
42        EndTag,
43        Comment,
44        Character,
45        EndOfFile,
46    };
47
48    class Range {
49    public:
50        int m_start;
51        int m_end;
52    };
53
54    class Attribute {
55    public:
56        Range m_nameRange;
57        Range m_valueRange;
58        WTF::Vector<UChar, 32> m_name;
59        WTF::Vector<UChar, 32> m_value;
60    };
61
62    typedef WTF::Vector<Attribute, 10> AttributeList;
63    typedef WTF::Vector<UChar, 1024> DataVector;
64
65    HTMLToken() { clear(); }
66
67    void clear(int startIndex = 0)
68    {
69        m_type = Uninitialized;
70        m_range.m_start = startIndex;
71        m_range.m_end = startIndex;
72        m_data.clear();
73    }
74
75    int startIndex() const { return m_range.m_start; }
76    int endIndex() const { return m_range.m_end; }
77
78    void end(int endIndex)
79    {
80        m_range.m_end = endIndex;
81    }
82
83    void makeEndOfFile()
84    {
85        ASSERT(m_type == Uninitialized);
86        m_type = EndOfFile;
87    }
88
89    void beginStartTag(UChar character)
90    {
91        ASSERT(character);
92        ASSERT(m_type == Uninitialized);
93        m_type = StartTag;
94        m_selfClosing = false;
95        m_currentAttribute = 0;
96        m_attributes.clear();
97
98        m_data.append(character);
99    }
100
101    template<typename T>
102    void beginEndTag(T characters)
103    {
104        ASSERT(m_type == Uninitialized);
105        m_type = EndTag;
106        m_selfClosing = false;
107        m_currentAttribute = 0;
108        m_attributes.clear();
109
110        m_data.append(characters);
111    }
112
113    // Starting a character token works slightly differently than starting
114    // other types of tokens because we want to save a per-character branch.
115    void ensureIsCharacterToken()
116    {
117        ASSERT(m_type == Uninitialized || m_type == Character);
118        m_type = Character;
119    }
120
121    void beginComment()
122    {
123        ASSERT(m_type == Uninitialized);
124        m_type = Comment;
125    }
126
127    void beginDOCTYPE()
128    {
129        ASSERT(m_type == Uninitialized);
130        m_type = DOCTYPE;
131        m_doctypeData = adoptPtr(new DoctypeData());
132    }
133
134    void beginDOCTYPE(UChar character)
135    {
136        ASSERT(character);
137        beginDOCTYPE();
138        m_data.append(character);
139    }
140
141    void appendToName(UChar character)
142    {
143        ASSERT(character);
144        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
145        m_data.append(character);
146    }
147
148    template<typename T>
149    void appendToCharacter(T characters)
150    {
151        ASSERT(m_type == Character);
152        m_data.append(characters);
153    }
154
155    void appendToComment(UChar character)
156    {
157        ASSERT(character);
158        ASSERT(m_type == Comment);
159        m_data.append(character);
160    }
161
162    void addNewAttribute()
163    {
164        ASSERT(m_type == StartTag || m_type == EndTag);
165        m_attributes.grow(m_attributes.size() + 1);
166        m_currentAttribute = &m_attributes.last();
167#ifndef NDEBUG
168        m_currentAttribute->m_nameRange.m_start = 0;
169        m_currentAttribute->m_nameRange.m_end = 0;
170        m_currentAttribute->m_valueRange.m_start = 0;
171        m_currentAttribute->m_valueRange.m_end = 0;
172#endif
173    }
174
175    void beginAttributeName(int index)
176    {
177        m_currentAttribute->m_nameRange.m_start = index;
178    }
179
180    void endAttributeName(int index)
181    {
182        m_currentAttribute->m_nameRange.m_end = index;
183        m_currentAttribute->m_valueRange.m_start = index;
184        m_currentAttribute->m_valueRange.m_end = index;
185    }
186
187    void beginAttributeValue(int index)
188    {
189        m_currentAttribute->m_valueRange.m_start = index;
190#ifndef NDEBUG
191        m_currentAttribute->m_valueRange.m_end = 0;
192#endif
193    }
194
195    void endAttributeValue(int index)
196    {
197        m_currentAttribute->m_valueRange.m_end = index;
198    }
199
200    void appendToAttributeName(UChar character)
201    {
202        ASSERT(character);
203        ASSERT(m_type == StartTag || m_type == EndTag);
204        ASSERT(m_currentAttribute->m_nameRange.m_start);
205        m_currentAttribute->m_name.append(character);
206    }
207
208    void appendToAttributeValue(UChar character)
209    {
210        ASSERT(character);
211        ASSERT(m_type == StartTag || m_type == EndTag);
212        ASSERT(m_currentAttribute->m_valueRange.m_start);
213        m_currentAttribute->m_value.append(character);
214    }
215
216    Type type() const { return m_type; }
217
218    bool selfClosing() const
219    {
220        ASSERT(m_type == StartTag || m_type == EndTag);
221        return m_selfClosing;
222    }
223
224    void setSelfClosing()
225    {
226        ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
227        m_selfClosing = true;
228    }
229
230    const AttributeList& attributes() const
231    {
232        ASSERT(m_type == StartTag || m_type == EndTag);
233        return m_attributes;
234    }
235
236    const DataVector& name() const
237    {
238        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
239        return m_data;
240    }
241
242    const DataVector& characters() const
243    {
244        ASSERT(m_type == Character);
245        return m_data;
246    }
247
248    const DataVector& comment() const
249    {
250        ASSERT(m_type == Comment);
251        return m_data;
252    }
253
254    // FIXME: Distinguish between a missing public identifer and an empty one.
255    const WTF::Vector<UChar>& publicIdentifier() const
256    {
257        ASSERT(m_type == DOCTYPE);
258        return m_doctypeData->m_publicIdentifier;
259    }
260
261    // FIXME: Distinguish between a missing system identifer and an empty one.
262    const WTF::Vector<UChar>& systemIdentifier() const
263    {
264        ASSERT(m_type == DOCTYPE);
265        return m_doctypeData->m_systemIdentifier;
266    }
267
268    void setPublicIdentifierToEmptyString()
269    {
270        ASSERT(m_type == DOCTYPE);
271        m_doctypeData->m_hasPublicIdentifier = true;
272        m_doctypeData->m_publicIdentifier.clear();
273    }
274
275    void setSystemIdentifierToEmptyString()
276    {
277        ASSERT(m_type == DOCTYPE);
278        m_doctypeData->m_hasSystemIdentifier = true;
279        m_doctypeData->m_systemIdentifier.clear();
280    }
281
282    bool forceQuirks() const
283    {
284        ASSERT(m_type == DOCTYPE);
285        return m_doctypeData->m_forceQuirks;
286    }
287
288    void setForceQuirks()
289    {
290        ASSERT(m_type == DOCTYPE);
291        m_doctypeData->m_forceQuirks = true;
292    }
293
294    void appendToPublicIdentifier(UChar character)
295    {
296        ASSERT(character);
297        ASSERT(m_type == DOCTYPE);
298        ASSERT(m_doctypeData->m_hasPublicIdentifier);
299        m_doctypeData->m_publicIdentifier.append(character);
300    }
301
302    void appendToSystemIdentifier(UChar character)
303    {
304        ASSERT(character);
305        ASSERT(m_type == DOCTYPE);
306        ASSERT(m_doctypeData->m_hasSystemIdentifier);
307        m_doctypeData->m_systemIdentifier.append(character);
308    }
309
310private:
311    // FIXME: I'm not sure what the final relationship between HTMLToken and
312    // AtomicHTMLToken will be.  I'm marking this a friend for now, but we'll
313    // want to end up with a cleaner interface between the two classes.
314    friend class AtomicHTMLToken;
315
316    class DoctypeData {
317        WTF_MAKE_NONCOPYABLE(DoctypeData);
318    public:
319        DoctypeData()
320            : m_hasPublicIdentifier(false)
321            , m_hasSystemIdentifier(false)
322            , m_forceQuirks(false)
323        {
324        }
325
326        bool m_hasPublicIdentifier;
327        bool m_hasSystemIdentifier;
328        bool m_forceQuirks;
329        WTF::Vector<UChar> m_publicIdentifier;
330        WTF::Vector<UChar> m_systemIdentifier;
331    };
332
333    Type m_type;
334
335    // Which characters from the input stream are represented by this token.
336    Range m_range;
337
338    // "name" for DOCTYPE, StartTag, and EndTag
339    // "characters" for Character
340    // "data" for Comment
341    DataVector m_data;
342
343    // For DOCTYPE
344    OwnPtr<DoctypeData> m_doctypeData;
345
346    // For StartTag and EndTag
347    bool m_selfClosing;
348    AttributeList m_attributes;
349
350    // A pointer into m_attributes used during lexing.
351    Attribute* m_currentAttribute;
352};
353
// FIXME: This class should eventually be named HTMLToken once we move the
// existing HTMLToken to be internal to the HTMLTokenizer.
356class AtomicHTMLToken {
357    WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
358public:
359    AtomicHTMLToken(HTMLToken& token)
360        : m_type(token.type())
361    {
362        switch (m_type) {
363        case HTMLToken::Uninitialized:
364            ASSERT_NOT_REACHED();
365            break;
366        case HTMLToken::DOCTYPE:
367            m_name = AtomicString(token.name().data(), token.name().size());
368            m_doctypeData = token.m_doctypeData.release();
369            break;
370        case HTMLToken::EndOfFile:
371            break;
372        case HTMLToken::StartTag:
373        case HTMLToken::EndTag: {
374            m_selfClosing = token.selfClosing();
375            m_name = AtomicString(token.name().data(), token.name().size());
376            const HTMLToken::AttributeList& attributes = token.attributes();
377            for (HTMLToken::AttributeList::const_iterator iter = attributes.begin();
378                 iter != attributes.end(); ++iter) {
379                if (!iter->m_name.isEmpty()) {
380                    String name(iter->m_name.data(), iter->m_name.size());
381                    String value(iter->m_value.data(), iter->m_value.size());
382                    ASSERT(iter->m_nameRange.m_start);
383                    ASSERT(iter->m_nameRange.m_end);
384                    ASSERT(iter->m_valueRange.m_start);
385                    ASSERT(iter->m_valueRange.m_end);
386                    RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value);
387                    if (!m_attributes) {
388                        m_attributes = NamedNodeMap::create();
389                        // Reserving capacity here improves the parser
390                        // benchmark.  It might be worth experimenting with
391                        // the constant to see where the optimal point is.
392                        m_attributes->reserveInitialCapacity(10);
393                    }
394                    m_attributes->insertAttribute(mappedAttribute.release(), false);
395                }
396            }
397            break;
398        }
399        case HTMLToken::Comment:
400            m_data = String(token.comment().data(), token.comment().size());
401            break;
402        case HTMLToken::Character:
403            m_externalCharacters = &token.characters();
404            break;
405        }
406    }
407
408    AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
409        : m_type(type)
410        , m_name(name)
411        , m_attributes(attributes)
412    {
413        ASSERT(usesName());
414    }
415
416    HTMLToken::Type type() const { return m_type; }
417
418    const AtomicString& name() const
419    {
420        ASSERT(usesName());
421        return m_name;
422    }
423
424    void setName(const AtomicString& name)
425    {
426        ASSERT(usesName());
427        m_name = name;
428    }
429
430    bool selfClosing() const
431    {
432        ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
433        return m_selfClosing;
434    }
435
436    Attribute* getAttributeItem(const QualifiedName& attributeName)
437    {
438        ASSERT(usesAttributes());
439        if (!m_attributes)
440            return 0;
441        return m_attributes->getAttributeItem(attributeName);
442    }
443
444    NamedNodeMap* attributes() const
445    {
446        ASSERT(usesAttributes());
447        return m_attributes.get();
448    }
449
450    PassRefPtr<NamedNodeMap> takeAtributes()
451    {
452        ASSERT(usesAttributes());
453        return m_attributes.release();
454    }
455
456    const HTMLToken::DataVector& characters() const
457    {
458        ASSERT(m_type == HTMLToken::Character);
459        return *m_externalCharacters;
460    }
461
462    const String& comment() const
463    {
464        ASSERT(m_type == HTMLToken::Comment);
465        return m_data;
466    }
467
468    // FIXME: Distinguish between a missing public identifer and an empty one.
469    WTF::Vector<UChar>& publicIdentifier() const
470    {
471        ASSERT(m_type == HTMLToken::DOCTYPE);
472        return m_doctypeData->m_publicIdentifier;
473    }
474
475    // FIXME: Distinguish between a missing system identifer and an empty one.
476    WTF::Vector<UChar>& systemIdentifier() const
477    {
478        ASSERT(m_type == HTMLToken::DOCTYPE);
479        return m_doctypeData->m_systemIdentifier;
480    }
481
482    bool forceQuirks() const
483    {
484        ASSERT(m_type == HTMLToken::DOCTYPE);
485        return m_doctypeData->m_forceQuirks;
486    }
487
488private:
489    HTMLToken::Type m_type;
490
491    bool usesName() const
492    {
493        return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
494    }
495
496    bool usesAttributes() const
497    {
498        return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
499    }
500
501    // "name" for DOCTYPE, StartTag, and EndTag
502    AtomicString m_name;
503
504    // "data" for Comment
505    String m_data;
506
507    // "characters" for Character
508    //
509    // We don't want to copy the the characters out of the HTMLToken, so we
510    // keep a pointer to its buffer instead.  This buffer is owned by the
511    // HTMLToken and causes a lifetime dependence between these objects.
512    //
513    // FIXME: Add a mechanism for "internalizing" the characters when the
514    //        HTMLToken is destructed.
515    const HTMLToken::DataVector* m_externalCharacters;
516
517    // For DOCTYPE
518    OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
519
520    // For StartTag and EndTag
521    bool m_selfClosing;
522
523    RefPtr<NamedNodeMap> m_attributes;
524};
525
526}
527
528#endif
529