HTMLToken.h revision 2fc2651226baac27029e38c9d6ef883fa32084db
1/*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef HTMLToken_h
27#define HTMLToken_h
28
29#include "NamedNodeMap.h"
30#include <wtf/PassOwnPtr.h>
31#include <wtf/Vector.h>
32
33namespace WebCore {
34
35class HTMLToken {
36    WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
37public:
38    enum Type {
39        Uninitialized,
40        DOCTYPE,
41        StartTag,
42        EndTag,
43        Comment,
44        Character,
45        EndOfFile,
46    };
47
48    class Range {
49    public:
50        int m_start;
51        int m_end;
52    };
53
54    class Attribute {
55    public:
56        Range m_nameRange;
57        Range m_valueRange;
58        WTF::Vector<UChar, 32> m_name;
59        WTF::Vector<UChar, 32> m_value;
60    };
61
62    typedef WTF::Vector<Attribute, 10> AttributeList;
63    typedef WTF::Vector<UChar, 1024> DataVector;
64
65    HTMLToken() { clear(); }
66
67    void clear()
68    {
69        m_type = Uninitialized;
70        m_range.m_start = 0;
71        m_range.m_end = 0;
72        m_baseOffset = 0;
73        m_data.clear();
74    }
75
76    int startIndex() const { return m_range.m_start; }
77    int endIndex() const { return m_range.m_end; }
78
79    void setBaseOffset(int offset)
80    {
81        m_baseOffset = offset;
82    }
83
84    void end(int endOffset)
85    {
86        m_range.m_end = endOffset - m_baseOffset;
87    }
88
89    void makeEndOfFile()
90    {
91        ASSERT(m_type == Uninitialized);
92        m_type = EndOfFile;
93    }
94
95    void beginStartTag(UChar character)
96    {
97        ASSERT(character);
98        ASSERT(m_type == Uninitialized);
99        m_type = StartTag;
100        m_selfClosing = false;
101        m_currentAttribute = 0;
102        m_attributes.clear();
103
104        m_data.append(character);
105    }
106
107    template<typename T>
108    void beginEndTag(T characters)
109    {
110        ASSERT(m_type == Uninitialized);
111        m_type = EndTag;
112        m_selfClosing = false;
113        m_currentAttribute = 0;
114        m_attributes.clear();
115
116        m_data.append(characters);
117    }
118
119    // Starting a character token works slightly differently than starting
120    // other types of tokens because we want to save a per-character branch.
121    void ensureIsCharacterToken()
122    {
123        ASSERT(m_type == Uninitialized || m_type == Character);
124        m_type = Character;
125    }
126
127    void beginComment()
128    {
129        ASSERT(m_type == Uninitialized);
130        m_type = Comment;
131    }
132
133    void beginDOCTYPE()
134    {
135        ASSERT(m_type == Uninitialized);
136        m_type = DOCTYPE;
137        m_doctypeData = adoptPtr(new DoctypeData());
138    }
139
140    void beginDOCTYPE(UChar character)
141    {
142        ASSERT(character);
143        beginDOCTYPE();
144        m_data.append(character);
145    }
146
147    void appendToName(UChar character)
148    {
149        ASSERT(character);
150        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
151        m_data.append(character);
152    }
153
154    template<typename T>
155    void appendToCharacter(T characters)
156    {
157        ASSERT(m_type == Character);
158        m_data.append(characters);
159    }
160
161    void appendToComment(UChar character)
162    {
163        ASSERT(character);
164        ASSERT(m_type == Comment);
165        m_data.append(character);
166    }
167
168    void addNewAttribute()
169    {
170        ASSERT(m_type == StartTag || m_type == EndTag);
171        m_attributes.grow(m_attributes.size() + 1);
172        m_currentAttribute = &m_attributes.last();
173#ifndef NDEBUG
174        m_currentAttribute->m_nameRange.m_start = 0;
175        m_currentAttribute->m_nameRange.m_end = 0;
176        m_currentAttribute->m_valueRange.m_start = 0;
177        m_currentAttribute->m_valueRange.m_end = 0;
178#endif
179    }
180
181    void beginAttributeName(int offset)
182    {
183        m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
184    }
185
186    void endAttributeName(int offset)
187    {
188        int index = offset - m_baseOffset;
189        m_currentAttribute->m_nameRange.m_end = index;
190        m_currentAttribute->m_valueRange.m_start = index;
191        m_currentAttribute->m_valueRange.m_end = index;
192    }
193
194    void beginAttributeValue(int offset)
195    {
196        m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
197#ifndef NDEBUG
198        m_currentAttribute->m_valueRange.m_end = 0;
199#endif
200    }
201
202    void endAttributeValue(int offset)
203    {
204        m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
205    }
206
207    void appendToAttributeName(UChar character)
208    {
209        ASSERT(character);
210        ASSERT(m_type == StartTag || m_type == EndTag);
211        ASSERT(m_currentAttribute->m_nameRange.m_start);
212        m_currentAttribute->m_name.append(character);
213    }
214
215    void appendToAttributeValue(UChar character)
216    {
217        ASSERT(character);
218        ASSERT(m_type == StartTag || m_type == EndTag);
219        ASSERT(m_currentAttribute->m_valueRange.m_start);
220        m_currentAttribute->m_value.append(character);
221    }
222
223    void appendToAttributeValue(size_t i, const String& value)
224    {
225        ASSERT(!value.isEmpty());
226        ASSERT(m_type == StartTag || m_type == EndTag);
227        m_attributes[i].m_value.append(value.characters(), value.length());
228    }
229
230    Type type() const { return m_type; }
231
232    bool selfClosing() const
233    {
234        ASSERT(m_type == StartTag || m_type == EndTag);
235        return m_selfClosing;
236    }
237
238    void setSelfClosing()
239    {
240        ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
241        m_selfClosing = true;
242    }
243
244    const AttributeList& attributes() const
245    {
246        ASSERT(m_type == StartTag || m_type == EndTag);
247        return m_attributes;
248    }
249
250    const DataVector& name() const
251    {
252        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
253        return m_data;
254    }
255
256    void eraseCharacters()
257    {
258        ASSERT(m_type == Character);
259        m_data.clear();
260    }
261
262    void eraseValueOfAttribute(size_t i)
263    {
264        ASSERT(m_type == StartTag || m_type == EndTag);
265        m_attributes[i].m_value.clear();
266    }
267
268    const DataVector& characters() const
269    {
270        ASSERT(m_type == Character);
271        return m_data;
272    }
273
274    const DataVector& comment() const
275    {
276        ASSERT(m_type == Comment);
277        return m_data;
278    }
279
280    // FIXME: Distinguish between a missing public identifer and an empty one.
281    const WTF::Vector<UChar>& publicIdentifier() const
282    {
283        ASSERT(m_type == DOCTYPE);
284        return m_doctypeData->m_publicIdentifier;
285    }
286
287    // FIXME: Distinguish between a missing system identifer and an empty one.
288    const WTF::Vector<UChar>& systemIdentifier() const
289    {
290        ASSERT(m_type == DOCTYPE);
291        return m_doctypeData->m_systemIdentifier;
292    }
293
294    void setPublicIdentifierToEmptyString()
295    {
296        ASSERT(m_type == DOCTYPE);
297        m_doctypeData->m_hasPublicIdentifier = true;
298        m_doctypeData->m_publicIdentifier.clear();
299    }
300
301    void setSystemIdentifierToEmptyString()
302    {
303        ASSERT(m_type == DOCTYPE);
304        m_doctypeData->m_hasSystemIdentifier = true;
305        m_doctypeData->m_systemIdentifier.clear();
306    }
307
308    bool forceQuirks() const
309    {
310        ASSERT(m_type == DOCTYPE);
311        return m_doctypeData->m_forceQuirks;
312    }
313
314    void setForceQuirks()
315    {
316        ASSERT(m_type == DOCTYPE);
317        m_doctypeData->m_forceQuirks = true;
318    }
319
320    void appendToPublicIdentifier(UChar character)
321    {
322        ASSERT(character);
323        ASSERT(m_type == DOCTYPE);
324        ASSERT(m_doctypeData->m_hasPublicIdentifier);
325        m_doctypeData->m_publicIdentifier.append(character);
326    }
327
328    void appendToSystemIdentifier(UChar character)
329    {
330        ASSERT(character);
331        ASSERT(m_type == DOCTYPE);
332        ASSERT(m_doctypeData->m_hasSystemIdentifier);
333        m_doctypeData->m_systemIdentifier.append(character);
334    }
335
336private:
337    // FIXME: I'm not sure what the final relationship between HTMLToken and
338    // AtomicHTMLToken will be.  I'm marking this a friend for now, but we'll
339    // want to end up with a cleaner interface between the two classes.
340    friend class AtomicHTMLToken;
341
342    class DoctypeData {
343        WTF_MAKE_NONCOPYABLE(DoctypeData);
344    public:
345        DoctypeData()
346            : m_hasPublicIdentifier(false)
347            , m_hasSystemIdentifier(false)
348            , m_forceQuirks(false)
349        {
350        }
351
352        bool m_hasPublicIdentifier;
353        bool m_hasSystemIdentifier;
354        bool m_forceQuirks;
355        WTF::Vector<UChar> m_publicIdentifier;
356        WTF::Vector<UChar> m_systemIdentifier;
357    };
358
359    Type m_type;
360    Range m_range; // Always starts at zero.
361    int m_baseOffset;
362
363    // "name" for DOCTYPE, StartTag, and EndTag
364    // "characters" for Character
365    // "data" for Comment
366    DataVector m_data;
367
368    // For DOCTYPE
369    OwnPtr<DoctypeData> m_doctypeData;
370
371    // For StartTag and EndTag
372    bool m_selfClosing;
373    AttributeList m_attributes;
374
375    // A pointer into m_attributes used during lexing.
376    Attribute* m_currentAttribute;
377};
378
// FIXME: This class should eventually be named HTMLToken once we move the
// existing HTMLToken to be internal to the HTMLTokenizer.
381class AtomicHTMLToken {
382    WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
383public:
384    AtomicHTMLToken(HTMLToken& token)
385        : m_type(token.type())
386    {
387        switch (m_type) {
388        case HTMLToken::Uninitialized:
389            ASSERT_NOT_REACHED();
390            break;
391        case HTMLToken::DOCTYPE:
392            m_name = AtomicString(token.name().data(), token.name().size());
393            m_doctypeData = token.m_doctypeData.release();
394            break;
395        case HTMLToken::EndOfFile:
396            break;
397        case HTMLToken::StartTag:
398        case HTMLToken::EndTag: {
399            m_selfClosing = token.selfClosing();
400            m_name = AtomicString(token.name().data(), token.name().size());
401            const HTMLToken::AttributeList& attributes = token.attributes();
402            for (HTMLToken::AttributeList::const_iterator iter = attributes.begin();
403                 iter != attributes.end(); ++iter) {
404                if (!iter->m_name.isEmpty()) {
405                    String name(iter->m_name.data(), iter->m_name.size());
406                    String value(iter->m_value.data(), iter->m_value.size());
407                    ASSERT(iter->m_nameRange.m_start);
408                    ASSERT(iter->m_nameRange.m_end);
409                    ASSERT(iter->m_valueRange.m_start);
410                    ASSERT(iter->m_valueRange.m_end);
411                    RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value);
412                    if (!m_attributes) {
413                        m_attributes = NamedNodeMap::create();
414                        // Reserving capacity here improves the parser
415                        // benchmark.  It might be worth experimenting with
416                        // the constant to see where the optimal point is.
417                        m_attributes->reserveInitialCapacity(10);
418                    }
419                    m_attributes->insertAttribute(mappedAttribute.release(), false);
420                }
421            }
422            break;
423        }
424        case HTMLToken::Comment:
425            m_data = String(token.comment().data(), token.comment().size());
426            break;
427        case HTMLToken::Character:
428            m_externalCharacters = &token.characters();
429            break;
430        }
431    }
432
433    AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
434        : m_type(type)
435        , m_name(name)
436        , m_attributes(attributes)
437    {
438        ASSERT(usesName());
439    }
440
441    HTMLToken::Type type() const { return m_type; }
442
443    const AtomicString& name() const
444    {
445        ASSERT(usesName());
446        return m_name;
447    }
448
449    void setName(const AtomicString& name)
450    {
451        ASSERT(usesName());
452        m_name = name;
453    }
454
455    bool selfClosing() const
456    {
457        ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
458        return m_selfClosing;
459    }
460
461    Attribute* getAttributeItem(const QualifiedName& attributeName)
462    {
463        ASSERT(usesAttributes());
464        if (!m_attributes)
465            return 0;
466        return m_attributes->getAttributeItem(attributeName);
467    }
468
469    NamedNodeMap* attributes() const
470    {
471        ASSERT(usesAttributes());
472        return m_attributes.get();
473    }
474
475    PassRefPtr<NamedNodeMap> takeAtributes()
476    {
477        ASSERT(usesAttributes());
478        return m_attributes.release();
479    }
480
481    const HTMLToken::DataVector& characters() const
482    {
483        ASSERT(m_type == HTMLToken::Character);
484        return *m_externalCharacters;
485    }
486
487    const String& comment() const
488    {
489        ASSERT(m_type == HTMLToken::Comment);
490        return m_data;
491    }
492
493    // FIXME: Distinguish between a missing public identifer and an empty one.
494    WTF::Vector<UChar>& publicIdentifier() const
495    {
496        ASSERT(m_type == HTMLToken::DOCTYPE);
497        return m_doctypeData->m_publicIdentifier;
498    }
499
500    // FIXME: Distinguish between a missing system identifer and an empty one.
501    WTF::Vector<UChar>& systemIdentifier() const
502    {
503        ASSERT(m_type == HTMLToken::DOCTYPE);
504        return m_doctypeData->m_systemIdentifier;
505    }
506
507    bool forceQuirks() const
508    {
509        ASSERT(m_type == HTMLToken::DOCTYPE);
510        return m_doctypeData->m_forceQuirks;
511    }
512
513private:
514    HTMLToken::Type m_type;
515
516    bool usesName() const
517    {
518        return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
519    }
520
521    bool usesAttributes() const
522    {
523        return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
524    }
525
526    // "name" for DOCTYPE, StartTag, and EndTag
527    AtomicString m_name;
528
529    // "data" for Comment
530    String m_data;
531
532    // "characters" for Character
533    //
534    // We don't want to copy the the characters out of the HTMLToken, so we
535    // keep a pointer to its buffer instead.  This buffer is owned by the
536    // HTMLToken and causes a lifetime dependence between these objects.
537    //
538    // FIXME: Add a mechanism for "internalizing" the characters when the
539    //        HTMLToken is destructed.
540    const HTMLToken::DataVector* m_externalCharacters;
541
542    // For DOCTYPE
543    OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
544
545    // For StartTag and EndTag
546    bool m_selfClosing;
547
548    RefPtr<NamedNodeMap> m_attributes;
549};
550
551}
552
553#endif
554