1/*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef HTMLToken_h
27#define HTMLToken_h
28
29#include "core/dom/Attribute.h"
30#include "wtf/PassOwnPtr.h"
31#include "wtf/RefCounted.h"
32#include "wtf/RefPtr.h"
33
34namespace blink {
35
36class DoctypeData {
37    WTF_MAKE_NONCOPYABLE(DoctypeData);
38public:
39    DoctypeData()
40        : m_hasPublicIdentifier(false)
41        , m_hasSystemIdentifier(false)
42        , m_forceQuirks(false)
43    {
44    }
45
46    bool m_hasPublicIdentifier;
47    bool m_hasSystemIdentifier;
48    WTF::Vector<UChar> m_publicIdentifier;
49    WTF::Vector<UChar> m_systemIdentifier;
50    bool m_forceQuirks;
51};
52
53static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
54{
55    for (unsigned i = 0; i < attributes.size(); ++i) {
56        if (attributes.at(i).name().matches(name))
57            return &attributes.at(i);
58    }
59    return 0;
60}
61
62class HTMLToken {
63    WTF_MAKE_NONCOPYABLE(HTMLToken);
64    WTF_MAKE_FAST_ALLOCATED;
65public:
66    enum Type {
67        Uninitialized,
68        DOCTYPE,
69        StartTag,
70        EndTag,
71        Comment,
72        Character,
73        EndOfFile,
74    };
75
76    class Attribute {
77    public:
78        class Range {
79        public:
80            int start;
81            int end;
82        };
83
84        Range nameRange;
85        Range valueRange;
86        Vector<UChar, 32> name;
87        Vector<UChar, 32> value;
88    };
89
90    typedef Vector<Attribute, 10> AttributeList;
91
92    // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
93    // approximately 99% of the time based on a non-scientific browse around a number of
94    // popular web sites on 23 May 2013.
95    typedef Vector<UChar, 256> DataVector;
96
97    HTMLToken() { clear(); }
98
99    void clear()
100    {
101        m_type = Uninitialized;
102        m_range.start = 0;
103        m_range.end = 0;
104        m_baseOffset = 0;
105        // Don't call Vector::clear() as that would destroy the
106        // alloced VectorBuffer. If the innerHTML'd content has
107        // two 257 character text nodes in a row, we'll needlessly
108        // thrash malloc. When we finally finish the parse the
109        // HTMLToken will be destroyed and the VectorBuffer released.
110        m_data.shrink(0);
111        m_orAllData = 0;
112    }
113
114    bool isUninitialized() { return m_type == Uninitialized; }
115    Type type() const { return m_type; }
116
117    void makeEndOfFile()
118    {
119        ASSERT(m_type == Uninitialized);
120        m_type = EndOfFile;
121    }
122
123    /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
124    int startIndex() const { return m_range.start; }
125    int endIndex() const { return m_range.end; }
126
127    void setBaseOffset(int offset)
128    {
129        m_baseOffset = offset;
130    }
131
132    void end(int endOffset)
133    {
134        m_range.end = endOffset - m_baseOffset;
135    }
136
137    const DataVector& data() const
138    {
139        ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
140        return m_data;
141    }
142
143    bool isAll8BitData() const
144    {
145        return (m_orAllData <= 0xff);
146    }
147
148    const DataVector& name() const
149    {
150        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
151        return m_data;
152    }
153
154    void appendToName(UChar character)
155    {
156        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
157        ASSERT(character);
158        m_data.append(character);
159        m_orAllData |= character;
160    }
161
162    /* DOCTYPE Tokens */
163
164    bool forceQuirks() const
165    {
166        ASSERT(m_type == DOCTYPE);
167        return m_doctypeData->m_forceQuirks;
168    }
169
170    void setForceQuirks()
171    {
172        ASSERT(m_type == DOCTYPE);
173        m_doctypeData->m_forceQuirks = true;
174    }
175
176    void beginDOCTYPE()
177    {
178        ASSERT(m_type == Uninitialized);
179        m_type = DOCTYPE;
180        m_doctypeData = adoptPtr(new DoctypeData);
181    }
182
183    void beginDOCTYPE(UChar character)
184    {
185        ASSERT(character);
186        beginDOCTYPE();
187        m_data.append(character);
188        m_orAllData |= character;
189    }
190
191    // FIXME: Distinguish between a missing public identifer and an empty one.
192    const WTF::Vector<UChar>& publicIdentifier() const
193    {
194        ASSERT(m_type == DOCTYPE);
195        return m_doctypeData->m_publicIdentifier;
196    }
197
198    // FIXME: Distinguish between a missing system identifer and an empty one.
199    const WTF::Vector<UChar>& systemIdentifier() const
200    {
201        ASSERT(m_type == DOCTYPE);
202        return m_doctypeData->m_systemIdentifier;
203    }
204
205    void setPublicIdentifierToEmptyString()
206    {
207        ASSERT(m_type == DOCTYPE);
208        m_doctypeData->m_hasPublicIdentifier = true;
209        m_doctypeData->m_publicIdentifier.clear();
210    }
211
212    void setSystemIdentifierToEmptyString()
213    {
214        ASSERT(m_type == DOCTYPE);
215        m_doctypeData->m_hasSystemIdentifier = true;
216        m_doctypeData->m_systemIdentifier.clear();
217    }
218
219    void appendToPublicIdentifier(UChar character)
220    {
221        ASSERT(character);
222        ASSERT(m_type == DOCTYPE);
223        ASSERT(m_doctypeData->m_hasPublicIdentifier);
224        m_doctypeData->m_publicIdentifier.append(character);
225    }
226
227    void appendToSystemIdentifier(UChar character)
228    {
229        ASSERT(character);
230        ASSERT(m_type == DOCTYPE);
231        ASSERT(m_doctypeData->m_hasSystemIdentifier);
232        m_doctypeData->m_systemIdentifier.append(character);
233    }
234
235    PassOwnPtr<DoctypeData> releaseDoctypeData()
236    {
237        return m_doctypeData.release();
238    }
239
240    /* Start/End Tag Tokens */
241
242    bool selfClosing() const
243    {
244        ASSERT(m_type == StartTag || m_type == EndTag);
245        return m_selfClosing;
246    }
247
248    void setSelfClosing()
249    {
250        ASSERT(m_type == StartTag || m_type == EndTag);
251        m_selfClosing = true;
252    }
253
254    void beginStartTag(UChar character)
255    {
256        ASSERT(character);
257        ASSERT(m_type == Uninitialized);
258        m_type = StartTag;
259        m_selfClosing = false;
260        m_currentAttribute = 0;
261        m_attributes.clear();
262
263        m_data.append(character);
264        m_orAllData |= character;
265    }
266
267    void beginEndTag(LChar character)
268    {
269        ASSERT(m_type == Uninitialized);
270        m_type = EndTag;
271        m_selfClosing = false;
272        m_currentAttribute = 0;
273        m_attributes.clear();
274
275        m_data.append(character);
276    }
277
278    void beginEndTag(const Vector<LChar, 32>& characters)
279    {
280        ASSERT(m_type == Uninitialized);
281        m_type = EndTag;
282        m_selfClosing = false;
283        m_currentAttribute = 0;
284        m_attributes.clear();
285
286        m_data.appendVector(characters);
287    }
288
289    void addNewAttribute()
290    {
291        ASSERT(m_type == StartTag || m_type == EndTag);
292        m_attributes.grow(m_attributes.size() + 1);
293        m_currentAttribute = &m_attributes.last();
294#if ENABLE(ASSERT)
295        m_currentAttribute->nameRange.start = 0;
296        m_currentAttribute->nameRange.end = 0;
297        m_currentAttribute->valueRange.start = 0;
298        m_currentAttribute->valueRange.end = 0;
299#endif
300    }
301
302    void beginAttributeName(int offset)
303    {
304        m_currentAttribute->nameRange.start = offset - m_baseOffset;
305    }
306
307    void endAttributeName(int offset)
308    {
309        int index = offset - m_baseOffset;
310        m_currentAttribute->nameRange.end = index;
311        m_currentAttribute->valueRange.start = index;
312        m_currentAttribute->valueRange.end = index;
313    }
314
315    void beginAttributeValue(int offset)
316    {
317        m_currentAttribute->valueRange.start = offset - m_baseOffset;
318#if ENABLE(ASSERT)
319        m_currentAttribute->valueRange.end = 0;
320#endif
321    }
322
323    void endAttributeValue(int offset)
324    {
325        m_currentAttribute->valueRange.end = offset - m_baseOffset;
326    }
327
328    void appendToAttributeName(UChar character)
329    {
330        ASSERT(character);
331        ASSERT(m_type == StartTag || m_type == EndTag);
332        ASSERT(m_currentAttribute->nameRange.start);
333        m_currentAttribute->name.append(character);
334    }
335
336    void appendToAttributeValue(UChar character)
337    {
338        ASSERT(character);
339        ASSERT(m_type == StartTag || m_type == EndTag);
340        ASSERT(m_currentAttribute->valueRange.start);
341        m_currentAttribute->value.append(character);
342    }
343
344    void appendToAttributeValue(size_t i, const String& value)
345    {
346        ASSERT(!value.isEmpty());
347        ASSERT(m_type == StartTag || m_type == EndTag);
348        append(m_attributes[i].value, value);
349    }
350
351    const AttributeList& attributes() const
352    {
353        ASSERT(m_type == StartTag || m_type == EndTag);
354        return m_attributes;
355    }
356
357    const Attribute* getAttributeItem(const QualifiedName& name) const
358    {
359        for (unsigned i = 0; i < m_attributes.size(); ++i) {
360            if (AtomicString(m_attributes.at(i).name) == name.localName())
361                return &m_attributes.at(i);
362        }
363        return 0;
364    }
365
366    // Used by the XSSAuditor to nuke XSS-laden attributes.
367    void eraseValueOfAttribute(size_t i)
368    {
369        ASSERT(m_type == StartTag || m_type == EndTag);
370        m_attributes[i].value.clear();
371    }
372
373    /* Character Tokens */
374
375    // Starting a character token works slightly differently than starting
376    // other types of tokens because we want to save a per-character branch.
377    void ensureIsCharacterToken()
378    {
379        ASSERT(m_type == Uninitialized || m_type == Character);
380        m_type = Character;
381    }
382
383    const DataVector& characters() const
384    {
385        ASSERT(m_type == Character);
386        return m_data;
387    }
388
389    void appendToCharacter(char character)
390    {
391        ASSERT(m_type == Character);
392        m_data.append(character);
393    }
394
395    void appendToCharacter(UChar character)
396    {
397        ASSERT(m_type == Character);
398        m_data.append(character);
399        m_orAllData |= character;
400    }
401
402    void appendToCharacter(const Vector<LChar, 32>& characters)
403    {
404        ASSERT(m_type == Character);
405        m_data.appendVector(characters);
406    }
407
408    /* Comment Tokens */
409
410    const DataVector& comment() const
411    {
412        ASSERT(m_type == Comment);
413        return m_data;
414    }
415
416    void beginComment()
417    {
418        ASSERT(m_type == Uninitialized);
419        m_type = Comment;
420    }
421
422    void appendToComment(UChar character)
423    {
424        ASSERT(character);
425        ASSERT(m_type == Comment);
426        m_data.append(character);
427        m_orAllData |= character;
428    }
429
430    // Only for XSSAuditor
431    void eraseCharacters()
432    {
433        ASSERT(m_type == Character);
434        m_data.clear();
435        m_orAllData = 0;
436    }
437
438private:
439    Type m_type;
440    Attribute::Range m_range; // Always starts at zero.
441    int m_baseOffset;
442    DataVector m_data;
443    UChar m_orAllData;
444
445    // For StartTag and EndTag
446    bool m_selfClosing;
447    AttributeList m_attributes;
448
449    // A pointer into m_attributes used during lexing.
450    Attribute* m_currentAttribute;
451
452    // For DOCTYPE
453    OwnPtr<DoctypeData> m_doctypeData;
454};
455
456}
457
458#endif
459