1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLTokenizer.h"
30
31#include "HTMLEntityParser.h"
32#include "HTMLToken.h"
33#include "HTMLTreeBuilder.h"
34#include "HTMLNames.h"
35#include "NotImplemented.h"
36#include <wtf/ASCIICType.h>
37#include <wtf/CurrentTime.h>
38#include <wtf/UnusedParam.h>
39#include <wtf/text/AtomicString.h>
40#include <wtf/text/CString.h>
41#include <wtf/unicode/Unicode.h>
42
43using namespace WTF;
44
45namespace WebCore {
46
47using namespace HTMLNames;
48
// Out-of-class definition of the preprocessor's end-of-file sentinel. The
// tokenizer state machine compares the current input character against this
// value (0) to detect the end of the input.
49const UChar HTMLTokenizer::InputStreamPreprocessor::endOfFileMarker = 0;
50
51namespace {
52
53inline UChar toLowerCase(UChar cc)
54{
55    ASSERT(isASCIIUpper(cc));
56    const int lowerCaseOffset = 0x20;
57    return cc + lowerCaseOffset;
58}
59
60inline bool isTokenizerWhitespace(UChar cc)
61{
62    return cc == ' ' || cc == '\x0A' || cc == '\x09' || cc == '\x0C';
63}
64
65inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
66{
67    while (*expectedCharacters)
68        source.advanceAndASSERTIgnoringCase(*expectedCharacters++);
69}
70
71inline void advanceStringAndASSERT(SegmentedString& source, const char* expectedCharacters)
72{
73    while (*expectedCharacters)
74        source.advanceAndASSERT(*expectedCharacters++);
75}
76
77inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
78{
79    if (vector.size() != string.length())
80        return false;
81    const UChar* stringData = string.characters();
82    const UChar* vectorData = vector.data();
83    // FIXME: Is there a higher-level function we should be calling here?
84    return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
85}
86
87inline bool isEndTagBufferingState(HTMLTokenizer::State state)
88{
89    switch (state) {
90    case HTMLTokenizer::RCDATAEndTagOpenState:
91    case HTMLTokenizer::RCDATAEndTagNameState:
92    case HTMLTokenizer::RAWTEXTEndTagOpenState:
93    case HTMLTokenizer::RAWTEXTEndTagNameState:
94    case HTMLTokenizer::ScriptDataEndTagOpenState:
95    case HTMLTokenizer::ScriptDataEndTagNameState:
96    case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
97    case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
98        return true;
99    default:
100        return false;
101    }
102}
103
104}
105
106HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks)
107    : m_inputStreamPreprocessor(this)
108    , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks)
109{
110    reset();
111}
112
113HTMLTokenizer::~HTMLTokenizer()
114{
115}
116
117void HTMLTokenizer::reset()
118{
119    m_state = DataState;
120    m_token = 0;
121    m_lineNumber = 0;
122    m_skipLeadingNewLineForListing = false;
123    m_forceNullCharacterReplacement = false;
124    m_shouldAllowCDATA = false;
125    m_additionalAllowedCharacter = '\0';
126}
127
128inline bool HTMLTokenizer::processEntity(SegmentedString& source)
129{
130    bool notEnoughCharacters = false;
131    Vector<UChar, 16> decodedEntity;
132    bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
133    if (notEnoughCharacters)
134        return false;
135    if (!success) {
136        ASSERT(decodedEntity.isEmpty());
137        bufferCharacter('&');
138    } else {
139        Vector<UChar>::const_iterator iter = decodedEntity.begin();
140        for (; iter != decodedEntity.end(); ++iter)
141            bufferCharacter(*iter);
142    }
143    return true;
144}
145
146#if COMPILER(MSVC)
147// We need to disable the "unreachable code" warning because we want to assert
148// that some code points aren't reached in the state machine.
149#pragma warning(disable: 4702)
150#endif
151
// BEGIN_STATE(stateName) opens a state of the tokenizer state machine. It
// expands to both a switch case label and a goto label of the same name, so a
// state can be entered either through the switch on m_state or through a
// goto issued by RECONSUME_IN / ADVANCE_TO / SWITCH_TO.
//
// END_STATE() closes a state. Every path through a state body is expected to
// leave via return or goto, so falling off the end trips
// ASSERT_NOT_REACHED(); the trailing break keeps the switch well-formed.
152#define BEGIN_STATE(stateName) case stateName: stateName:
153#define END_STATE() ASSERT_NOT_REACHED(); break;
154
// RECONSUME_IN(stateName): records stateName in m_state and jumps straight to
// that state's label. Nothing is read from the input, so the target state
// sees the same cc. The do/while(false) wrapper makes the expansion a single
// statement, so it can be used in an unbraced if/else arm.
155// We use this macro when the HTML5 spec says "reconsume the current input
156// character in the <mumble> state."
157#define RECONSUME_IN(stateName)                                            \
158    do {                                                                   \
159        m_state = stateName;                                               \
160        goto stateName;                                                    \
161    } while (false)
162
// ADVANCE_TO(stateName): records stateName in m_state, then asks the input
// stream preprocessor to advance |source|. If advance() returns false, the
// enclosing function returns haveBufferedCharacterToken() with m_state
// already updated. Otherwise cc is reloaded from the preprocessor and control
// jumps to the state's label. Expects |source| and |cc| to be in scope at the
// expansion site.
163// We use this macro when the HTML5 spec says "consume the next input
164// character ... and switch to the <mumble> state."
165#define ADVANCE_TO(stateName)                                              \
166    do {                                                                   \
167        m_state = stateName;                                               \
168        if (!m_inputStreamPreprocessor.advance(source, m_lineNumber))      \
169            return haveBufferedCharacterToken();                           \
170        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
171        goto stateName;                                                    \
172    } while (false)
173
// SWITCH_TO(stateName): records stateName in m_state without advancing
// |source|. If |source| is empty or the preprocessor's peek() returns false,
// the enclosing function returns haveBufferedCharacterToken(). Otherwise cc
// is reloaded from the preprocessor and control jumps to the state's label.
// Expects |source| and |cc| to be in scope at the expansion site.
174// Sometimes there's more complicated logic in the spec that separates when
175// we consume the next input character and when we switch to a particular
176// state. We handle those cases by advancing the source directly and using
177// this macro to switch to the indicated state.
178#define SWITCH_TO(stateName)                                               \
179    do {                                                                   \
180        m_state = stateName;                                               \
181        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
182            return haveBufferedCharacterToken();                           \
183        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
184        goto stateName;                                                    \
185    } while (false)
186
187
188inline void HTMLTokenizer::saveEndTagNameIfNeeded()
189{
190    ASSERT(m_token->type() != HTMLToken::Uninitialized);
191    if (m_token->type() == HTMLToken::StartTag)
192        m_appropriateEndTagName = m_token->name();
193}
194
195// We use this function when the HTML5 spec says "Emit the current <mumble>
196// token. Switch to the <mumble> state."  We use the word "resume" instead of
197// switch to indicate that this macro actually returns and that we'll end up
198// in the state when we "resume" (i.e., are called again).
199bool HTMLTokenizer::emitAndResumeIn(SegmentedString& source, State state)
200{
201    m_state = state;
202    source.advance(m_lineNumber);
203    saveEndTagNameIfNeeded();
204    return true;
205}
206
207// Identical to emitAndResumeIn, except does not advance.
208bool HTMLTokenizer::emitAndReconsumeIn(SegmentedString&, State state)
209{
210    m_state = state;
211    saveEndTagNameIfNeeded();
212    return true;
213}
214
215// Used to emit the EndOfFile token.
216// Check if we have buffered characters to emit first before emitting the EOF.
217bool HTMLTokenizer::emitEndOfFile(SegmentedString& source)
218{
219    if (haveBufferedCharacterToken())
220        return true;
221    m_state = DataState;
222    source.advance(m_lineNumber);
223    m_token->clear();
224    m_token->makeEndOfFile();
225    return true;
226}
227
228bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
229{
230    ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
231    source.advance(m_lineNumber);
232    if (m_token->type() == HTMLToken::Character)
233        return true;
234    m_token->beginEndTag(m_bufferedEndTagName);
235    m_bufferedEndTagName.clear();
236    return false;
237}
238
// FLUSH_AND_ADVANCE_TO(stateName): records stateName in m_state and calls
// flushBufferedEndTag(source), which consumes the current input character.
// If that call returns true, the enclosing function returns true. Otherwise,
// if |source| is empty or the preprocessor's peek() returns false, the
// enclosing function returns haveBufferedCharacterToken(). In the remaining
// case cc is reloaded from the preprocessor and control jumps to the state's
// label. Expects |source| and |cc| to be in scope at the expansion site.
239#define FLUSH_AND_ADVANCE_TO(stateName)                                    \
240    do {                                                                   \
241        m_state = stateName;                                               \
242        if (flushBufferedEndTag(source))                                   \
243            return true;                                                   \
244        if (source.isEmpty()                                               \
245            || !m_inputStreamPreprocessor.peek(source, m_lineNumber))      \
246            return haveBufferedCharacterToken();                           \
247        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
248        goto stateName;                                                    \
249    } while (false)
250
251bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, State state)
252{
253    m_state = state;
254    flushBufferedEndTag(source);
255    return true;
256}
257
258bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
259{
260    // If we have a token in progress, then we're supposed to be called back
261    // with the same token so we can finish it.
262    ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
263    m_token = &token;
264
265    if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
266        // FIXME: This should call flushBufferedEndTag().
267        // We started an end tag during our last iteration.
268        m_token->beginEndTag(m_bufferedEndTagName);
269        m_bufferedEndTagName.clear();
270        if (m_state == DataState) {
271            // We're back in the data state, so we must be done with the tag.
272            return true;
273        }
274    }
275
276    if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
277        return haveBufferedCharacterToken();
278    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
279
280    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
281    // Note that this logic is different than the generic \r\n collapsing
282    // handled in the input stream preprocessor. This logic is here as an
283    // "authoring convenience" so folks can write:
284    //
285    // <pre>
286    // lorem ipsum
287    // lorem ipsum
288    // </pre>
289    //
290    // without getting an extra newline at the start of their <pre> element.
291    if (m_skipLeadingNewLineForListing) {
292        m_skipLeadingNewLineForListing = false;
293        if (cc == '\n') {
294            if (m_state == DataState)
295                ADVANCE_TO(DataState);
296            if (m_state == RCDATAState)
297                ADVANCE_TO(RCDATAState);
298            // When parsing text/plain documents, we run the tokenizer in the
299            // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
300            ASSERT(m_state == PLAINTEXTState);
301        }
302    }
303
304    // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
305    switch (m_state) {
306    BEGIN_STATE(DataState) {
307        if (cc == '&')
308            ADVANCE_TO(CharacterReferenceInDataState);
309        else if (cc == '<') {
310            if (m_token->type() == HTMLToken::Character) {
311                // We have a bunch of character tokens queued up that we
312                // are emitting lazily here.
313                return true;
314            }
315            ADVANCE_TO(TagOpenState);
316        } else if (cc == InputStreamPreprocessor::endOfFileMarker)
317            return emitEndOfFile(source);
318        else {
319            bufferCharacter(cc);
320            ADVANCE_TO(DataState);
321        }
322    }
323    END_STATE()
324
325    BEGIN_STATE(CharacterReferenceInDataState) {
326        if (!processEntity(source))
327            return haveBufferedCharacterToken();
328        SWITCH_TO(DataState);
329    }
330    END_STATE()
331
332    BEGIN_STATE(RCDATAState) {
333        if (cc == '&')
334            ADVANCE_TO(CharacterReferenceInRCDATAState);
335        else if (cc == '<')
336            ADVANCE_TO(RCDATALessThanSignState);
337        else if (cc == InputStreamPreprocessor::endOfFileMarker)
338            return emitEndOfFile(source);
339        else {
340            bufferCharacter(cc);
341            ADVANCE_TO(RCDATAState);
342        }
343    }
344    END_STATE()
345
346    BEGIN_STATE(CharacterReferenceInRCDATAState) {
347        if (!processEntity(source))
348            return haveBufferedCharacterToken();
349        SWITCH_TO(RCDATAState);
350    }
351    END_STATE()
352
353    BEGIN_STATE(RAWTEXTState) {
354        if (cc == '<')
355            ADVANCE_TO(RAWTEXTLessThanSignState);
356        else if (cc == InputStreamPreprocessor::endOfFileMarker)
357            return emitEndOfFile(source);
358        else {
359            bufferCharacter(cc);
360            ADVANCE_TO(RAWTEXTState);
361        }
362    }
363    END_STATE()
364
365    BEGIN_STATE(ScriptDataState) {
366        if (cc == '<')
367            ADVANCE_TO(ScriptDataLessThanSignState);
368        else if (cc == InputStreamPreprocessor::endOfFileMarker)
369            return emitEndOfFile(source);
370        else {
371            bufferCharacter(cc);
372            ADVANCE_TO(ScriptDataState);
373        }
374    }
375    END_STATE()
376
377    BEGIN_STATE(PLAINTEXTState) {
378        if (cc == InputStreamPreprocessor::endOfFileMarker)
379            return emitEndOfFile(source);
380        else
381            bufferCharacter(cc);
382        ADVANCE_TO(PLAINTEXTState);
383    }
384    END_STATE()
385
386    BEGIN_STATE(TagOpenState) {
387        if (cc == '!')
388            ADVANCE_TO(MarkupDeclarationOpenState);
389        else if (cc == '/')
390            ADVANCE_TO(EndTagOpenState);
391        else if (isASCIIUpper(cc)) {
392            m_token->beginStartTag(toLowerCase(cc));
393            ADVANCE_TO(TagNameState);
394        } else if (isASCIILower(cc)) {
395            m_token->beginStartTag(cc);
396            ADVANCE_TO(TagNameState);
397        } else if (cc == '?') {
398            parseError();
399            // The spec consumes the current character before switching
400            // to the bogus comment state, but it's easier to implement
401            // if we reconsume the current character.
402            RECONSUME_IN(BogusCommentState);
403        } else {
404            parseError();
405            bufferCharacter('<');
406            RECONSUME_IN(DataState);
407        }
408    }
409    END_STATE()
410
411    BEGIN_STATE(EndTagOpenState) {
412        if (isASCIIUpper(cc)) {
413            m_token->beginEndTag(toLowerCase(cc));
414            ADVANCE_TO(TagNameState);
415        } else if (isASCIILower(cc)) {
416            m_token->beginEndTag(cc);
417            ADVANCE_TO(TagNameState);
418        } else if (cc == '>') {
419            parseError();
420            ADVANCE_TO(DataState);
421        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
422            parseError();
423            bufferCharacter('<');
424            bufferCharacter('/');
425            RECONSUME_IN(DataState);
426        } else {
427            parseError();
428            RECONSUME_IN(BogusCommentState);
429        }
430    }
431    END_STATE()
432
433    BEGIN_STATE(TagNameState) {
434        if (isTokenizerWhitespace(cc))
435            ADVANCE_TO(BeforeAttributeNameState);
436        else if (cc == '/')
437            ADVANCE_TO(SelfClosingStartTagState);
438        else if (cc == '>')
439            return emitAndResumeIn(source, DataState);
440        else if (m_usePreHTML5ParserQuirks && cc == '<')
441            return emitAndReconsumeIn(source, DataState);
442        else if (isASCIIUpper(cc)) {
443            m_token->appendToName(toLowerCase(cc));
444            ADVANCE_TO(TagNameState);
445        } if (cc == InputStreamPreprocessor::endOfFileMarker) {
446            parseError();
447            RECONSUME_IN(DataState);
448        } else {
449            m_token->appendToName(cc);
450            ADVANCE_TO(TagNameState);
451        }
452    }
453    END_STATE()
454
455    BEGIN_STATE(RCDATALessThanSignState) {
456        if (cc == '/') {
457            m_temporaryBuffer.clear();
458            ASSERT(m_bufferedEndTagName.isEmpty());
459            ADVANCE_TO(RCDATAEndTagOpenState);
460        } else {
461            bufferCharacter('<');
462            RECONSUME_IN(RCDATAState);
463        }
464    }
465    END_STATE()
466
467    BEGIN_STATE(RCDATAEndTagOpenState) {
468        if (isASCIIUpper(cc)) {
469            m_temporaryBuffer.append(cc);
470            addToPossibleEndTag(toLowerCase(cc));
471            ADVANCE_TO(RCDATAEndTagNameState);
472        } else if (isASCIILower(cc)) {
473            m_temporaryBuffer.append(cc);
474            addToPossibleEndTag(cc);
475            ADVANCE_TO(RCDATAEndTagNameState);
476        } else {
477            bufferCharacter('<');
478            bufferCharacter('/');
479            RECONSUME_IN(RCDATAState);
480        }
481    }
482    END_STATE()
483
484    BEGIN_STATE(RCDATAEndTagNameState) {
485        if (isASCIIUpper(cc)) {
486            m_temporaryBuffer.append(cc);
487            addToPossibleEndTag(toLowerCase(cc));
488            ADVANCE_TO(RCDATAEndTagNameState);
489        } else if (isASCIILower(cc)) {
490            m_temporaryBuffer.append(cc);
491            addToPossibleEndTag(cc);
492            ADVANCE_TO(RCDATAEndTagNameState);
493        } else {
494            if (isTokenizerWhitespace(cc)) {
495                if (isAppropriateEndTag())
496                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
497            } else if (cc == '/') {
498                if (isAppropriateEndTag())
499                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
500            } else if (cc == '>') {
501                if (isAppropriateEndTag())
502                    return flushEmitAndResumeIn(source, DataState);
503            }
504            bufferCharacter('<');
505            bufferCharacter('/');
506            m_token->appendToCharacter(m_temporaryBuffer);
507            m_bufferedEndTagName.clear();
508            RECONSUME_IN(RCDATAState);
509        }
510    }
511    END_STATE()
512
513    BEGIN_STATE(RAWTEXTLessThanSignState) {
514        if (cc == '/') {
515            m_temporaryBuffer.clear();
516            ASSERT(m_bufferedEndTagName.isEmpty());
517            ADVANCE_TO(RAWTEXTEndTagOpenState);
518        } else {
519            bufferCharacter('<');
520            RECONSUME_IN(RAWTEXTState);
521        }
522    }
523    END_STATE()
524
525    BEGIN_STATE(RAWTEXTEndTagOpenState) {
526        if (isASCIIUpper(cc)) {
527            m_temporaryBuffer.append(cc);
528            addToPossibleEndTag(toLowerCase(cc));
529            ADVANCE_TO(RAWTEXTEndTagNameState);
530        } else if (isASCIILower(cc)) {
531            m_temporaryBuffer.append(cc);
532            addToPossibleEndTag(cc);
533            ADVANCE_TO(RAWTEXTEndTagNameState);
534        } else {
535            bufferCharacter('<');
536            bufferCharacter('/');
537            RECONSUME_IN(RAWTEXTState);
538        }
539    }
540    END_STATE()
541
542    BEGIN_STATE(RAWTEXTEndTagNameState) {
543        if (isASCIIUpper(cc)) {
544            m_temporaryBuffer.append(cc);
545            addToPossibleEndTag(toLowerCase(cc));
546            ADVANCE_TO(RAWTEXTEndTagNameState);
547        } else if (isASCIILower(cc)) {
548            m_temporaryBuffer.append(cc);
549            addToPossibleEndTag(cc);
550            ADVANCE_TO(RAWTEXTEndTagNameState);
551        } else {
552            if (isTokenizerWhitespace(cc)) {
553                if (isAppropriateEndTag())
554                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
555            } else if (cc == '/') {
556                if (isAppropriateEndTag())
557                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
558            } else if (cc == '>') {
559                if (isAppropriateEndTag())
560                    return flushEmitAndResumeIn(source, DataState);
561            }
562            bufferCharacter('<');
563            bufferCharacter('/');
564            m_token->appendToCharacter(m_temporaryBuffer);
565            m_bufferedEndTagName.clear();
566            RECONSUME_IN(RAWTEXTState);
567        }
568    }
569    END_STATE()
570
571    BEGIN_STATE(ScriptDataLessThanSignState) {
572        if (cc == '/') {
573            m_temporaryBuffer.clear();
574            ASSERT(m_bufferedEndTagName.isEmpty());
575            ADVANCE_TO(ScriptDataEndTagOpenState);
576        } else if (cc == '!') {
577            bufferCharacter('<');
578            bufferCharacter('!');
579            ADVANCE_TO(ScriptDataEscapeStartState);
580        } else {
581            bufferCharacter('<');
582            RECONSUME_IN(ScriptDataState);
583        }
584    }
585    END_STATE()
586
587    BEGIN_STATE(ScriptDataEndTagOpenState) {
588        if (isASCIIUpper(cc)) {
589            m_temporaryBuffer.append(cc);
590            addToPossibleEndTag(toLowerCase(cc));
591            ADVANCE_TO(ScriptDataEndTagNameState);
592        } else if (isASCIILower(cc)) {
593            m_temporaryBuffer.append(cc);
594            addToPossibleEndTag(cc);
595            ADVANCE_TO(ScriptDataEndTagNameState);
596        } else {
597            bufferCharacter('<');
598            bufferCharacter('/');
599            RECONSUME_IN(ScriptDataState);
600        }
601    }
602    END_STATE()
603
604    BEGIN_STATE(ScriptDataEndTagNameState) {
605        if (isASCIIUpper(cc)) {
606            m_temporaryBuffer.append(cc);
607            addToPossibleEndTag(toLowerCase(cc));
608            ADVANCE_TO(ScriptDataEndTagNameState);
609        } else if (isASCIILower(cc)) {
610            m_temporaryBuffer.append(cc);
611            addToPossibleEndTag(cc);
612            ADVANCE_TO(ScriptDataEndTagNameState);
613        } else {
614            if (isTokenizerWhitespace(cc)) {
615                if (isAppropriateEndTag())
616                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
617            } else if (cc == '/') {
618                if (isAppropriateEndTag())
619                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
620            } else if (cc == '>') {
621                if (isAppropriateEndTag())
622                    return flushEmitAndResumeIn(source, DataState);
623            }
624            bufferCharacter('<');
625            bufferCharacter('/');
626            m_token->appendToCharacter(m_temporaryBuffer);
627            m_bufferedEndTagName.clear();
628            RECONSUME_IN(ScriptDataState);
629        }
630    }
631    END_STATE()
632
633    BEGIN_STATE(ScriptDataEscapeStartState) {
634        if (cc == '-') {
635            bufferCharacter(cc);
636            ADVANCE_TO(ScriptDataEscapeStartDashState);
637        } else
638            RECONSUME_IN(ScriptDataState);
639    }
640    END_STATE()
641
642    BEGIN_STATE(ScriptDataEscapeStartDashState) {
643        if (cc == '-') {
644            bufferCharacter(cc);
645            ADVANCE_TO(ScriptDataEscapedDashDashState);
646        } else
647            RECONSUME_IN(ScriptDataState);
648    }
649    END_STATE()
650
651    BEGIN_STATE(ScriptDataEscapedState) {
652        if (cc == '-') {
653            bufferCharacter(cc);
654            ADVANCE_TO(ScriptDataEscapedDashState);
655        } else if (cc == '<')
656            ADVANCE_TO(ScriptDataEscapedLessThanSignState);
657        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
658            parseError();
659            RECONSUME_IN(DataState);
660        } else {
661            bufferCharacter(cc);
662            ADVANCE_TO(ScriptDataEscapedState);
663        }
664    }
665    END_STATE()
666
667    BEGIN_STATE(ScriptDataEscapedDashState) {
668        if (cc == '-') {
669            bufferCharacter(cc);
670            ADVANCE_TO(ScriptDataEscapedDashDashState);
671        } else if (cc == '<')
672            ADVANCE_TO(ScriptDataEscapedLessThanSignState);
673        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
674            parseError();
675            RECONSUME_IN(DataState);
676        } else {
677            bufferCharacter(cc);
678            ADVANCE_TO(ScriptDataEscapedState);
679        }
680    }
681    END_STATE()
682
683    BEGIN_STATE(ScriptDataEscapedDashDashState) {
684        if (cc == '-') {
685            bufferCharacter(cc);
686            ADVANCE_TO(ScriptDataEscapedDashDashState);
687        } else if (cc == '<')
688            ADVANCE_TO(ScriptDataEscapedLessThanSignState);
689        else if (cc == '>') {
690            bufferCharacter(cc);
691            ADVANCE_TO(ScriptDataState);
692        } if (cc == InputStreamPreprocessor::endOfFileMarker) {
693            parseError();
694            RECONSUME_IN(DataState);
695        } else {
696            bufferCharacter(cc);
697            ADVANCE_TO(ScriptDataEscapedState);
698        }
699    }
700    END_STATE()
701
702    BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
703        if (cc == '/') {
704            m_temporaryBuffer.clear();
705            ASSERT(m_bufferedEndTagName.isEmpty());
706            ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
707        } else if (isASCIIUpper(cc)) {
708            bufferCharacter('<');
709            bufferCharacter(cc);
710            m_temporaryBuffer.clear();
711            m_temporaryBuffer.append(toLowerCase(cc));
712            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
713        } else if (isASCIILower(cc)) {
714            bufferCharacter('<');
715            bufferCharacter(cc);
716            m_temporaryBuffer.clear();
717            m_temporaryBuffer.append(cc);
718            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
719        } else {
720            bufferCharacter('<');
721            RECONSUME_IN(ScriptDataEscapedState);
722        }
723    }
724    END_STATE()
725
726    BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
727        if (isASCIIUpper(cc)) {
728            m_temporaryBuffer.append(cc);
729            addToPossibleEndTag(toLowerCase(cc));
730            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
731        } else if (isASCIILower(cc)) {
732            m_temporaryBuffer.append(cc);
733            addToPossibleEndTag(cc);
734            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
735        } else {
736            bufferCharacter('<');
737            bufferCharacter('/');
738            RECONSUME_IN(ScriptDataEscapedState);
739        }
740    }
741    END_STATE()
742
743    BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
744        if (isASCIIUpper(cc)) {
745            m_temporaryBuffer.append(cc);
746            addToPossibleEndTag(toLowerCase(cc));
747            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
748        } else if (isASCIILower(cc)) {
749            m_temporaryBuffer.append(cc);
750            addToPossibleEndTag(cc);
751            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
752        } else {
753            if (isTokenizerWhitespace(cc)) {
754                if (isAppropriateEndTag())
755                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
756            } else if (cc == '/') {
757                if (isAppropriateEndTag())
758                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
759            } else if (cc == '>') {
760                if (isAppropriateEndTag())
761                    return flushEmitAndResumeIn(source, DataState);
762            }
763            bufferCharacter('<');
764            bufferCharacter('/');
765            m_token->appendToCharacter(m_temporaryBuffer);
766            m_bufferedEndTagName.clear();
767            RECONSUME_IN(ScriptDataEscapedState);
768        }
769    }
770    END_STATE()
771
772    BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
773        if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
774            bufferCharacter(cc);
775            if (temporaryBufferIs(scriptTag.localName()))
776                ADVANCE_TO(ScriptDataDoubleEscapedState);
777            else
778                ADVANCE_TO(ScriptDataEscapedState);
779        } else if (isASCIIUpper(cc)) {
780            bufferCharacter(cc);
781            m_temporaryBuffer.append(toLowerCase(cc));
782            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
783        } else if (isASCIILower(cc)) {
784            bufferCharacter(cc);
785            m_temporaryBuffer.append(cc);
786            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
787        } else
788            RECONSUME_IN(ScriptDataEscapedState);
789    }
790    END_STATE()
791
792    BEGIN_STATE(ScriptDataDoubleEscapedState) {
793        if (cc == '-') {
794            bufferCharacter(cc);
795            ADVANCE_TO(ScriptDataDoubleEscapedDashState);
796        } else if (cc == '<') {
797            bufferCharacter(cc);
798            ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
799        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
800            parseError();
801            RECONSUME_IN(DataState);
802        } else {
803            bufferCharacter(cc);
804            ADVANCE_TO(ScriptDataDoubleEscapedState);
805        }
806    }
807    END_STATE()
808
809    BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
810        if (cc == '-') {
811            bufferCharacter(cc);
812            ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
813        } else if (cc == '<') {
814            bufferCharacter(cc);
815            ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
816        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
817            parseError();
818            RECONSUME_IN(DataState);
819        } else {
820            bufferCharacter(cc);
821            ADVANCE_TO(ScriptDataDoubleEscapedState);
822        }
823    }
824    END_STATE()
825
826    BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
827        if (cc == '-') {
828            bufferCharacter(cc);
829            ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
830        } else if (cc == '<') {
831            bufferCharacter(cc);
832            ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
833        } else if (cc == '>') {
834            bufferCharacter(cc);
835            ADVANCE_TO(ScriptDataState);
836        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
837            parseError();
838            RECONSUME_IN(DataState);
839        } else {
840            bufferCharacter(cc);
841            ADVANCE_TO(ScriptDataDoubleEscapedState);
842        }
843    }
844    END_STATE()
845
846    BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
847        if (cc == '/') {
848            bufferCharacter(cc);
849            m_temporaryBuffer.clear();
850            ADVANCE_TO(ScriptDataDoubleEscapeEndState);
851        } else
852            RECONSUME_IN(ScriptDataDoubleEscapedState);
853    }
854    END_STATE()
855
856    BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
857        if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
858            bufferCharacter(cc);
859            if (temporaryBufferIs(scriptTag.localName()))
860                ADVANCE_TO(ScriptDataEscapedState);
861            else
862                ADVANCE_TO(ScriptDataDoubleEscapedState);
863        } else if (isASCIIUpper(cc)) {
864            bufferCharacter(cc);
865            m_temporaryBuffer.append(toLowerCase(cc));
866            ADVANCE_TO(ScriptDataDoubleEscapeEndState);
867        } else if (isASCIILower(cc)) {
868            bufferCharacter(cc);
869            m_temporaryBuffer.append(cc);
870            ADVANCE_TO(ScriptDataDoubleEscapeEndState);
871        } else
872            RECONSUME_IN(ScriptDataDoubleEscapedState);
873    }
874    END_STATE()
875
876    BEGIN_STATE(BeforeAttributeNameState) {
877        if (isTokenizerWhitespace(cc))
878            ADVANCE_TO(BeforeAttributeNameState);
879        else if (cc == '/')
880            ADVANCE_TO(SelfClosingStartTagState);
881        else if (cc == '>')
882            return emitAndResumeIn(source, DataState);
883        else if (m_usePreHTML5ParserQuirks && cc == '<')
884            return emitAndReconsumeIn(source, DataState);
885        else if (isASCIIUpper(cc)) {
886            m_token->addNewAttribute();
887            m_token->beginAttributeName(source.numberOfCharactersConsumed());
888            m_token->appendToAttributeName(toLowerCase(cc));
889            ADVANCE_TO(AttributeNameState);
890        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
891            parseError();
892            RECONSUME_IN(DataState);
893        } else {
894            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
895                parseError();
896            m_token->addNewAttribute();
897            m_token->beginAttributeName(source.numberOfCharactersConsumed());
898            m_token->appendToAttributeName(cc);
899            ADVANCE_TO(AttributeNameState);
900        }
901    }
902    END_STATE()
903
904    BEGIN_STATE(AttributeNameState) {
905        if (isTokenizerWhitespace(cc)) {
906            m_token->endAttributeName(source.numberOfCharactersConsumed());
907            ADVANCE_TO(AfterAttributeNameState);
908        } else if (cc == '/') {
909            m_token->endAttributeName(source.numberOfCharactersConsumed());
910            ADVANCE_TO(SelfClosingStartTagState);
911        } else if (cc == '=') {
912            m_token->endAttributeName(source.numberOfCharactersConsumed());
913            ADVANCE_TO(BeforeAttributeValueState);
914        } else if (cc == '>') {
915            m_token->endAttributeName(source.numberOfCharactersConsumed());
916            return emitAndResumeIn(source, DataState);
917        } else if (m_usePreHTML5ParserQuirks && cc == '<') {
918            m_token->endAttributeName(source.numberOfCharactersConsumed());
919            return emitAndReconsumeIn(source, DataState);
920        } else if (isASCIIUpper(cc)) {
921            m_token->appendToAttributeName(toLowerCase(cc));
922            ADVANCE_TO(AttributeNameState);
923        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
924            parseError();
925            m_token->endAttributeName(source.numberOfCharactersConsumed());
926            RECONSUME_IN(DataState);
927        } else {
928            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
929                parseError();
930            m_token->appendToAttributeName(cc);
931            ADVANCE_TO(AttributeNameState);
932        }
933    }
934    END_STATE()
935
936    BEGIN_STATE(AfterAttributeNameState) {
937        if (isTokenizerWhitespace(cc))
938            ADVANCE_TO(AfterAttributeNameState);
939        else if (cc == '/')
940            ADVANCE_TO(SelfClosingStartTagState);
941        else if (cc == '=')
942            ADVANCE_TO(BeforeAttributeValueState);
943        else if (cc == '>')
944            return emitAndResumeIn(source, DataState);
945        else if (m_usePreHTML5ParserQuirks && cc == '<')
946            return emitAndReconsumeIn(source, DataState);
947        else if (isASCIIUpper(cc)) {
948            m_token->addNewAttribute();
949            m_token->beginAttributeName(source.numberOfCharactersConsumed());
950            m_token->appendToAttributeName(toLowerCase(cc));
951            ADVANCE_TO(AttributeNameState);
952        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
953            parseError();
954            RECONSUME_IN(DataState);
955        } else {
956            if (cc == '"' || cc == '\'' || cc == '<')
957                parseError();
958            m_token->addNewAttribute();
959            m_token->beginAttributeName(source.numberOfCharactersConsumed());
960            m_token->appendToAttributeName(cc);
961            ADVANCE_TO(AttributeNameState);
962        }
963    }
964    END_STATE()
965
966    BEGIN_STATE(BeforeAttributeValueState) {
967        if (isTokenizerWhitespace(cc))
968            ADVANCE_TO(BeforeAttributeValueState);
969        else if (cc == '"') {
970            m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
971            ADVANCE_TO(AttributeValueDoubleQuotedState);
972        } else if (cc == '&') {
973            m_token->beginAttributeValue(source.numberOfCharactersConsumed());
974            RECONSUME_IN(AttributeValueUnquotedState);
975        } else if (cc == '\'') {
976            m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
977            ADVANCE_TO(AttributeValueSingleQuotedState);
978        } else if (cc == '>') {
979            parseError();
980            return emitAndResumeIn(source, DataState);
981        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
982            parseError();
983            RECONSUME_IN(DataState);
984        } else {
985            if (cc == '<' || cc == '=' || cc == '`')
986                parseError();
987            m_token->beginAttributeValue(source.numberOfCharactersConsumed());
988            m_token->appendToAttributeValue(cc);
989            ADVANCE_TO(AttributeValueUnquotedState);
990        }
991    }
992    END_STATE()
993
994    BEGIN_STATE(AttributeValueDoubleQuotedState) {
995        if (cc == '"') {
996            m_token->endAttributeValue(source.numberOfCharactersConsumed());
997            ADVANCE_TO(AfterAttributeValueQuotedState);
998        } else if (cc == '&') {
999            m_additionalAllowedCharacter = '"';
1000            ADVANCE_TO(CharacterReferenceInAttributeValueState);
1001        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1002            parseError();
1003            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1004            RECONSUME_IN(DataState);
1005        } else {
1006            m_token->appendToAttributeValue(cc);
1007            ADVANCE_TO(AttributeValueDoubleQuotedState);
1008        }
1009    }
1010    END_STATE()
1011
1012    BEGIN_STATE(AttributeValueSingleQuotedState) {
1013        if (cc == '\'') {
1014            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1015            ADVANCE_TO(AfterAttributeValueQuotedState);
1016        } else if (cc == '&') {
1017            m_additionalAllowedCharacter = '\'';
1018            ADVANCE_TO(CharacterReferenceInAttributeValueState);
1019        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1020            parseError();
1021            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1022            RECONSUME_IN(DataState);
1023        } else {
1024            m_token->appendToAttributeValue(cc);
1025            ADVANCE_TO(AttributeValueSingleQuotedState);
1026        }
1027    }
1028    END_STATE()
1029
1030    BEGIN_STATE(AttributeValueUnquotedState) {
1031        if (isTokenizerWhitespace(cc)) {
1032            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1033            ADVANCE_TO(BeforeAttributeNameState);
1034        } else if (cc == '&') {
1035            m_additionalAllowedCharacter = '>';
1036            ADVANCE_TO(CharacterReferenceInAttributeValueState);
1037        } else if (cc == '>') {
1038            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1039            return emitAndResumeIn(source, DataState);
1040        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1041            parseError();
1042            m_token->endAttributeValue(source.numberOfCharactersConsumed());
1043            RECONSUME_IN(DataState);
1044        } else {
1045            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
1046                parseError();
1047            m_token->appendToAttributeValue(cc);
1048            ADVANCE_TO(AttributeValueUnquotedState);
1049        }
1050    }
1051    END_STATE()
1052
1053    BEGIN_STATE(CharacterReferenceInAttributeValueState) {
1054        bool notEnoughCharacters = false;
1055        Vector<UChar, 16> decodedEntity;
1056        bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1057        if (notEnoughCharacters)
1058            return haveBufferedCharacterToken();
1059        if (!success) {
1060            ASSERT(decodedEntity.isEmpty());
1061            m_token->appendToAttributeValue('&');
1062        } else {
1063            Vector<UChar>::const_iterator iter = decodedEntity.begin();
1064            for (; iter != decodedEntity.end(); ++iter)
1065                m_token->appendToAttributeValue(*iter);
1066        }
        // We're supposed to switch back to the attribute value state that
        // we were in when we were switched into this state. Rather than
        // keeping track of this explicitly, we observe that the previous
        // state can be determined by m_additionalAllowedCharacter.
1071        if (m_additionalAllowedCharacter == '"')
1072            SWITCH_TO(AttributeValueDoubleQuotedState);
1073        else if (m_additionalAllowedCharacter == '\'')
1074            SWITCH_TO(AttributeValueSingleQuotedState);
1075        else if (m_additionalAllowedCharacter == '>')
1076            SWITCH_TO(AttributeValueUnquotedState);
1077        else
1078            ASSERT_NOT_REACHED();
1079    }
1080    END_STATE()
1081
1082    BEGIN_STATE(AfterAttributeValueQuotedState) {
1083        if (isTokenizerWhitespace(cc))
1084            ADVANCE_TO(BeforeAttributeNameState);
1085        else if (cc == '/')
1086            ADVANCE_TO(SelfClosingStartTagState);
1087        else if (cc == '>')
1088            return emitAndResumeIn(source, DataState);
1089        else if (m_usePreHTML5ParserQuirks && cc == '<')
1090            return emitAndReconsumeIn(source, DataState);
1091        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1092            parseError();
1093            RECONSUME_IN(DataState);
1094        } else {
1095            parseError();
1096            RECONSUME_IN(BeforeAttributeNameState);
1097        }
1098    }
1099    END_STATE()
1100
1101    BEGIN_STATE(SelfClosingStartTagState) {
1102        if (cc == '>') {
1103            m_token->setSelfClosing();
1104            return emitAndResumeIn(source, DataState);
1105        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1106            parseError();
1107            RECONSUME_IN(DataState);
1108        } else {
1109            parseError();
1110            RECONSUME_IN(BeforeAttributeNameState);
1111        }
1112    }
1113    END_STATE()
1114
1115    BEGIN_STATE(BogusCommentState) {
1116        m_token->beginComment();
1117        RECONSUME_IN(ContinueBogusCommentState);
1118    }
1119    END_STATE()
1120
1121    BEGIN_STATE(ContinueBogusCommentState) {
1122        if (cc == '>')
1123            return emitAndResumeIn(source, DataState);
1124        else if (cc == InputStreamPreprocessor::endOfFileMarker)
1125            return emitAndReconsumeIn(source, DataState);
1126        else {
1127            m_token->appendToComment(cc);
1128            ADVANCE_TO(ContinueBogusCommentState);
1129        }
1130    }
1131    END_STATE()
1132
1133    BEGIN_STATE(MarkupDeclarationOpenState) {
1134        DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1135        DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1136        DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA["));
1137        if (cc == '-') {
1138            SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1139            if (result == SegmentedString::DidMatch) {
1140                source.advanceAndASSERT('-');
1141                source.advanceAndASSERT('-');
1142                m_token->beginComment();
1143                SWITCH_TO(CommentStartState);
1144            } else if (result == SegmentedString::NotEnoughCharacters)
1145                return haveBufferedCharacterToken();
1146        } else if (cc == 'D' || cc == 'd') {
1147            SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1148            if (result == SegmentedString::DidMatch) {
1149                advanceStringAndASSERTIgnoringCase(source, "doctype");
1150                SWITCH_TO(DOCTYPEState);
1151            } else if (result == SegmentedString::NotEnoughCharacters)
1152                return haveBufferedCharacterToken();
1153        } else if (cc == '[' && shouldAllowCDATA()) {
1154            SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1155            if (result == SegmentedString::DidMatch) {
1156                advanceStringAndASSERT(source, "[CDATA[");
1157                SWITCH_TO(CDATASectionState);
1158            } else if (result == SegmentedString::NotEnoughCharacters)
1159                return haveBufferedCharacterToken();
1160        }
1161        parseError();
1162        RECONSUME_IN(BogusCommentState);
1163    }
1164    END_STATE()
1165
1166    BEGIN_STATE(CommentStartState) {
1167        if (cc == '-')
1168            ADVANCE_TO(CommentStartDashState);
1169        else if (cc == '>') {
1170            parseError();
1171            return emitAndResumeIn(source, DataState);
1172        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1173            parseError();
1174            return emitAndReconsumeIn(source, DataState);
1175        } else {
1176            m_token->appendToComment(cc);
1177            ADVANCE_TO(CommentState);
1178        }
1179    }
1180    END_STATE()
1181
1182    BEGIN_STATE(CommentStartDashState) {
1183        if (cc == '-')
1184            ADVANCE_TO(CommentEndState);
1185        else if (cc == '>') {
1186            parseError();
1187            return emitAndResumeIn(source, DataState);
1188        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1189            parseError();
1190            return emitAndReconsumeIn(source, DataState);
1191        } else {
1192            m_token->appendToComment('-');
1193            m_token->appendToComment(cc);
1194            ADVANCE_TO(CommentState);
1195        }
1196    }
1197    END_STATE()
1198
1199    BEGIN_STATE(CommentState) {
1200        if (cc == '-')
1201            ADVANCE_TO(CommentEndDashState);
1202        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1203            parseError();
1204            return emitAndReconsumeIn(source, DataState);
1205        } else {
1206            m_token->appendToComment(cc);
1207            ADVANCE_TO(CommentState);
1208        }
1209    }
1210    END_STATE()
1211
1212    BEGIN_STATE(CommentEndDashState) {
1213        if (cc == '-')
1214            ADVANCE_TO(CommentEndState);
1215        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1216            parseError();
1217            return emitAndReconsumeIn(source, DataState);
1218        } else {
1219            m_token->appendToComment('-');
1220            m_token->appendToComment(cc);
1221            ADVANCE_TO(CommentState);
1222        }
1223    }
1224    END_STATE()
1225
1226    BEGIN_STATE(CommentEndState) {
1227        if (cc == '>')
1228            return emitAndResumeIn(source, DataState);
1229        else if (cc == '!') {
1230            parseError();
1231            ADVANCE_TO(CommentEndBangState);
1232        } else if (cc == '-') {
1233            parseError();
1234            m_token->appendToComment('-');
1235            ADVANCE_TO(CommentEndState);
1236        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1237            parseError();
1238            return emitAndReconsumeIn(source, DataState);
1239        } else {
1240            parseError();
1241            m_token->appendToComment('-');
1242            m_token->appendToComment('-');
1243            m_token->appendToComment(cc);
1244            ADVANCE_TO(CommentState);
1245        }
1246    }
1247    END_STATE()
1248
1249    BEGIN_STATE(CommentEndBangState) {
1250        if (cc == '-') {
1251            m_token->appendToComment('-');
1252            m_token->appendToComment('-');
1253            m_token->appendToComment('!');
1254            ADVANCE_TO(CommentEndDashState);
1255        } else if (cc == '>')
1256            return emitAndResumeIn(source, DataState);
1257        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1258            parseError();
1259            return emitAndReconsumeIn(source, DataState);
1260        } else {
1261            m_token->appendToComment('-');
1262            m_token->appendToComment('-');
1263            m_token->appendToComment('!');
1264            m_token->appendToComment(cc);
1265            ADVANCE_TO(CommentState);
1266        }
1267    }
1268    END_STATE()
1269
1270    BEGIN_STATE(DOCTYPEState) {
1271        if (isTokenizerWhitespace(cc))
1272            ADVANCE_TO(BeforeDOCTYPENameState);
1273        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1274            parseError();
1275            m_token->beginDOCTYPE();
1276            m_token->setForceQuirks();
1277            return emitAndReconsumeIn(source, DataState);
1278        } else {
1279            parseError();
1280            RECONSUME_IN(BeforeDOCTYPENameState);
1281        }
1282    }
1283    END_STATE()
1284
1285    BEGIN_STATE(BeforeDOCTYPENameState) {
1286        if (isTokenizerWhitespace(cc))
1287            ADVANCE_TO(BeforeDOCTYPENameState);
1288        else if (isASCIIUpper(cc)) {
1289            m_token->beginDOCTYPE(toLowerCase(cc));
1290            ADVANCE_TO(DOCTYPENameState);
1291        } else if (cc == '>') {
1292            parseError();
1293            m_token->beginDOCTYPE();
1294            m_token->setForceQuirks();
1295            return emitAndResumeIn(source, DataState);
1296        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1297            parseError();
1298            m_token->beginDOCTYPE();
1299            m_token->setForceQuirks();
1300            return emitAndReconsumeIn(source, DataState);
1301        } else {
1302            m_token->beginDOCTYPE(cc);
1303            ADVANCE_TO(DOCTYPENameState);
1304        }
1305    }
1306    END_STATE()
1307
1308    BEGIN_STATE(DOCTYPENameState) {
1309        if (isTokenizerWhitespace(cc))
1310            ADVANCE_TO(AfterDOCTYPENameState);
1311        else if (cc == '>')
1312            return emitAndResumeIn(source, DataState);
1313        else if (isASCIIUpper(cc)) {
1314            m_token->appendToName(toLowerCase(cc));
1315            ADVANCE_TO(DOCTYPENameState);
1316        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1317            parseError();
1318            m_token->setForceQuirks();
1319            return emitAndReconsumeIn(source, DataState);
1320        } else {
1321            m_token->appendToName(cc);
1322            ADVANCE_TO(DOCTYPENameState);
1323        }
1324    }
1325    END_STATE()
1326
1327    BEGIN_STATE(AfterDOCTYPENameState) {
1328        if (isTokenizerWhitespace(cc))
1329            ADVANCE_TO(AfterDOCTYPENameState);
1330        if (cc == '>')
1331            return emitAndResumeIn(source, DataState);
1332        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1333            parseError();
1334            m_token->setForceQuirks();
1335            return emitAndReconsumeIn(source, DataState);
1336        } else {
1337            DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1338            DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1339            if (cc == 'P' || cc == 'p') {
1340                SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1341                if (result == SegmentedString::DidMatch) {
1342                    advanceStringAndASSERTIgnoringCase(source, "public");
1343                    SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1344                } else if (result == SegmentedString::NotEnoughCharacters)
1345                    return haveBufferedCharacterToken();
1346            } else if (cc == 'S' || cc == 's') {
1347                SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1348                if (result == SegmentedString::DidMatch) {
1349                    advanceStringAndASSERTIgnoringCase(source, "system");
1350                    SWITCH_TO(AfterDOCTYPESystemKeywordState);
1351                } else if (result == SegmentedString::NotEnoughCharacters)
1352                    return haveBufferedCharacterToken();
1353            }
1354            parseError();
1355            m_token->setForceQuirks();
1356            ADVANCE_TO(BogusDOCTYPEState);
1357        }
1358    }
1359    END_STATE()
1360
1361    BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1362        if (isTokenizerWhitespace(cc))
1363            ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1364        else if (cc == '"') {
1365            parseError();
1366            m_token->setPublicIdentifierToEmptyString();
1367            ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1368        } else if (cc == '\'') {
1369            parseError();
1370            m_token->setPublicIdentifierToEmptyString();
1371            ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1372        } else if (cc == '>') {
1373            parseError();
1374            m_token->setForceQuirks();
1375            return emitAndResumeIn(source, DataState);
1376        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1377            parseError();
1378            m_token->setForceQuirks();
1379            return emitAndReconsumeIn(source, DataState);
1380        } else {
1381            parseError();
1382            m_token->setForceQuirks();
1383            ADVANCE_TO(BogusDOCTYPEState);
1384        }
1385    }
1386    END_STATE()
1387
1388    BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1389        if (isTokenizerWhitespace(cc))
1390            ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1391        else if (cc == '"') {
1392            m_token->setPublicIdentifierToEmptyString();
1393            ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1394        } else if (cc == '\'') {
1395            m_token->setPublicIdentifierToEmptyString();
1396            ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1397        } else if (cc == '>') {
1398            parseError();
1399            m_token->setForceQuirks();
1400            return emitAndResumeIn(source, DataState);
1401        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1402            parseError();
1403            m_token->setForceQuirks();
1404            return emitAndReconsumeIn(source, DataState);
1405        } else {
1406            parseError();
1407            m_token->setForceQuirks();
1408            ADVANCE_TO(BogusDOCTYPEState);
1409        }
1410    }
1411    END_STATE()
1412
1413    BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1414        if (cc == '"')
1415            ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1416        else if (cc == '>') {
1417            parseError();
1418            m_token->setForceQuirks();
1419            return emitAndResumeIn(source, DataState);
1420        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1421            parseError();
1422            m_token->setForceQuirks();
1423            return emitAndReconsumeIn(source, DataState);
1424        } else {
1425            m_token->appendToPublicIdentifier(cc);
1426            ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1427        }
1428    }
1429    END_STATE()
1430
1431    BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1432        if (cc == '\'')
1433            ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1434        else if (cc == '>') {
1435            parseError();
1436            m_token->setForceQuirks();
1437            return emitAndResumeIn(source, DataState);
1438        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1439            parseError();
1440            m_token->setForceQuirks();
1441            return emitAndReconsumeIn(source, DataState);
1442        } else {
1443            m_token->appendToPublicIdentifier(cc);
1444            ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1445        }
1446    }
1447    END_STATE()
1448
1449    BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1450        if (isTokenizerWhitespace(cc))
1451            ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1452        else if (cc == '>')
1453            return emitAndResumeIn(source, DataState);
1454        else if (cc == '"') {
1455            parseError();
1456            m_token->setSystemIdentifierToEmptyString();
1457            ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1458        } else if (cc == '\'') {
1459            parseError();
1460            m_token->setSystemIdentifierToEmptyString();
1461            ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1462        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1463            parseError();
1464            m_token->setForceQuirks();
1465            return emitAndReconsumeIn(source, DataState);
1466        } else {
1467            parseError();
1468            m_token->setForceQuirks();
1469            ADVANCE_TO(BogusDOCTYPEState);
1470        }
1471    }
1472    END_STATE()
1473
1474    BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1475        if (isTokenizerWhitespace(cc))
1476            ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1477        else if (cc == '>')
1478            return emitAndResumeIn(source, DataState);
1479        else if (cc == '"') {
1480            m_token->setSystemIdentifierToEmptyString();
1481            ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1482        } else if (cc == '\'') {
1483            m_token->setSystemIdentifierToEmptyString();
1484            ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1485        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1486            parseError();
1487            m_token->setForceQuirks();
1488            return emitAndReconsumeIn(source, DataState);
1489        } else {
1490            parseError();
1491            m_token->setForceQuirks();
1492            ADVANCE_TO(BogusDOCTYPEState);
1493        }
1494    }
1495    END_STATE()
1496
1497    BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1498        if (isTokenizerWhitespace(cc))
1499            ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1500        else if (cc == '"') {
1501            parseError();
1502            m_token->setSystemIdentifierToEmptyString();
1503            ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1504        } else if (cc == '\'') {
1505            parseError();
1506            m_token->setSystemIdentifierToEmptyString();
1507            ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1508        } else if (cc == '>') {
1509            parseError();
1510            m_token->setForceQuirks();
1511            return emitAndResumeIn(source, DataState);
1512        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1513            parseError();
1514            m_token->setForceQuirks();
1515            return emitAndReconsumeIn(source, DataState);
1516        } else {
1517            parseError();
1518            m_token->setForceQuirks();
1519            ADVANCE_TO(BogusDOCTYPEState);
1520        }
1521    }
1522    END_STATE()
1523
1524    BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1525        if (isTokenizerWhitespace(cc))
1526            ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1527        if (cc == '"') {
1528            m_token->setSystemIdentifierToEmptyString();
1529            ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1530        } else if (cc == '\'') {
1531            m_token->setSystemIdentifierToEmptyString();
1532            ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1533        } else if (cc == '>') {
1534            parseError();
1535            m_token->setForceQuirks();
1536            return emitAndResumeIn(source, DataState);
1537        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1538            parseError();
1539            m_token->setForceQuirks();
1540            return emitAndReconsumeIn(source, DataState);
1541        } else {
1542            parseError();
1543            m_token->setForceQuirks();
1544            ADVANCE_TO(BogusDOCTYPEState);
1545        }
1546    }
1547    END_STATE()
1548
1549    BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1550        if (cc == '"')
1551            ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1552        else if (cc == '>') {
1553            parseError();
1554            m_token->setForceQuirks();
1555            return emitAndResumeIn(source, DataState);
1556        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1557            parseError();
1558            m_token->setForceQuirks();
1559            return emitAndReconsumeIn(source, DataState);
1560        } else {
1561            m_token->appendToSystemIdentifier(cc);
1562            ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1563        }
1564    }
1565    END_STATE()
1566
1567    BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1568        if (cc == '\'')
1569            ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1570        else if (cc == '>') {
1571            parseError();
1572            m_token->setForceQuirks();
1573            return emitAndResumeIn(source, DataState);
1574        } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1575            parseError();
1576            m_token->setForceQuirks();
1577            return emitAndReconsumeIn(source, DataState);
1578        } else {
1579            m_token->appendToSystemIdentifier(cc);
1580            ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1581        }
1582    }
1583    END_STATE()
1584
1585    BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1586        if (isTokenizerWhitespace(cc))
1587            ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1588        else if (cc == '>')
1589            return emitAndResumeIn(source, DataState);
1590        else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1591            parseError();
1592            m_token->setForceQuirks();
1593            return emitAndReconsumeIn(source, DataState);
1594        } else {
1595            parseError();
1596            ADVANCE_TO(BogusDOCTYPEState);
1597        }
1598    }
1599    END_STATE()
1600
1601    BEGIN_STATE(BogusDOCTYPEState) {
1602        if (cc == '>')
1603            return emitAndResumeIn(source, DataState);
1604        else if (cc == InputStreamPreprocessor::endOfFileMarker)
1605            return emitAndReconsumeIn(source, DataState);
1606        ADVANCE_TO(BogusDOCTYPEState);
1607    }
1608    END_STATE()
1609
1610    BEGIN_STATE(CDATASectionState) {
1611        if (cc == ']')
1612            ADVANCE_TO(CDATASectionRightSquareBracketState);
1613        else if (cc == InputStreamPreprocessor::endOfFileMarker)
1614            RECONSUME_IN(DataState);
1615        else {
1616            bufferCharacter(cc);
1617            ADVANCE_TO(CDATASectionState);
1618        }
1619    }
1620    END_STATE()
1621
1622    BEGIN_STATE(CDATASectionRightSquareBracketState) {
1623        if (cc == ']')
1624            ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1625        else {
1626            bufferCharacter(']');
1627            RECONSUME_IN(CDATASectionState);
1628        }
1629    }
1630
1631    BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1632        if (cc == '>')
1633            ADVANCE_TO(DataState);
1634        else {
1635            bufferCharacter(']');
1636            bufferCharacter(']');
1637            RECONSUME_IN(CDATASectionState);
1638        }
1639    }
1640    END_STATE()
1641
1642    }
1643
1644    ASSERT_NOT_REACHED();
1645    return false;
1646}
1647
1648void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
1649{
1650    if (tagName == textareaTag || tagName == titleTag)
1651        setState(RCDATAState);
1652    else if (tagName == plaintextTag)
1653        setState(PLAINTEXTState);
1654    else if (tagName == scriptTag)
1655        setState(ScriptDataState);
1656    else if (tagName == styleTag
1657        || tagName == iframeTag
1658        || tagName == xmpTag
1659        || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
1660        || tagName == noframesTag
1661        || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
1662        setState(RAWTEXTState);
1663}
1664
1665inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1666{
1667    return vectorEqualsString(m_temporaryBuffer, expectedString);
1668}
1669
1670inline void HTMLTokenizer::addToPossibleEndTag(UChar cc)
1671{
1672    ASSERT(isEndTagBufferingState(m_state));
1673    m_bufferedEndTagName.append(cc);
1674}
1675
1676inline bool HTMLTokenizer::isAppropriateEndTag()
1677{
1678    return m_bufferedEndTagName == m_appropriateEndTagName;
1679}
1680
1681inline void HTMLTokenizer::bufferCharacter(UChar character)
1682{
1683    ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
1684    m_token->ensureIsCharacterToken();
1685    m_token->appendToCharacter(character);
1686}
1687
1688inline void HTMLTokenizer::parseError()
1689{
1690    notImplemented();
1691}
1692
1693inline bool HTMLTokenizer::haveBufferedCharacterToken()
1694{
1695    return m_token->type() == HTMLToken::Character;
1696}
1697
1698}
1699