HTMLDocumentParser.cpp revision 2bde8e466a4451c7319e3a072d118917957d6554
1/*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "HTMLDocumentParser.h"
28
29#include "ContentSecurityPolicy.h"
30#include "DocumentFragment.h"
31#include "Element.h"
32#include "Frame.h"
33#include "HTMLNames.h"
34#include "HTMLParserScheduler.h"
35#include "HTMLTokenizer.h"
36#include "HTMLPreloadScanner.h"
37#include "HTMLScriptRunner.h"
38#include "HTMLTreeBuilder.h"
39#include "HTMLDocument.h"
40#include "InspectorInstrumentation.h"
41#include "NestingLevelIncrementer.h"
42#include "Settings.h"
43#include <wtf/CurrentTime.h>
44
45#ifdef ANDROID_INSTRUMENT
46#include "TimeCounter.h"
47#endif
48
49namespace WebCore {
50
51using namespace HTMLNames;
52
53namespace {
54
55// This is a direct transcription of step 4 from:
56// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case
57HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors)
58{
59    if (!contextElement)
60        return HTMLTokenizer::DataState;
61
62    const QualifiedName& contextTag = contextElement->tagQName();
63
64    if (contextTag.matches(titleTag) || contextTag.matches(textareaTag))
65        return HTMLTokenizer::RCDATAState;
66    if (contextTag.matches(styleTag)
67        || contextTag.matches(xmpTag)
68        || contextTag.matches(iframeTag)
69        || (contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame()))
70        || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame()))
71        || contextTag.matches(noframesTag))
72        return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState;
73    if (contextTag.matches(scriptTag))
74        return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState;
75    if (contextTag.matches(plaintextTag))
76        return HTMLTokenizer::PLAINTEXTState;
77    return HTMLTokenizer::DataState;
78}
79
80} // namespace
81
// Constructs a parser for a whole HTMLDocument. Besides the tokenizer and tree
// builder, this constructor also creates a script runner and a parser
// scheduler. reportErrors is forwarded to the tree builder.
HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors)
    : ScriptableDocumentParser(document)
    , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document)))
    , m_scriptRunner(HTMLScriptRunner::create(document, this))
    , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document)))
    , m_parserScheduler(HTMLParserScheduler::create(this))
    , m_xssFilter(this)
    , m_endWasDelayed(false)
    , m_pumpSessionNestingLevel(0)
{
}
93
// Constructs a parser for a DocumentFragment. No script runner or parser
// scheduler is created here; the tokenizer's initial state is derived from
// contextElement.
// FIXME: Member variables should be grouped into self-initializing structs to
// minimize code duplication between these constructors.
HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission)
    : ScriptableDocumentParser(fragment->document())
    , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document())))
    , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document())))
    , m_xssFilter(this)
    , m_endWasDelayed(false)
    , m_pumpSessionNestingLevel(0)
{
    bool reportErrors = false; // For now document fragment parsing never reports errors.
    // Start the tokenizer in the state dictated by the context element.
    m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors));
}
107
// Only sanity-checks state: by destruction time the parser scheduler and the
// preload scanner must already be gone (detach() clears both) and the pump
// session nesting level must be back to zero.
HTMLDocumentParser::~HTMLDocumentParser()
{
    ASSERT(!m_parserScheduler);
    ASSERT(!m_pumpSessionNestingLevel);
    ASSERT(!m_preloadScanner);
}
114
// Detaches this parser: runs DocumentParser::detach(), detaches the script
// runner (when there is one) and the tree builder, then releases the preload
// scanner and the parser scheduler.
void HTMLDocumentParser::detach()
{
    DocumentParser::detach();
    // There is no script runner when parsing a DocumentFragment.
    if (m_scriptRunner)
        m_scriptRunner->detach();
    m_treeBuilder->detach();
    // FIXME: It seems wrong that we would have a preload scanner here.
    // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do.
    m_preloadScanner.clear();
    m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
}
126
// Stops parsing via DocumentParser::stopParsing() and releases the parser
// scheduler so that no resume can fire afterwards.
void HTMLDocumentParser::stopParsing()
{
    DocumentParser::stopParsing();
    m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
}
132
133// This kicks off "Once the user agent stops parsing" as described by:
134// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end
135void HTMLDocumentParser::prepareToStopParsing()
136{
137    ASSERT(!hasInsertionPoint());
138
139    // pumpTokenizer can cause this parser to be detached from the Document,
140    // but we need to ensure it isn't deleted yet.
141    RefPtr<HTMLDocumentParser> protect(this);
142
143    // NOTE: This pump should only ever emit buffered character tokens,
144    // so ForceSynchronous vs. AllowYield should be meaningless.
145    pumpTokenizerIfPossible(ForceSynchronous);
146
147    if (isStopped())
148        return;
149
150    DocumentParser::prepareToStopParsing();
151
152    // We will not have a scriptRunner when parsing a DocumentFragment.
153    if (m_scriptRunner)
154        document()->setReadyState(Document::Interactive);
155
156    attemptToRunDeferredScriptsAndEnd();
157}
158
159bool HTMLDocumentParser::isParsingFragment() const
160{
161    return m_treeBuilder->isParsingFragment();
162}
163
164bool HTMLDocumentParser::processingData() const
165{
166    return isScheduledForResume() || inPumpSession();
167}
168
169void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode)
170{
171    if (isStopped() || m_treeBuilder->isPaused())
172        return;
173
174    // Once a resume is scheduled, HTMLParserScheduler controls when we next pump.
175    if (isScheduledForResume()) {
176        ASSERT(mode == AllowYield);
177        return;
178    }
179
180    pumpTokenizer(mode);
181}
182
183bool HTMLDocumentParser::isScheduledForResume() const
184{
185    return m_parserScheduler && m_parserScheduler->isScheduledForResume();
186}
187
188// Used by HTMLParserScheduler
189void HTMLDocumentParser::resumeParsingAfterYield()
190{
191    // pumpTokenizer can cause this parser to be detached from the Document,
192    // but we need to ensure it isn't deleted yet.
193    RefPtr<HTMLDocumentParser> protect(this);
194
195    // We should never be here unless we can pump immediately.  Call pumpTokenizer()
196    // directly so that ASSERTS will fire if we're wrong.
197    pumpTokenizer(AllowYield);
198    endIfDelayed();
199}
200
201bool HTMLDocumentParser::runScriptsForPausedTreeBuilder()
202{
203    ASSERT(m_treeBuilder->isPaused());
204
205    TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition();
206    RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition);
207    // We will not have a scriptRunner when parsing a DocumentFragment.
208    if (!m_scriptRunner)
209        return true;
210    return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition);
211}
212
213bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session)
214{
215    if (isStopped())
216        return false;
217
218    // The parser will pause itself when waiting on a script to load or run.
219    if (m_treeBuilder->isPaused()) {
220        if (mode == AllowYield)
221            m_parserScheduler->checkForYieldBeforeScript(session);
222
223        // If we don't run the script, we cannot allow the next token to be taken.
224        if (session.needsYield)
225            return false;
226
227        // If we're paused waiting for a script, we try to execute scripts before continuing.
228        bool shouldContinueParsing = runScriptsForPausedTreeBuilder();
229        m_treeBuilder->setPaused(!shouldContinueParsing);
230        if (!shouldContinueParsing || isStopped())
231            return false;
232    }
233
234    // FIXME: It's wrong for the HTMLDocumentParser to reach back to the
235    //        Frame, but this approach is how the old parser handled
236    //        stopping when the page assigns window.location.  What really
237    //        should happen is that assigning window.location causes the
238    //        parser to stop parsing cleanly.  The problem is we're not
239    //        perpared to do that at every point where we run JavaScript.
240    if (!isParsingFragment()
241        && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending())
242        return false;
243
244    if (mode == AllowYield)
245        m_parserScheduler->checkForYieldBeforeToken(session);
246
247    return true;
248}
249
// Core parse loop: repeatedly asks the tokenizer for the next token from the
// current input and hands it to the tree builder. The loop ends when
// canTakeNextToken() refuses, the session needs to yield, or
// m_tokenizer->nextToken() returns false. Afterwards, unless the parser was
// stopped, it schedules a resume if a yield was requested and runs the preload
// scanner while waiting for scripts.
// Callers must hold a reference to this parser (see the refCount ASSERTs).
void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
{
    ASSERT(!isStopped());
    ASSERT(!isScheduledForResume());
    // ASSERT that this object is both attached to the Document and protected.
    ASSERT(refCount() >= 2);

    PumpSession session(m_pumpSessionNestingLevel);

    // We tell the InspectorInstrumentation about every pump, even if we
    // end up pumping nothing.  It can filter out empty pumps itself.
    // FIXME: m_input.current().length() is only accurate if we
    // end up parsing the whole buffer in this pump.  We should pass how
    // much we parsed as part of didWriteHTML instead of willWriteHTML.
    InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber());

    while (canTakeNextToken(mode, session) && !session.needsYield) {
        // Source tracking and XSS filtering are skipped for fragments.
        if (!isParsingFragment())
            m_sourceTracker.start(m_input, m_token);

        // Stop pumping once the tokenizer reports it has no token for us.
        if (!m_tokenizer->nextToken(m_input.current(), m_token))
            break;

        if (!isParsingFragment()) {
            m_sourceTracker.end(m_input, m_token);

            // We do not XSS filter innerHTML, which means we (intentionally) fail
            // http/tests/security/xssAuditor/dom-write-innerHTML.html
            m_xssFilter.filterToken(m_token);
        }

        m_treeBuilder->constructTreeFromToken(m_token);
        // Reset the token so it can be reused for the next iteration.
        m_token.clear();
    }

    // Ensure we haven't been totally deref'ed after pumping. Any caller of this
    // function should be holding a RefPtr to this to ensure we weren't deleted.
    ASSERT(refCount() >= 1);

    // NOTE(review): this early return skips the didWriteHTML() call below, so
    // the willWriteHTML() cookie is not closed when the parser was stopped
    // during the pump -- confirm that is intended.
    if (isStopped())
        return;

    if (session.needsYield)
        m_parserScheduler->scheduleForResume();

    // While the tree builder is paused on a script, lazily create a preload
    // scanner over the current input and let it scan.
    if (isWaitingForScripts()) {
        ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState);
        if (!m_preloadScanner) {
            m_preloadScanner.set(new HTMLPreloadScanner(document()));
            m_preloadScanner->appendToEnd(m_input.current());
        }
        m_preloadScanner->scan();
    }

    InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber());
}
306
307bool HTMLDocumentParser::hasInsertionPoint()
308{
309    // FIXME: The wasCreatedByScript() branch here might not be fully correct.
310    //        Our model of the EOF character differs slightly from the one in
311    //        the spec because our treatment is uniform between network-sourced
312    //        and script-sourced input streams whereas the spec treats them
313    //        differently.
314    return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile());
315}
316
317void HTMLDocumentParser::insert(const SegmentedString& source)
318{
319    if (isStopped())
320        return;
321
322#ifdef ANDROID_INSTRUMENT
323    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
324#endif
325
326    // pumpTokenizer can cause this parser to be detached from the Document,
327    // but we need to ensure it isn't deleted yet.
328    RefPtr<HTMLDocumentParser> protect(this);
329
330    SegmentedString excludedLineNumberSource(source);
331    excludedLineNumberSource.setExcludeLineNumbers();
332    m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource);
333    pumpTokenizerIfPossible(ForceSynchronous);
334
335    endIfDelayed();
336}
337
338void HTMLDocumentParser::append(const SegmentedString& source)
339{
340    if (isStopped())
341        return;
342
343    // pumpTokenizer can cause this parser to be detached from the Document,
344    // but we need to ensure it isn't deleted yet.
345    RefPtr<HTMLDocumentParser> protect(this);
346
347    m_input.appendToEnd(source);
348    if (m_preloadScanner)
349        m_preloadScanner->appendToEnd(source);
350
351    if (inPumpSession()) {
352        // We've gotten data off the network in a nested write.
353        // We don't want to consume any more of the input stream now.  Do
354        // not worry.  We'll consume this data in a less-nested write().
355#ifdef ANDROID_INSTRUMENT
356        android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
357#endif
358        return;
359    }
360
361    pumpTokenizerIfPossible(AllowYield);
362
363    endIfDelayed();
364#ifdef ANDROID_INSTRUMENT
365    android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
366#endif
367}
368
// Finishes parsing by telling the tree builder it is finished. Must not be
// called on a detached parser or while a resume is scheduled.
void HTMLDocumentParser::end()
{
    ASSERT(!isDetached());
    ASSERT(!isScheduledForResume());

    // Informs the rest of WebCore that parsing is really finished (and deletes this).
    m_treeBuilder->finished();
}
377
378void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd()
379{
380    ASSERT(isStopping());
381    ASSERT(!hasInsertionPoint());
382    if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing())
383        return;
384    end();
385}
386
387void HTMLDocumentParser::attemptToEnd()
388{
389    // finish() indicates we will not receive any more data. If we are waiting on
390    // an external script to load, we can't finish parsing quite yet.
391
392    if (shouldDelayEnd()) {
393        m_endWasDelayed = true;
394        return;
395    }
396    prepareToStopParsing();
397}
398
399void HTMLDocumentParser::endIfDelayed()
400{
401    // If we've already been detached, don't bother ending.
402    if (isDetached())
403        return;
404
405    if (!m_endWasDelayed || shouldDelayEnd())
406        return;
407
408    m_endWasDelayed = false;
409    prepareToStopParsing();
410}
411
412void HTMLDocumentParser::finish()
413{
414    // FIXME: We should ASSERT(!m_parserStopped) here, since it does not
415    // makes sense to call any methods on DocumentParser once it's been stopped.
416    // However, FrameLoader::stop calls Document::finishParsing unconditionally
417    // which in turn calls m_parser->finish().
418
419    // We're not going to get any more data off the network, so we tell the
420    // input stream we've reached the end of file.  finish() can be called more
421    // than once, if the first time does not call end().
422    if (!m_input.haveSeenEndOfFile())
423        m_input.markEndOfFile();
424    attemptToEnd();
425}
426
427bool HTMLDocumentParser::finishWasCalled()
428{
429    return m_input.haveSeenEndOfFile();
430}
431
432// This function is virtual and just for the DocumentParser interface.
433bool HTMLDocumentParser::isExecutingScript() const
434{
435    return inScriptExecution();
436}
437
438// This function is non-virtual and used throughout the implementation.
439bool HTMLDocumentParser::inScriptExecution() const
440{
441    if (!m_scriptRunner)
442        return false;
443    return m_scriptRunner->isExecutingScript();
444}
445
446String HTMLDocumentParser::sourceForToken(const HTMLToken& token)
447{
448    return m_sourceTracker.sourceForToken(token);
449}
450
451int HTMLDocumentParser::lineNumber() const
452{
453    return m_tokenizer->lineNumber();
454}
455
456TextPosition0 HTMLDocumentParser::textPosition() const
457{
458    const SegmentedString& currentString = m_input.current();
459    WTF::ZeroBasedNumber line = currentString.currentLine();
460    WTF::ZeroBasedNumber column = currentString.currentColumn();
461    ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt());
462
463    return TextPosition0(line, column);
464}
465
466bool HTMLDocumentParser::isWaitingForScripts() const
467{
468    return m_treeBuilder->isPaused();
469}
470
// Continues parsing after a blocking script has run: drops the preload
// scanner, pumps the tokenizer (yielding allowed) and ends parsing if an end
// had been delayed. Requires that no script is executing and that the tree
// builder is no longer paused.
void HTMLDocumentParser::resumeParsingAfterScriptExecution()
{
    ASSERT(!inScriptExecution());
    ASSERT(!m_treeBuilder->isPaused());

    // pumpTokenizer() creates a new preload scanner if the tree builder is
    // paused again after this pump.
    m_preloadScanner.clear();
    pumpTokenizerIfPossible(AllowYield);
    endIfDelayed();
}
480
// Registers this parser as a client of cachedScript, which must not be loaded
// yet.
void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript)
{
    ASSERT(!cachedScript->isLoaded());
    // addClient would call notifyFinished if the load were complete.
    // Callers do not expect to be re-entered from this call, so they should
    // not pass an already-loaded CachedResource.
    cachedScript->addClient(this);
}
489
// Unregisters this parser as a client of cachedScript (the counterpart of
// watchForLoad()).
void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript)
{
    cachedScript->removeClient(this);
}
494
495void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource)
496{
497    // pumpTokenizer can cause this parser to be detached from the Document,
498    // but we need to ensure it isn't deleted yet.
499    RefPtr<HTMLDocumentParser> protect(this);
500
501    ASSERT(m_scriptRunner);
502    ASSERT(!inScriptExecution());
503    if (isStopping()) {
504        attemptToRunDeferredScriptsAndEnd();
505        return;
506    }
507
508    ASSERT(m_treeBuilder->isPaused());
509    // Note: We only ever wait on one script at a time, so we always know this
510    // is the one we were waiting on and can un-pause the tree builder.
511    m_treeBuilder->setPaused(false);
512    bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource);
513    m_treeBuilder->setPaused(!shouldContinueParsing);
514    if (shouldContinueParsing)
515        resumeParsingAfterScriptExecution();
516}
517
518void HTMLDocumentParser::executeScriptsWaitingForStylesheets()
519{
520    // Document only calls this when the Document owns the DocumentParser
521    // so this will not be called in the DocumentFragment case.
522    ASSERT(m_scriptRunner);
523    // Ignore calls unless we have a script blocking the parser waiting on a
524    // stylesheet load.  Otherwise we are currently parsing and this
525    // is a re-entrant call from encountering a </ style> tag.
526    if (!m_scriptRunner->hasScriptsWaitingForStylesheets())
527        return;
528
529    // pumpTokenizer can cause this parser to be detached from the Document,
530    // but we need to ensure it isn't deleted yet.
531    RefPtr<HTMLDocumentParser> protect(this);
532
533    ASSERT(!m_scriptRunner->isExecutingScript());
534    ASSERT(m_treeBuilder->isPaused());
535    // Note: We only ever wait on one script at a time, so we always know this
536    // is the one we were waiting on and can un-pause the tree builder.
537    m_treeBuilder->setPaused(false);
538    bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets();
539    m_treeBuilder->setPaused(!shouldContinueParsing);
540    if (shouldContinueParsing)
541        resumeParsingAfterScriptExecution();
542}
543
544ScriptController* HTMLDocumentParser::script() const
545{
546    return document()->frame() ? document()->frame()->script() : 0;
547}
548
549void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission)
550{
551    RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission);
552    parser->insert(source); // Use insert() so that the parser will not yield.
553    parser->finish();
554    ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151>
555    parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction.
556}
557
558bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document)
559{
560    ASSERT(document);
561    return document->settings() && document->settings()->usePreHTML5ParserQuirks();
562}
563
564void HTMLDocumentParser::suspendScheduledTasks()
565{
566    if (m_parserScheduler)
567        m_parserScheduler->suspend();
568}
569
570void HTMLDocumentParser::resumeScheduledTasks()
571{
572    if (m_parserScheduler)
573        m_parserScheduler->resume();
574}
575
576}
577