// HTMLDocumentParser.cpp revision 2bde8e466a4451c7319e3a072d118917957d6554
/*
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "HTMLDocumentParser.h"

#include "ContentSecurityPolicy.h"
#include "DocumentFragment.h"
#include "Element.h"
#include "Frame.h"
#include "HTMLNames.h"
#include "HTMLParserScheduler.h"
#include "HTMLTokenizer.h"
#include "HTMLPreloadScanner.h"
#include "HTMLScriptRunner.h"
#include "HTMLTreeBuilder.h"
#include "HTMLDocument.h"
#include "InspectorInstrumentation.h"
#include "NestingLevelIncrementer.h"
#include "Settings.h"
#include <wtf/CurrentTime.h>

#ifdef ANDROID_INSTRUMENT
#include "TimeCounter.h"
#endif

namespace WebCore {

using namespace HTMLNames;

namespace {

// This is a direct transcription of step 4 from:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case
//
// Chooses the tokenizer state in which fragment parsing starts, based on the
// tag of |contextElement|:
//   - no context element                      -> DataState
//   - title, textarea                         -> RCDATAState
//   - style, xmp, iframe, noframes,
//     noembed (only if plugins are enabled),
//     noscript (only if script is enabled)    -> RAWTEXTState
//   - script                                  -> ScriptDataState
//   - plaintext                               -> PLAINTEXTState
//   - anything else                           -> DataState
// When |reportErrors| is false, the RAWTEXTState and ScriptDataState cases
// return PLAINTEXTState instead.
HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors)
{
    if (!contextElement)
        return HTMLTokenizer::DataState;

    const QualifiedName& contextTag = contextElement->tagQName();

    if (contextTag.matches(titleTag) || contextTag.matches(textareaTag))
        return HTMLTokenizer::RCDATAState;
    if (contextTag.matches(styleTag)
        || contextTag.matches(xmpTag)
        || contextTag.matches(iframeTag)
        || (contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame()))
        || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame()))
        || contextTag.matches(noframesTag))
        return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState;
    if (contextTag.matches(scriptTag))
        return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState;
    if (contextTag.matches(plaintextTag))
        return HTMLTokenizer::PLAINTEXTState;
    return HTMLTokenizer::DataState;
}

} // namespace

// Constructs a parser for a whole HTMLDocument. This variant creates the
// tokenizer, the script runner, the tree builder and the parser scheduler.
// |reportErrors| is forwarded to the tree builder.
HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors)
    : ScriptableDocumentParser(document)
    , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document)))
    , m_scriptRunner(HTMLScriptRunner::create(document, this))
    , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document)))
    , m_parserScheduler(HTMLParserScheduler::create(this))
    , m_xssFilter(this)
    , m_endWasDelayed(false)
    , m_pumpSessionNestingLevel(0)
{
}

// FIXME: Member variables should be grouped into self-initializing structs to
// minimize code duplication between these constructors.
//
// Constructs a parser for a DocumentFragment. Unlike the document constructor,
// the initializer list does not create a script runner or a parser scheduler.
// The tokenizer's starting state is derived from |contextElement| via
// tokenizerStateForContextElement().
HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission)
    : ScriptableDocumentParser(fragment->document())
    , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document())))
    , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document())))
    , m_xssFilter(this)
    , m_endWasDelayed(false)
    , m_pumpSessionNestingLevel(0)
{
    bool reportErrors = false; // For now document fragment parsing never reports errors.
    m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors));
}

// By destruction time the scheduler and preload scanner must already have
// been cleared (detach() does both) and no pump session may be in progress.
HTMLDocumentParser::~HTMLDocumentParser()
{
    ASSERT(!m_parserScheduler);
    ASSERT(!m_pumpSessionNestingLevel);
    ASSERT(!m_preloadScanner);
}

// Detaches the base parser, the script runner (if there is one) and the tree
// builder, then drops the preload scanner and the parser scheduler.
void HTMLDocumentParser::detach()
{
    DocumentParser::detach();
    if (m_scriptRunner)
        m_scriptRunner->detach();
    m_treeBuilder->detach();
    // FIXME: It seems wrong that we would have a preload scanner here.
    // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do.
    m_preloadScanner.clear();
    m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
}

// Stops parsing via the base class and drops the scheduler so that no
// scheduled resume remains.
void HTMLDocumentParser::stopParsing()
{
    DocumentParser::stopParsing();
    m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
}

// This kicks off "Once the user agent stops parsing" as described by:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end
//
// Performs one final synchronous pump, and unless that pump left the parser
// stopped, moves the base parser into its stopping state, marks the document
// Interactive (only when a script runner exists) and tries to run deferred
// scripts and end.
void HTMLDocumentParser::prepareToStopParsing()
{
    ASSERT(!hasInsertionPoint());

    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    // NOTE: This pump should only ever emit buffered character tokens,
    // so ForceSynchronous vs. AllowYield should be meaningless.
    pumpTokenizerIfPossible(ForceSynchronous);

    if (isStopped())
        return;

    DocumentParser::prepareToStopParsing();

    // We will not have a scriptRunner when parsing a DocumentFragment.
    if (m_scriptRunner)
        document()->setReadyState(Document::Interactive);

    attemptToRunDeferredScriptsAndEnd();
}

// Forwards to the tree builder, which knows whether it was created for a
// fragment.
bool HTMLDocumentParser::isParsingFragment() const
{
    return m_treeBuilder->isParsingFragment();
}

// True while a resume is scheduled or a pump session is in progress.
bool HTMLDocumentParser::processingData() const
{
    return isScheduledForResume() || inPumpSession();
}

// Pumps the tokenizer in |mode| unless the parser is stopped, the tree builder
// is paused, or a resume has already been scheduled.
void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode)
{
    if (isStopped() || m_treeBuilder->isPaused())
        return;

    // Once a resume is scheduled, HTMLParserScheduler controls when we next pump.
    if (isScheduledForResume()) {
        ASSERT(mode == AllowYield);
        return;
    }

    pumpTokenizer(mode);
}

// False when there is no scheduler; otherwise asks the scheduler.
bool HTMLDocumentParser::isScheduledForResume() const
{
    return m_parserScheduler && m_parserScheduler->isScheduledForResume();
}

// Used by HTMLParserScheduler
// Continues a pump that previously yielded, then completes a delayed end if
// one is pending.
void HTMLDocumentParser::resumeParsingAfterYield()
{
    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    // We should never be here unless we can pump immediately.  Call pumpTokenizer()
    // directly so that ASSERTS will fire if we're wrong.
    pumpTokenizer(AllowYield);
    endIfDelayed();
}

// Takes the pending script element from the (paused) tree builder and hands it
// to the script runner. Returns the script runner's result, which the caller
// treats as "should parsing continue"; returns true without executing anything
// when there is no script runner.
bool HTMLDocumentParser::runScriptsForPausedTreeBuilder()
{
    ASSERT(m_treeBuilder->isPaused());

    TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition();
    RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition);
    // We will not have a scriptRunner when parsing a DocumentFragment.
    if (!m_scriptRunner)
        return true;
    return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition);
}

// Decides whether the pump loop may take another token. Returns false when:
//   - the parser is stopped;
//   - the tree builder is paused and either |session| needs to yield before
//     the script runs, or running the script says not to continue (or stopped
//     the parser);
//   - this is not a fragment parse and the frame has a location change pending.
// In AllowYield mode the scheduler is also asked to check for a yield before
// the script and before the next token; the result is left in |session| for
// the caller to inspect.
bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session)
{
    if (isStopped())
        return false;

    // The parser will pause itself when waiting on a script to load or run.
    if (m_treeBuilder->isPaused()) {
        if (mode == AllowYield)
            m_parserScheduler->checkForYieldBeforeScript(session);

        // If we don't run the script, we cannot allow the next token to be taken.
        if (session.needsYield)
            return false;

        // If we're paused waiting for a script, we try to execute scripts before continuing.
        bool shouldContinueParsing = runScriptsForPausedTreeBuilder();
        m_treeBuilder->setPaused(!shouldContinueParsing);
        if (!shouldContinueParsing || isStopped())
            return false;
    }

    // FIXME: It's wrong for the HTMLDocumentParser to reach back to the
    //        Frame, but this approach is how the old parser handled
    //        stopping when the page assigns window.location.  What really
    //        should happen is that assigning window.location causes the
    //        parser to stop parsing cleanly.  The problem is we're not
    //        prepared to do that at every point where we run JavaScript.
    if (!isParsingFragment()
        && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending())
        return false;

    if (mode == AllowYield)
        m_parserScheduler->checkForYieldBeforeToken(session);

    return true;
}

// Core pump loop: repeatedly pulls a token from the tokenizer and feeds it to
// the tree builder until canTakeNextToken() refuses, the session needs to
// yield, or the tokenizer cannot produce a token. For non-fragment parses each
// token is also source-tracked and passed through the XSS filter. After the
// loop, unless the parser was stopped, a resume is scheduled if a yield was
// requested, and the preload scanner is run if the parser is waiting on
// scripts.
void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
{
    ASSERT(!isStopped());
    ASSERT(!isScheduledForResume());
    // ASSERT that this object is both attached to the Document and protected.
    ASSERT(refCount() >= 2);

    PumpSession session(m_pumpSessionNestingLevel);

    // We tell the InspectorInstrumentation about every pump, even if we
    // end up pumping nothing.  It can filter out empty pumps itself.
    // FIXME: m_input.current().length() is only accurate if we
    // end up parsing the whole buffer in this pump.  We should pass how
    // much we parsed as part of didWriteHTML instead of willWriteHTML.
    InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber());

    while (canTakeNextToken(mode, session) && !session.needsYield) {
        if (!isParsingFragment())
            m_sourceTracker.start(m_input, m_token);

        if (!m_tokenizer->nextToken(m_input.current(), m_token))
            break;

        if (!isParsingFragment()) {
            m_sourceTracker.end(m_input, m_token);

            // We do not XSS filter innerHTML, which means we (intentionally) fail
            // http/tests/security/xssAuditor/dom-write-innerHTML.html
            m_xssFilter.filterToken(m_token);
        }

        m_treeBuilder->constructTreeFromToken(m_token);
        m_token.clear();
    }

    // Ensure we haven't been totally deref'ed after pumping. Any caller of this
    // function should be holding a RefPtr to this to ensure we weren't deleted.
    ASSERT(refCount() >= 1);

    if (isStopped())
        return;

    if (session.needsYield)
        m_parserScheduler->scheduleForResume();

    if (isWaitingForScripts()) {
        ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState);
        // Create the preload scanner on first use and seed it with the
        // input that has not been consumed yet.
        if (!m_preloadScanner) {
            m_preloadScanner.set(new HTMLPreloadScanner(document()));
            m_preloadScanner->appendToEnd(m_input.current());
        }
        m_preloadScanner->scan();
    }

    InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber());
}

// True if the input stream has an insertion point, or if the parser was
// created by script and end-of-file has not been seen yet.
bool HTMLDocumentParser::hasInsertionPoint()
{
    // FIXME: The wasCreatedByScript() branch here might not be fully correct.
    //        Our model of the EOF character differs slightly from the one in
    //        the spec because our treatment is uniform between network-sourced
    //        and script-sourced input streams whereas the spec treats them
    //        differently.
    return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile());
}

// Inserts |source| at the current insertion point (with line-number counting
// excluded for the inserted text) and pumps synchronously. Does nothing if the
// parser is already stopped.
void HTMLDocumentParser::insert(const SegmentedString& source)
{
    if (isStopped())
        return;

#ifdef ANDROID_INSTRUMENT
    // NOTE(review): the parsing TimeCounter is started here, but the only
    // record() calls in this file are in append(), which never calls start().
    // Confirm that this pairing is intended.
    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
#endif

    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    SegmentedString excludedLineNumberSource(source);
    excludedLineNumberSource.setExcludeLineNumbers();
    m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource);
    pumpTokenizerIfPossible(ForceSynchronous);

    endIfDelayed();
}

// Appends |source| to the end of the input stream (and to the preload scanner
// if one exists). If a pump session is already in progress the data is only
// queued; otherwise the tokenizer is pumped in AllowYield mode. Does nothing
// if the parser is already stopped.
void HTMLDocumentParser::append(const SegmentedString& source)
{
    if (isStopped())
        return;

    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    m_input.appendToEnd(source);
    if (m_preloadScanner)
        m_preloadScanner->appendToEnd(source);

    if (inPumpSession()) {
        // We've gotten data off the network in a nested write.
        // We don't want to consume any more of the input stream now.  Do
        // not worry.  We'll consume this data in a less-nested write().
#ifdef ANDROID_INSTRUMENT
        android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif
        return;
    }

    pumpTokenizerIfPossible(AllowYield);

    endIfDelayed();
#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif
}

// Final step of parsing: tells the tree builder that parsing has finished.
// Must not be called on a detached parser or while a resume is scheduled.
void HTMLDocumentParser::end()
{
    ASSERT(!isDetached());
    ASSERT(!isScheduledForResume());

    // Informs the rest of WebCore that parsing is really finished (and deletes this).
    m_treeBuilder->finished();
}

// Asks the script runner (if any) to execute the scripts that were waiting for
// parsing to finish; calls end() unless the script runner returns false.
void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd()
{
    ASSERT(isStopping());
    ASSERT(!hasInsertionPoint());
    if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing())
        return;
    end();
}

// Starts the stop-parsing sequence, or records that the end was delayed when
// shouldDelayEnd() says it cannot happen yet.
void HTMLDocumentParser::attemptToEnd()
{
    // finish() indicates we will not receive any more data. If we are waiting on
    // an external script to load, we can't finish parsing quite yet.

    if (shouldDelayEnd()) {
        m_endWasDelayed = true;
        return;
    }
    prepareToStopParsing();
}

// Completes an end that attemptToEnd() previously delayed, once
// shouldDelayEnd() no longer holds.
void HTMLDocumentParser::endIfDelayed()
{
    // If we've already been detached, don't bother ending.
    if (isDetached())
        return;

    if (!m_endWasDelayed || shouldDelayEnd())
        return;

    m_endWasDelayed = false;
    prepareToStopParsing();
}

// Marks end-of-file on the input stream (only once) and attempts to end
// parsing.
void HTMLDocumentParser::finish()
{
    // FIXME: We should ASSERT(!m_parserStopped) here, since it does not
    // make sense to call any methods on DocumentParser once it's been stopped.
    // However, FrameLoader::stop calls Document::finishParsing unconditionally
    // which in turn calls m_parser->finish().

    // We're not going to get any more data off the network, so we tell the
    // input stream we've reached the end of file.  finish() can be called more
    // than once, if the first time does not call end().
    if (!m_input.haveSeenEndOfFile())
        m_input.markEndOfFile();
    attemptToEnd();
}

// True once end-of-file has been marked on the input stream, which finish()
// does.
bool HTMLDocumentParser::finishWasCalled()
{
    return m_input.haveSeenEndOfFile();
}

// This function is virtual and just for the DocumentParser interface.
bool HTMLDocumentParser::isExecutingScript() const
{
    return inScriptExecution();
}

// This function is non-virtual and used throughout the implementation.
// Returns false when there is no script runner; otherwise reports whether the
// script runner is currently executing a script.
bool HTMLDocumentParser::inScriptExecution() const
{
    if (!m_scriptRunner)
        return false;
    return m_scriptRunner->isExecutingScript();
}

// Returns the source text for |token| as recorded by the source tracker.
String HTMLDocumentParser::sourceForToken(const HTMLToken& token)
{
    return m_sourceTracker.sourceForToken(token);
}

// Current line number as reported by the tokenizer.
int HTMLDocumentParser::lineNumber() const
{
    return m_tokenizer->lineNumber();
}

// Current line/column of the current input string. Asserts that the line
// agrees with the tokenizer's line number.
TextPosition0 HTMLDocumentParser::textPosition() const
{
    const SegmentedString& currentString = m_input.current();
    WTF::ZeroBasedNumber line = currentString.currentLine();
    WTF::ZeroBasedNumber column = currentString.currentColumn();
    ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt());

    return TextPosition0(line, column);
}

// The parser counts as waiting for scripts exactly when the tree builder is
// paused.
bool HTMLDocumentParser::isWaitingForScripts() const
{
    return m_treeBuilder->isPaused();
}

// Called once a blocking script has run and the tree builder is un-paused:
// drops the preload scanner, pumps again (yielding allowed) and completes a
// delayed end if one is pending.
void HTMLDocumentParser::resumeParsingAfterScriptExecution()
{
    ASSERT(!inScriptExecution());
    ASSERT(!m_treeBuilder->isPaused());

    m_preloadScanner.clear();
    pumpTokenizerIfPossible(AllowYield);
    endIfDelayed();
}

// Registers this parser as a client of |cachedScript|, which must not be
// loaded yet.
void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript)
{
    ASSERT(!cachedScript->isLoaded());
    // addClient would call notifyFinished if the load were complete.
    // Callers do not expect to be re-entered from this call, so they should
    // not pass an already-loaded CachedResource.
    cachedScript->addClient(this);
}

// Unregisters this parser as a client of |cachedScript|.
void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript)
{
    cachedScript->removeClient(this);
}

// Handles completion of |cachedResource|. While the parser is stopping, this
// retries running deferred scripts and ending. Otherwise it un-pauses the tree
// builder, lets the script runner execute the scripts that were waiting for
// the load, re-pauses if the runner says not to continue, and resumes parsing
// if it says to continue.
void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource)
{
    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    ASSERT(m_scriptRunner);
    ASSERT(!inScriptExecution());
    if (isStopping()) {
        attemptToRunDeferredScriptsAndEnd();
        return;
    }

    ASSERT(m_treeBuilder->isPaused());
    // Note: We only ever wait on one script at a time, so we always know this
    // is the one we were waiting on and can un-pause the tree builder.
    m_treeBuilder->setPaused(false);
    bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource);
    m_treeBuilder->setPaused(!shouldContinueParsing);
    if (shouldContinueParsing)
        resumeParsingAfterScriptExecution();
}

// Runs the scripts that were blocked on stylesheet loads, if there are any,
// using the same un-pause / execute / re-pause sequence as notifyFinished().
void HTMLDocumentParser::executeScriptsWaitingForStylesheets()
{
    // Document only calls this when the Document owns the DocumentParser
    // so this will not be called in the DocumentFragment case.
    ASSERT(m_scriptRunner);
    // Ignore calls unless we have a script blocking the parser waiting on a
    // stylesheet load.  Otherwise we are currently parsing and this
    // is a re-entrant call from encountering a </ style> tag.
    if (!m_scriptRunner->hasScriptsWaitingForStylesheets())
        return;

    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    ASSERT(!m_scriptRunner->isExecutingScript());
    ASSERT(m_treeBuilder->isPaused());
    // Note: We only ever wait on one script at a time, so we always know this
    // is the one we were waiting on and can un-pause the tree builder.
    m_treeBuilder->setPaused(false);
    bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets();
    m_treeBuilder->setPaused(!shouldContinueParsing);
    if (shouldContinueParsing)
        resumeParsingAfterScriptExecution();
}

// Returns the frame's ScriptController, or 0 when the document has no frame.
ScriptController* HTMLDocumentParser::script() const
{
    return document()->frame() ? document()->frame()->script() : 0;
}

// Parses |source| into |fragment| in one synchronous pass: creates a fragment
// parser, inserts the whole source, finishes, and detaches the parser.
void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission)
{
    RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission);
    parser->insert(source); // Use insert() so that the parser will not yield.
    parser->finish();
    ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151>
    parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction.
}

// True only when |document| has settings and those settings enable the
// pre-HTML5 parser quirks.
bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document)
{
    ASSERT(document);
    return document->settings() && document->settings()->usePreHTML5ParserQuirks();
}

// Suspends the parser scheduler, if there is one.
void HTMLDocumentParser::suspendScheduledTasks()
{
    if (m_parserScheduler)
        m_parserScheduler->suspend();
}

// Resumes the parser scheduler, if there is one.
void HTMLDocumentParser::resumeScheduledTasks()
{
    if (m_parserScheduler)
        m_parserScheduler->resume();
}

} // namespace WebCore