1/* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#include "config.h" 27#include "HTMLDocumentParser.h" 28 29#include "ContentSecurityPolicy.h" 30#include "DocumentFragment.h" 31#include "Element.h" 32#include "Frame.h" 33#include "HTMLNames.h" 34#include "HTMLParserScheduler.h" 35#include "HTMLTokenizer.h" 36#include "HTMLPreloadScanner.h" 37#include "HTMLScriptRunner.h" 38#include "HTMLTreeBuilder.h" 39#include "HTMLDocument.h" 40#include "InspectorInstrumentation.h" 41#include "NestingLevelIncrementer.h" 42#include "Settings.h" 43 44#ifdef ANDROID_INSTRUMENT 45#include "TimeCounter.h" 46#endif 47 48namespace WebCore { 49 50using namespace HTMLNames; 51 52namespace { 53 54// This is a direct transcription of step 4 from: 55// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case 56HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors) 57{ 58 if (!contextElement) 59 return HTMLTokenizer::DataState; 60 61 const QualifiedName& contextTag = contextElement->tagQName(); 62 63 if (contextTag.matches(titleTag) || contextTag.matches(textareaTag)) 64 return HTMLTokenizer::RCDATAState; 65 if (contextTag.matches(styleTag) 66 || contextTag.matches(xmpTag) 67 || contextTag.matches(iframeTag) 68 || (contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame())) 69 || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame())) 70 || contextTag.matches(noframesTag)) 71 return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState; 72 if (contextTag.matches(scriptTag)) 73 return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState; 74 if (contextTag.matches(plaintextTag)) 75 return HTMLTokenizer::PLAINTEXTState; 76 return HTMLTokenizer::DataState; 77} 78 79} // namespace 80 81HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors) 82 : ScriptableDocumentParser(document) 83 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document))) 84 , m_scriptRunner(HTMLScriptRunner::create(document, this)) 85 , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document))) 86 , m_parserScheduler(HTMLParserScheduler::create(this)) 87 , m_xssFilter(this) 88 , m_endWasDelayed(false) 89 , m_pumpSessionNestingLevel(0) 90{ 91} 92 93// FIXME: Member variables should be grouped into self-initializing structs to 94// minimize code duplication between these constructors. 95HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) 96 : ScriptableDocumentParser(fragment->document()) 97 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document()))) 98 , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document()))) 99 , m_xssFilter(this) 100 , m_endWasDelayed(false) 101 , m_pumpSessionNestingLevel(0) 102{ 103 bool reportErrors = false; // For now document fragment parsing never reports errors. 104 m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors)); 105} 106 107HTMLDocumentParser::~HTMLDocumentParser() 108{ 109 ASSERT(!m_parserScheduler); 110 ASSERT(!m_pumpSessionNestingLevel); 111 ASSERT(!m_preloadScanner); 112} 113 114void HTMLDocumentParser::detach() 115{ 116 DocumentParser::detach(); 117 if (m_scriptRunner) 118 m_scriptRunner->detach(); 119 m_treeBuilder->detach(); 120 // FIXME: It seems wrong that we would have a preload scanner here. 121 // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do. 122 m_preloadScanner.clear(); 123 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. 124} 125 126void HTMLDocumentParser::stopParsing() 127{ 128 DocumentParser::stopParsing(); 129 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. 130} 131 132// This kicks off "Once the user agent stops parsing" as described by: 133// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end 134void HTMLDocumentParser::prepareToStopParsing() 135{ 136 ASSERT(!hasInsertionPoint()); 137 138 // pumpTokenizer can cause this parser to be detached from the Document, 139 // but we need to ensure it isn't deleted yet. 140 RefPtr<HTMLDocumentParser> protect(this); 141 142 // NOTE: This pump should only ever emit buffered character tokens, 143 // so ForceSynchronous vs. AllowYield should be meaningless. 144 pumpTokenizerIfPossible(ForceSynchronous); 145 146 if (isStopped()) 147 return; 148 149 DocumentParser::prepareToStopParsing(); 150 151 // We will not have a scriptRunner when parsing a DocumentFragment. 152 if (m_scriptRunner) 153 document()->setReadyState(Document::Interactive); 154 155 attemptToRunDeferredScriptsAndEnd(); 156} 157 158bool HTMLDocumentParser::isParsingFragment() const 159{ 160 return m_treeBuilder->isParsingFragment(); 161} 162 163bool HTMLDocumentParser::processingData() const 164{ 165 return isScheduledForResume() || inPumpSession(); 166} 167 168void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode) 169{ 170 if (isStopped() || m_treeBuilder->isPaused()) 171 return; 172 173 // Once a resume is scheduled, HTMLParserScheduler controls when we next pump. 174 if (isScheduledForResume()) { 175 ASSERT(mode == AllowYield); 176 return; 177 } 178 179 pumpTokenizer(mode); 180} 181 182bool HTMLDocumentParser::isScheduledForResume() const 183{ 184 return m_parserScheduler && m_parserScheduler->isScheduledForResume(); 185} 186 187// Used by HTMLParserScheduler 188void HTMLDocumentParser::resumeParsingAfterYield() 189{ 190 // pumpTokenizer can cause this parser to be detached from the Document, 191 // but we need to ensure it isn't deleted yet. 192 RefPtr<HTMLDocumentParser> protect(this); 193 194 // We should never be here unless we can pump immediately. Call pumpTokenizer() 195 // directly so that ASSERTS will fire if we're wrong. 196 pumpTokenizer(AllowYield); 197 endIfDelayed(); 198} 199 200bool HTMLDocumentParser::runScriptsForPausedTreeBuilder() 201{ 202 ASSERT(m_treeBuilder->isPaused()); 203 204 TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition(); 205 RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition); 206 // We will not have a scriptRunner when parsing a DocumentFragment. 207 if (!m_scriptRunner) 208 return true; 209 return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition); 210} 211 212bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session) 213{ 214 if (isStopped()) 215 return false; 216 217 // The parser will pause itself when waiting on a script to load or run. 218 if (m_treeBuilder->isPaused()) { 219 if (mode == AllowYield) 220 m_parserScheduler->checkForYieldBeforeScript(session); 221 222 // If we don't run the script, we cannot allow the next token to be taken. 223 if (session.needsYield) 224 return false; 225 226 // If we're paused waiting for a script, we try to execute scripts before continuing. 227 bool shouldContinueParsing = runScriptsForPausedTreeBuilder(); 228 m_treeBuilder->setPaused(!shouldContinueParsing); 229 if (!shouldContinueParsing || isStopped()) 230 return false; 231 } 232 233 // FIXME: It's wrong for the HTMLDocumentParser to reach back to the 234 // Frame, but this approach is how the old parser handled 235 // stopping when the page assigns window.location. What really 236 // should happen is that assigning window.location causes the 237 // parser to stop parsing cleanly. The problem is we're not 238 // perpared to do that at every point where we run JavaScript. 239 if (!isParsingFragment() 240 && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending()) 241 return false; 242 243 if (mode == AllowYield) 244 m_parserScheduler->checkForYieldBeforeToken(session); 245 246 return true; 247} 248 249void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode) 250{ 251 ASSERT(!isStopped()); 252 ASSERT(!isScheduledForResume()); 253 // ASSERT that this object is both attached to the Document and protected. 254 ASSERT(refCount() >= 2); 255 256 PumpSession session(m_pumpSessionNestingLevel); 257 258 // We tell the InspectorInstrumentation about every pump, even if we 259 // end up pumping nothing. It can filter out empty pumps itself. 260 // FIXME: m_input.current().length() is only accurate if we 261 // end up parsing the whole buffer in this pump. We should pass how 262 // much we parsed as part of didWriteHTML instead of willWriteHTML. 263 InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber()); 264 265 while (canTakeNextToken(mode, session) && !session.needsYield) { 266 if (!isParsingFragment()) 267 m_sourceTracker.start(m_input, m_token); 268 269 if (!m_tokenizer->nextToken(m_input.current(), m_token)) 270 break; 271 272 if (!isParsingFragment()) { 273 m_sourceTracker.end(m_input, m_token); 274 275 // We do not XSS filter innerHTML, which means we (intentionally) fail 276 // http/tests/security/xssAuditor/dom-write-innerHTML.html 277 m_xssFilter.filterToken(m_token); 278 } 279 280 m_treeBuilder->constructTreeFromToken(m_token); 281 ASSERT(m_token.isUninitialized()); 282 } 283 284 // Ensure we haven't been totally deref'ed after pumping. Any caller of this 285 // function should be holding a RefPtr to this to ensure we weren't deleted. 286 ASSERT(refCount() >= 1); 287 288 if (isStopped()) 289 return; 290 291 if (session.needsYield) 292 m_parserScheduler->scheduleForResume(); 293 294 if (isWaitingForScripts()) { 295 ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState); 296 if (!m_preloadScanner) { 297 m_preloadScanner.set(new HTMLPreloadScanner(document())); 298 m_preloadScanner->appendToEnd(m_input.current()); 299 } 300 m_preloadScanner->scan(); 301 } 302 303 InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber()); 304} 305 306bool HTMLDocumentParser::hasInsertionPoint() 307{ 308 // FIXME: The wasCreatedByScript() branch here might not be fully correct. 309 // Our model of the EOF character differs slightly from the one in 310 // the spec because our treatment is uniform between network-sourced 311 // and script-sourced input streams whereas the spec treats them 312 // differently. 313 return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile()); 314} 315 316void HTMLDocumentParser::insert(const SegmentedString& source) 317{ 318 if (isStopped()) 319 return; 320 321#ifdef ANDROID_INSTRUMENT 322 android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter); 323#endif 324 325 // pumpTokenizer can cause this parser to be detached from the Document, 326 // but we need to ensure it isn't deleted yet. 327 RefPtr<HTMLDocumentParser> protect(this); 328 329 SegmentedString excludedLineNumberSource(source); 330 excludedLineNumberSource.setExcludeLineNumbers(); 331 m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource); 332 pumpTokenizerIfPossible(ForceSynchronous); 333 334 if (isWaitingForScripts()) { 335 // Check the document.write() output with a separate preload scanner as 336 // the main scanner can't deal with insertions. 337 HTMLPreloadScanner preloadScanner(document()); 338 preloadScanner.appendToEnd(source); 339 preloadScanner.scan(); 340 } 341 342 endIfDelayed(); 343} 344 345void HTMLDocumentParser::append(const SegmentedString& source) 346{ 347 if (isStopped()) 348 return; 349 350 // pumpTokenizer can cause this parser to be detached from the Document, 351 // but we need to ensure it isn't deleted yet. 352 RefPtr<HTMLDocumentParser> protect(this); 353 354 if (m_preloadScanner) { 355 if (m_input.current().isEmpty() && !isWaitingForScripts()) { 356 // We have parsed until the end of the current input and so are now moving ahead of the preload scanner. 357 // Clear the scanner so we know to scan starting from the current input point if we block again. 358 m_preloadScanner.clear(); 359 } else { 360 m_preloadScanner->appendToEnd(source); 361 if (isWaitingForScripts()) 362 m_preloadScanner->scan(); 363 } 364 } 365 366 m_input.appendToEnd(source); 367 368 if (inPumpSession()) { 369 // We've gotten data off the network in a nested write. 370 // We don't want to consume any more of the input stream now. Do 371 // not worry. We'll consume this data in a less-nested write(). 372#ifdef ANDROID_INSTRUMENT 373 android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__); 374#endif 375 return; 376 } 377 378 pumpTokenizerIfPossible(AllowYield); 379 380 endIfDelayed(); 381#ifdef ANDROID_INSTRUMENT 382 android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__); 383#endif 384} 385 386void HTMLDocumentParser::end() 387{ 388 ASSERT(!isDetached()); 389 ASSERT(!isScheduledForResume()); 390 391 // Informs the the rest of WebCore that parsing is really finished (and deletes this). 392 m_treeBuilder->finished(); 393} 394 395void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd() 396{ 397 ASSERT(isStopping()); 398 ASSERT(!hasInsertionPoint()); 399 if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing()) 400 return; 401 end(); 402} 403 404void HTMLDocumentParser::attemptToEnd() 405{ 406 // finish() indicates we will not receive any more data. If we are waiting on 407 // an external script to load, we can't finish parsing quite yet. 408 409 if (shouldDelayEnd()) { 410 m_endWasDelayed = true; 411 return; 412 } 413 prepareToStopParsing(); 414} 415 416void HTMLDocumentParser::endIfDelayed() 417{ 418 // If we've already been detached, don't bother ending. 419 if (isDetached()) 420 return; 421 422 if (!m_endWasDelayed || shouldDelayEnd()) 423 return; 424 425 m_endWasDelayed = false; 426 prepareToStopParsing(); 427} 428 429void HTMLDocumentParser::finish() 430{ 431 // FIXME: We should ASSERT(!m_parserStopped) here, since it does not 432 // makes sense to call any methods on DocumentParser once it's been stopped. 433 // However, FrameLoader::stop calls Document::finishParsing unconditionally 434 // which in turn calls m_parser->finish(). 435 436 // We're not going to get any more data off the network, so we tell the 437 // input stream we've reached the end of file. finish() can be called more 438 // than once, if the first time does not call end(). 439 if (!m_input.haveSeenEndOfFile()) 440 m_input.markEndOfFile(); 441 attemptToEnd(); 442} 443 444bool HTMLDocumentParser::finishWasCalled() 445{ 446 return m_input.haveSeenEndOfFile(); 447} 448 449// This function is virtual and just for the DocumentParser interface. 450bool HTMLDocumentParser::isExecutingScript() const 451{ 452 return inScriptExecution(); 453} 454 455// This function is non-virtual and used throughout the implementation. 456bool HTMLDocumentParser::inScriptExecution() const 457{ 458 if (!m_scriptRunner) 459 return false; 460 return m_scriptRunner->isExecutingScript(); 461} 462 463String HTMLDocumentParser::sourceForToken(const HTMLToken& token) 464{ 465 return m_sourceTracker.sourceForToken(token); 466} 467 468int HTMLDocumentParser::lineNumber() const 469{ 470 return m_tokenizer->lineNumber(); 471} 472 473TextPosition0 HTMLDocumentParser::textPosition() const 474{ 475 const SegmentedString& currentString = m_input.current(); 476 WTF::ZeroBasedNumber line = currentString.currentLine(); 477 WTF::ZeroBasedNumber column = currentString.currentColumn(); 478 ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt()); 479 480 return TextPosition0(line, column); 481} 482 483bool HTMLDocumentParser::isWaitingForScripts() const 484{ 485 return m_treeBuilder->isPaused(); 486} 487 488void HTMLDocumentParser::resumeParsingAfterScriptExecution() 489{ 490 ASSERT(!inScriptExecution()); 491 ASSERT(!m_treeBuilder->isPaused()); 492 493 pumpTokenizerIfPossible(AllowYield); 494 endIfDelayed(); 495} 496 497void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript) 498{ 499 ASSERT(!cachedScript->isLoaded()); 500 // addClient would call notifyFinished if the load were complete. 501 // Callers do not expect to be re-entered from this call, so they should 502 // not an already-loaded CachedResource. 503 cachedScript->addClient(this); 504} 505 506void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript) 507{ 508 cachedScript->removeClient(this); 509} 510 511void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan() 512{ 513 ASSERT(m_preloadScanner); 514 m_preloadScanner->appendToEnd(m_input.current()); 515 m_preloadScanner->scan(); 516} 517 518void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource) 519{ 520 // pumpTokenizer can cause this parser to be detached from the Document, 521 // but we need to ensure it isn't deleted yet. 522 RefPtr<HTMLDocumentParser> protect(this); 523 524 ASSERT(m_scriptRunner); 525 ASSERT(!inScriptExecution()); 526 if (isStopping()) { 527 attemptToRunDeferredScriptsAndEnd(); 528 return; 529 } 530 531 ASSERT(m_treeBuilder->isPaused()); 532 // Note: We only ever wait on one script at a time, so we always know this 533 // is the one we were waiting on and can un-pause the tree builder. 534 m_treeBuilder->setPaused(false); 535 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource); 536 m_treeBuilder->setPaused(!shouldContinueParsing); 537 if (shouldContinueParsing) 538 resumeParsingAfterScriptExecution(); 539} 540 541void HTMLDocumentParser::executeScriptsWaitingForStylesheets() 542{ 543 // Document only calls this when the Document owns the DocumentParser 544 // so this will not be called in the DocumentFragment case. 545 ASSERT(m_scriptRunner); 546 // Ignore calls unless we have a script blocking the parser waiting on a 547 // stylesheet load. Otherwise we are currently parsing and this 548 // is a re-entrant call from encountering a </ style> tag. 549 if (!m_scriptRunner->hasScriptsWaitingForStylesheets()) 550 return; 551 552 // pumpTokenizer can cause this parser to be detached from the Document, 553 // but we need to ensure it isn't deleted yet. 554 RefPtr<HTMLDocumentParser> protect(this); 555 556 ASSERT(!m_scriptRunner->isExecutingScript()); 557 ASSERT(m_treeBuilder->isPaused()); 558 // Note: We only ever wait on one script at a time, so we always know this 559 // is the one we were waiting on and can un-pause the tree builder. 560 m_treeBuilder->setPaused(false); 561 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets(); 562 m_treeBuilder->setPaused(!shouldContinueParsing); 563 if (shouldContinueParsing) 564 resumeParsingAfterScriptExecution(); 565} 566 567ScriptController* HTMLDocumentParser::script() const 568{ 569 return document()->frame() ? document()->frame()->script() : 0; 570} 571 572void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) 573{ 574 RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission); 575 parser->insert(source); // Use insert() so that the parser will not yield. 576 parser->finish(); 577 ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151> 578 parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction. 579} 580 581bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document) 582{ 583 ASSERT(document); 584 return document->settings() && document->settings()->usePreHTML5ParserQuirks(); 585} 586 587void HTMLDocumentParser::suspendScheduledTasks() 588{ 589 if (m_parserScheduler) 590 m_parserScheduler->suspend(); 591} 592 593void HTMLDocumentParser::resumeScheduledTasks() 594{ 595 if (m_parserScheduler) 596 m_parserScheduler->resume(); 597} 598 599} 600