1/*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "core/html/parser/BackgroundHTMLParser.h"
28
29#include "core/html/parser/HTMLDocumentParser.h"
30#include "core/html/parser/HTMLParserThread.h"
31#include "core/html/parser/HTMLTokenizer.h"
32#include "core/html/parser/XSSAuditor.h"
33#include "wtf/MainThread.h"
34#include "wtf/text/TextPosition.h"
35
36namespace WebCore {
37
38// On a network with high latency and high bandwidth, using a device
39// with a fast CPU, we could end up speculatively tokenizing
40// the whole document, well ahead of when the main-thread actually needs it.
41// This is a waste of memory (and potentially time if the speculation fails).
42// So we limit our outstanding speculations arbitrarily to 10.
43// Our maximal memory spent speculating will be approximately:
44// outstandingCheckpointLimit * pendingTokenLimit * sizeof(CompactToken)
45// We use a separate low and high water mark to avoid constantly topping
46// off the main thread's token buffer.
// At time of writing, this is 10 * 1000 * 28 bytes = approx. 280 KB of memory.
48// These numbers have not been tuned.
// Compared against m_input.outstandingCheckpointCount() in pumpTokenizer().
static const size_t outstandingCheckpointLimit = 10;
50
// We limit our chunks to 1000 tokens, to make sure the main
52// thread is never waiting on the parser thread for tokens.
53// This was tuned in https://bugs.webkit.org/show_bug.cgi?id=110408.
// pumpTokenizer() flushes a chunk once m_pendingTokens reaches this size.
static const size_t pendingTokenLimit = 1000;
55
56using namespace HTMLNames;
57
58#ifndef NDEBUG
59
60static void checkThatTokensAreSafeToSendToAnotherThread(const CompactHTMLTokenStream* tokens)
61{
62    for (size_t i = 0; i < tokens->size(); ++i)
63        ASSERT(tokens->at(i).isSafeToSendToAnotherThread());
64}
65
66static void checkThatPreloadsAreSafeToSendToAnotherThread(const PreloadRequestStream& preloads)
67{
68    for (size_t i = 0; i < preloads.size(); ++i)
69        ASSERT(preloads[i]->isSafeToSendToAnotherThread());
70}
71
72#endif
73
// Sets up the background parser from |config|. The tokenizer and the tree
// builder simulator are both created from config->options, and ownership of
// the XSS auditor and the preload scanner is taken over from |config| via
// release(). |reference| and |this| are passed to m_weakFactory.
BackgroundHTMLParser::BackgroundHTMLParser(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
    : m_weakFactory(reference, this)
    , m_token(adoptPtr(new HTMLToken))
    , m_tokenizer(HTMLTokenizer::create(config->options))
    , m_treeBuilderSimulator(config->options)
    , m_options(config->options)
    , m_parser(config->parser)
    , m_pendingTokens(adoptPtr(new CompactHTMLTokenStream))
    , m_xssAuditor(config->xssAuditor.release())
    , m_preloadScanner(config->preloadScanner.release())
{
}
86
// Adds |input| to the end of the buffered input and then runs the tokenizer
// pump. Asserts that the input has not been closed yet (markEndOfFile() is
// what closes it).
void BackgroundHTMLParser::append(const String& input)
{
    ASSERT(!m_input.current().isClosed());
    m_input.append(input);
    pumpTokenizer();
}
93
// Restores this parser's state from |checkpoint| and resumes tokenizing.
// The parser handle, token, tokenizer, tree builder state, input position
// (together with the unparsed input) and preload scanner position are all
// taken from |checkpoint|; the token and tokenizer are adopted via release().
void BackgroundHTMLParser::resumeFrom(PassOwnPtr<Checkpoint> checkpoint)
{
    m_parser = checkpoint->parser;
    m_token = checkpoint->token.release();
    m_tokenizer = checkpoint->tokenizer.release();
    m_treeBuilderSimulator.setState(checkpoint->treeBuilderState);
    m_input.rewindTo(checkpoint->inputCheckpoint, checkpoint->unparsedInput);
    m_preloadScanner->rewindTo(checkpoint->preloadScannerCheckpoint);
    // Continue tokenizing from the restored position.
    pumpTokenizer();
}
104
// Invalidates the input checkpoints that precede |inputCheckpoint| and then
// runs the tokenizer pump again.
// NOTE(review): presumably invalidating checkpoints lowers
// outstandingCheckpointCount(), letting a previously throttled pump make
// progress - confirm against the input stream implementation.
void BackgroundHTMLParser::startedChunkWithCheckpoint(HTMLInputCheckpoint inputCheckpoint)
{
    // Note, we should not have to worry about the index being invalid
    // as messages from the main thread will be processed in FIFO order.
    m_input.invalidateCheckpointsBefore(inputCheckpoint);
    pumpTokenizer();
}
112
// Marks the end of the input (appending the end-of-file marker and closing
// the stream) and then runs the tokenizer pump over what remains.
void BackgroundHTMLParser::finish()
{
    markEndOfFile();
    pumpTokenizer();
}
118
// Destroys this parser. The object deletes itself, so no member may be
// touched once this call has been made.
void BackgroundHTMLParser::stop()
{
    delete this;
}
123
// Puts the tokenizer directly into HTMLTokenizer::PLAINTEXTState.
void BackgroundHTMLParser::forcePlaintextForTextDocument()
{
    // This is only used by the TextDocumentParser (a subclass of HTMLDocumentParser)
    // to force us into the PLAINTEXT state w/o using a <plaintext> tag.
    // The TextDocumentParser uses a <pre> tag for historical/compatibility reasons.
    m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState);
}
131
// Appends the single-character end-of-file marker (kEndOfFileMarker) to the
// input and closes it. Asserts that the input has not already been closed.
void BackgroundHTMLParser::markEndOfFile()
{
    ASSERT(!m_input.current().isClosed());
    m_input.append(String(&kEndOfFileMarker, 1));
    m_input.close();
}
138
139void BackgroundHTMLParser::pumpTokenizer()
140{
141    // No need to start speculating until the main thread has almost caught up.
142    if (m_input.outstandingCheckpointCount() > outstandingCheckpointLimit)
143        return;
144
145    while (true) {
146        m_sourceTracker.start(m_input.current(), m_tokenizer.get(), *m_token);
147        if (!m_tokenizer->nextToken(m_input.current(), *m_token)) {
148            // We've reached the end of our current input.
149            sendTokensToMainThread();
150            break;
151        }
152        m_sourceTracker.end(m_input.current(), m_tokenizer.get(), *m_token);
153
154        {
155            TextPosition position = TextPosition(m_input.current().currentLine(), m_input.current().currentColumn());
156
157            if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor->filterToken(FilterTokenRequest(*m_token, m_sourceTracker, m_tokenizer->shouldAllowCDATA()))) {
158                xssInfo->m_textPosition = position;
159                m_pendingXSSInfos.append(xssInfo.release());
160            }
161
162            CompactHTMLToken token(m_token.get(), TextPosition(m_input.current().currentLine(), m_input.current().currentColumn()));
163
164            m_preloadScanner->scan(token, m_input.current(), m_pendingPreloads);
165
166            m_pendingTokens->append(token);
167        }
168
169        m_token->clear();
170
171        if (!m_treeBuilderSimulator.simulate(m_pendingTokens->last(), m_tokenizer.get()) || m_pendingTokens->size() >= pendingTokenLimit) {
172            sendTokensToMainThread();
173            // If we're far ahead of the main thread, yield for a bit to avoid consuming too much memory.
174            if (m_input.outstandingCheckpointCount() > outstandingCheckpointLimit)
175                break;
176        }
177    }
178}
179
180void BackgroundHTMLParser::sendTokensToMainThread()
181{
182    if (m_pendingTokens->isEmpty())
183        return;
184
185#ifndef NDEBUG
186    checkThatTokensAreSafeToSendToAnotherThread(m_pendingTokens.get());
187    checkThatPreloadsAreSafeToSendToAnotherThread(m_pendingPreloads);
188#endif
189
190    OwnPtr<HTMLDocumentParser::ParsedChunk> chunk = adoptPtr(new HTMLDocumentParser::ParsedChunk);
191    chunk->tokens = m_pendingTokens.release();
192    chunk->preloads.swap(m_pendingPreloads);
193    chunk->xssInfos.swap(m_pendingXSSInfos);
194    chunk->tokenizerState = m_tokenizer->state();
195    chunk->treeBuilderState = m_treeBuilderSimulator.state();
196    chunk->inputCheckpoint = m_input.createCheckpoint();
197    chunk->preloadScannerCheckpoint = m_preloadScanner->createCheckpoint();
198    callOnMainThread(bind(&HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser, m_parser, chunk.release()));
199
200    m_pendingTokens = adoptPtr(new CompactHTMLTokenStream);
201}
202
203}
204