1/*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "core/html/parser/BackgroundHTMLParser.h"
28
29#include "core/html/parser/HTMLDocumentParser.h"
30#include "core/html/parser/TextResourceDecoder.h"
31#include "core/html/parser/XSSAuditor.h"
32#include "wtf/MainThread.h"
33#include "wtf/text/TextPosition.h"
34
35namespace blink {
36
// On a network with high latency and high bandwidth, using a device
// with a fast CPU, we could end up speculatively tokenizing
// the whole document, well ahead of when the main thread actually needs it.
// This is a waste of memory (and potentially time if the speculation fails).
// So we limit our outstanding tokens arbitrarily to 10,000.
// Our maximal memory spent speculating will be approximately:
// (outstandingTokenLimit + pendingTokenLimit) * sizeof(CompactHTMLToken)
// We use a separate low and high water mark to avoid constantly topping
// off the main thread's token buffer.
// At time of writing, this is (10000 + 1000) * 28 bytes = ~308KB of memory.
// These numbers have not been tuned.
static const size_t outstandingTokenLimit = 10000;

// We limit our chunks to 1000 tokens, to make sure the main
// thread is never waiting on the parser thread for tokens.
// This was tuned in https://bugs.webkit.org/show_bug.cgi?id=110408.
static const size_t pendingTokenLimit = 1000;
54
55using namespace HTMLNames;
56
57#if ENABLE(ASSERT)
58
59static void checkThatTokensAreSafeToSendToAnotherThread(const CompactHTMLTokenStream* tokens)
60{
61    for (size_t i = 0; i < tokens->size(); ++i)
62        ASSERT(tokens->at(i).isSafeToSendToAnotherThread());
63}
64
65static void checkThatPreloadsAreSafeToSendToAnotherThread(const PreloadRequestStream& preloads)
66{
67    for (size_t i = 0; i < preloads.size(); ++i)
68        ASSERT(preloads[i]->isSafeToSendToAnotherThread());
69}
70
71static void checkThatXSSInfosAreSafeToSendToAnotherThread(const XSSInfoStream& infos)
72{
73    for (size_t i = 0; i < infos.size(); ++i)
74        ASSERT(infos[i]->isSafeToSendToAnotherThread());
75}
76
77#endif
78
79void BackgroundHTMLParser::start(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
80{
81    new BackgroundHTMLParser(reference, config);
82    // Caller must free by calling stop().
83}
84
85BackgroundHTMLParser::BackgroundHTMLParser(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
86    : m_weakFactory(reference, this)
87    , m_token(adoptPtr(new HTMLToken))
88    , m_tokenizer(HTMLTokenizer::create(config->options))
89    , m_treeBuilderSimulator(config->options)
90    , m_options(config->options)
91    , m_parser(config->parser)
92    , m_pendingTokens(adoptPtr(new CompactHTMLTokenStream))
93    , m_xssAuditor(config->xssAuditor.release())
94    , m_preloadScanner(config->preloadScanner.release())
95    , m_decoder(config->decoder.release())
96{
97}
98
99BackgroundHTMLParser::~BackgroundHTMLParser()
100{
101}
102
103void BackgroundHTMLParser::appendRawBytesFromParserThread(const char* data, int dataLength)
104{
105    ASSERT(m_decoder);
106    updateDocument(m_decoder->decode(data, dataLength));
107}
108
109void BackgroundHTMLParser::appendRawBytesFromMainThread(PassOwnPtr<Vector<char> > buffer)
110{
111    ASSERT(m_decoder);
112    updateDocument(m_decoder->decode(buffer->data(), buffer->size()));
113}
114
115void BackgroundHTMLParser::appendDecodedBytes(const String& input)
116{
117    ASSERT(!m_input.current().isClosed());
118    m_input.append(input);
119    pumpTokenizer();
120}
121
122void BackgroundHTMLParser::setDecoder(PassOwnPtr<TextResourceDecoder> decoder)
123{
124    ASSERT(decoder);
125    m_decoder = decoder;
126}
127
128void BackgroundHTMLParser::flush()
129{
130    ASSERT(m_decoder);
131    updateDocument(m_decoder->flush());
132}
133
134void BackgroundHTMLParser::updateDocument(const String& decodedData)
135{
136    DocumentEncodingData encodingData(*m_decoder.get());
137
138    if (encodingData != m_lastSeenEncodingData) {
139        m_lastSeenEncodingData = encodingData;
140
141        m_xssAuditor->setEncoding(encodingData.encoding());
142        callOnMainThread(bind(&HTMLDocumentParser::didReceiveEncodingDataFromBackgroundParser, m_parser, encodingData));
143    }
144
145    if (decodedData.isEmpty())
146        return;
147
148    appendDecodedBytes(decodedData);
149}
150
151void BackgroundHTMLParser::resumeFrom(PassOwnPtr<Checkpoint> checkpoint)
152{
153    m_parser = checkpoint->parser;
154    m_token = checkpoint->token.release();
155    m_tokenizer = checkpoint->tokenizer.release();
156    m_treeBuilderSimulator.setState(checkpoint->treeBuilderState);
157    m_input.rewindTo(checkpoint->inputCheckpoint, checkpoint->unparsedInput);
158    m_preloadScanner->rewindTo(checkpoint->preloadScannerCheckpoint);
159    pumpTokenizer();
160}
161
162void BackgroundHTMLParser::startedChunkWithCheckpoint(HTMLInputCheckpoint inputCheckpoint)
163{
164    // Note, we should not have to worry about the index being invalid
165    // as messages from the main thread will be processed in FIFO order.
166    m_input.invalidateCheckpointsBefore(inputCheckpoint);
167    pumpTokenizer();
168}
169
170void BackgroundHTMLParser::finish()
171{
172    markEndOfFile();
173    pumpTokenizer();
174}
175
176void BackgroundHTMLParser::stop()
177{
178    delete this;
179}
180
181void BackgroundHTMLParser::forcePlaintextForTextDocument()
182{
183    // This is only used by the TextDocumentParser (a subclass of HTMLDocumentParser)
184    // to force us into the PLAINTEXT state w/o using a <plaintext> tag.
185    // The TextDocumentParser uses a <pre> tag for historical/compatibility reasons.
186    m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState);
187}
188
189void BackgroundHTMLParser::markEndOfFile()
190{
191    ASSERT(!m_input.current().isClosed());
192    m_input.append(String(&kEndOfFileMarker, 1));
193    m_input.close();
194}
195
196void BackgroundHTMLParser::pumpTokenizer()
197{
198    // No need to start speculating until the main thread has almost caught up.
199    if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit)
200        return;
201
202    while (true) {
203        m_sourceTracker.start(m_input.current(), m_tokenizer.get(), *m_token);
204        if (!m_tokenizer->nextToken(m_input.current(), *m_token)) {
205            // We've reached the end of our current input.
206            sendTokensToMainThread();
207            break;
208        }
209        m_sourceTracker.end(m_input.current(), m_tokenizer.get(), *m_token);
210
211        {
212            TextPosition position = TextPosition(m_input.current().currentLine(), m_input.current().currentColumn());
213
214            if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor->filterToken(FilterTokenRequest(*m_token, m_sourceTracker, m_tokenizer->shouldAllowCDATA()))) {
215                xssInfo->m_textPosition = position;
216                m_pendingXSSInfos.append(xssInfo.release());
217            }
218
219            CompactHTMLToken token(m_token.get(), TextPosition(m_input.current().currentLine(), m_input.current().currentColumn()));
220
221            m_preloadScanner->scan(token, m_input.current(), m_pendingPreloads);
222
223            m_pendingTokens->append(token);
224        }
225
226        m_token->clear();
227
228        if (!m_treeBuilderSimulator.simulate(m_pendingTokens->last(), m_tokenizer.get()) || m_pendingTokens->size() >= pendingTokenLimit) {
229            sendTokensToMainThread();
230            // If we're far ahead of the main thread, yield for a bit to avoid consuming too much memory.
231            if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit)
232                break;
233        }
234    }
235}
236
237void BackgroundHTMLParser::sendTokensToMainThread()
238{
239    if (m_pendingTokens->isEmpty())
240        return;
241
242#if ENABLE(ASSERT)
243    checkThatTokensAreSafeToSendToAnotherThread(m_pendingTokens.get());
244    checkThatPreloadsAreSafeToSendToAnotherThread(m_pendingPreloads);
245    checkThatXSSInfosAreSafeToSendToAnotherThread(m_pendingXSSInfos);
246#endif
247
248    OwnPtr<HTMLDocumentParser::ParsedChunk> chunk = adoptPtr(new HTMLDocumentParser::ParsedChunk);
249    chunk->preloads.swap(m_pendingPreloads);
250    chunk->xssInfos.swap(m_pendingXSSInfos);
251    chunk->tokenizerState = m_tokenizer->state();
252    chunk->treeBuilderState = m_treeBuilderSimulator.state();
253    chunk->inputCheckpoint = m_input.createCheckpoint(m_pendingTokens->size());
254    chunk->preloadScannerCheckpoint = m_preloadScanner->createCheckpoint();
255    chunk->tokens = m_pendingTokens.release();
256    callOnMainThread(bind(&HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser, m_parser, chunk.release()));
257
258    m_pendingTokens = adoptPtr(new CompactHTMLTokenStream);
259}
260
261}
262