1/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31// How we handle the base tag better.
32// Current status:
// At present, the normal way we handle the base tag is:
34// a) For those links which have corresponding local saved files, such as
35// savable CSS, JavaScript files, they will be written to relative URLs which
36// point to local saved file. Why those links can not be resolved as absolute
37// file URLs, because if they are resolved as absolute URLs, after moving the
38// file location from one directory to another directory, the file URLs will
39// be dead links.
40// b) For those links which have not corresponding local saved files, such as
41// links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
43// FireFox also uses above way to handle base tag.
44//
45// Problem:
46// This way can not handle the following situation:
47// the base tag is written by JavaScript.
48// For example. The page "www.yahoo.com" use
49// "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
50// of page when loading page. So when saving page as completed-HTML, we assume
51// that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
52// completed-HTML page, then the JavaScript will insert a base tag
53// <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
54// local saved resource files will be resolved as
55// "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
56// files can not be loaded correctly. Also the page will be rendered ugly since
57// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
58// files can not be fetched.
59// Now FireFox, IE and WebKit based Browser all have this problem.
60//
61// Solution:
62// My solution is that we comment old base tag and write new base tag:
63// <base href="." ...> after the previous commented base tag. In WebKit, it
64// always uses the latest "href" attribute of base tag to set document's base
65// URL. Based on this behavior, when we encounter a base tag, we comment it and
66// write a new base tag <base href="."> after the previous commented base tag.
67// The new added base tag can help engine to locate correct base URL for
68// correctly loading local saved resource files. Also I think we need to inherit
69// the base target value from document object when appending new base tag.
70// If there are multiple base tags in original document, we will comment all old
71// base tags and append new base tag after each old base tag because we do not
72// know those old base tags are original content or added by JavaScript. If
73// they are added by JavaScript, it means when loading saved page, the script(s)
// will still insert base tag(s) into the DOM, so the newly added base tag(s)
// can override the incorrect base URL and make sure we always load the correct
// locally saved resource files.
77
78#include "config.h"
79#include "web/WebPageSerializerImpl.h"
80
81#include "core/HTMLNames.h"
82#include "core/dom/Document.h"
83#include "core/dom/DocumentType.h"
84#include "core/dom/Element.h"
85#include "core/editing/markup.h"
86#include "core/html/HTMLAllCollection.h"
87#include "core/html/HTMLElement.h"
88#include "core/html/HTMLFormElement.h"
89#include "core/html/HTMLHtmlElement.h"
90#include "core/html/HTMLMetaElement.h"
91#include "core/loader/DocumentLoader.h"
92#include "core/loader/FrameLoader.h"
93#include "public/platform/WebVector.h"
94#include "web/WebLocalFrameImpl.h"
95#include "wtf/text/TextEncoding.h"
96
97namespace blink {
98
// Soft cap on the length of the data buffer that temporarily holds generated
// HTML content before it is flushed to the client. The buffer may grow past
// this limit when a single very large contiguous string is appended.
static const unsigned dataBufferCapacity = 64 * 1024;
103
104WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
105                                                            const WTF::TextEncoding& textEncoding,
106                                                            Document* document,
107                                                            const String& directoryName)
108    : url(url)
109    , textEncoding(textEncoding)
110    , document(document)
111    , directoryName(directoryName)
112    , isHTMLDocument(document->isHTMLDocument())
113    , haveSeenDocType(false)
114    , haveAddedCharsetDeclaration(false)
115    , skipMetaElement(0)
116    , isInScriptOrStyleTag(false)
117    , haveAddedXMLProcessingDirective(false)
118    , haveAddedContentsBeforeEnd(false)
119{
120}
121
122String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
123    const Element* element, SerializeDomParam* param, bool* needSkip)
124{
125    StringBuilder result;
126
127    *needSkip = false;
128    if (param->isHTMLDocument) {
129        // Skip the open tag of original META tag which declare charset since we
130        // have overrided the META which have correct charset declaration after
131        // serializing open tag of HEAD element.
132        ASSERT(element);
133        if (isHTMLMetaElement(*element)) {
134            const HTMLMetaElement& meta = toHTMLMetaElement(*element);
135            // Check whether the META tag has declared charset or not.
136            String equiv = meta.httpEquiv();
137            if (equalIgnoringCase(equiv, "content-type")) {
138                String content = meta.content();
139                if (content.length() && content.contains("charset", false)) {
140                    // Find META tag declared charset, we need to skip it when
141                    // serializing DOM.
142                    param->skipMetaElement = element;
143                    *needSkip = true;
144                }
145            }
146        } else if (isHTMLHtmlElement(*element)) {
147            // Check something before processing the open tag of HEAD element.
148            // First we add doc type declaration if original document has it.
149            if (!param->haveSeenDocType) {
150                param->haveSeenDocType = true;
151                result.append(createMarkup(param->document->doctype()));
152            }
153
154            // Add MOTW declaration before html tag.
155            // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
156            result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
157        } else if (isHTMLBaseElement(*element)) {
158            // Comment the BASE tag when serializing dom.
159            result.appendLiteral("<!--");
160        }
161    } else {
162        // Write XML declaration.
163        if (!param->haveAddedXMLProcessingDirective) {
164            param->haveAddedXMLProcessingDirective = true;
165            // Get encoding info.
166            String xmlEncoding = param->document->xmlEncoding();
167            if (xmlEncoding.isEmpty())
168                xmlEncoding = param->document->encodingName();
169            if (xmlEncoding.isEmpty())
170                xmlEncoding = UTF8Encoding().name();
171            result.appendLiteral("<?xml version=\"");
172            result.append(param->document->xmlVersion());
173            result.appendLiteral("\" encoding=\"");
174            result.append(xmlEncoding);
175            if (param->document->xmlStandalone())
176                result.appendLiteral("\" standalone=\"yes");
177            result.appendLiteral("\"?>\n");
178        }
179        // Add doc type declaration if original document has it.
180        if (!param->haveSeenDocType) {
181            param->haveSeenDocType = true;
182            result.append(createMarkup(param->document->doctype()));
183        }
184    }
185    return result.toString();
186}
187
188String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
189    const Element* element, SerializeDomParam* param)
190{
191    StringBuilder result;
192
193    param->haveAddedContentsBeforeEnd = false;
194    if (!param->isHTMLDocument)
195        return result.toString();
196    // Check after processing the open tag of HEAD element
197    if (!param->haveAddedCharsetDeclaration
198        && isHTMLHeadElement(*element)) {
199        param->haveAddedCharsetDeclaration = true;
200        // Check meta element. WebKit only pre-parse the first 512 bytes
201        // of the document. If the whole <HEAD> is larger and meta is the
202        // end of head part, then this kind of pages aren't decoded correctly
203        // because of this issue. So when we serialize the DOM, we need to
204        // make sure the meta will in first child of head tag.
205        // See http://bugs.webkit.org/show_bug.cgi?id=16621.
206        // First we generate new content for writing correct META element.
207        result.append(WebPageSerializer::generateMetaCharsetDeclaration(
208            String(param->textEncoding.name())));
209
210        param->haveAddedContentsBeforeEnd = true;
211        // Will search each META which has charset declaration, and skip them all
212        // in PreActionBeforeSerializeOpenTag.
213    } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
214        param->isInScriptOrStyleTag = true;
215    }
216
217    return result.toString();
218}
219
220String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
221    const Element* element, SerializeDomParam* param, bool* needSkip)
222{
223    String result;
224
225    *needSkip = false;
226    if (!param->isHTMLDocument)
227        return result;
228    // Skip the end tag of original META tag which declare charset.
229    // Need not to check whether it's META tag since we guarantee
230    // skipMetaElement is definitely META tag if it's not 0.
231    if (param->skipMetaElement == element) {
232        *needSkip = true;
233    } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
234        ASSERT(param->isInScriptOrStyleTag);
235        param->isInScriptOrStyleTag = false;
236    }
237
238    return result;
239}
240
241// After we finish serializing end tag of a element, we give the target
242// element a chance to do some post work to add some additional data.
243String WebPageSerializerImpl::postActionAfterSerializeEndTag(
244    const Element* element, SerializeDomParam* param)
245{
246    StringBuilder result;
247
248    if (!param->isHTMLDocument)
249        return result.toString();
250    // Comment the BASE tag when serializing DOM.
251    if (isHTMLBaseElement(*element)) {
252        result.appendLiteral("-->");
253        // Append a new base tag declaration.
254        result.append(WebPageSerializer::generateBaseTagDeclaration(
255            param->document->baseTarget()));
256    }
257
258    return result.toString();
259}
260
261void WebPageSerializerImpl::saveHTMLContentToBuffer(
262    const String& result, SerializeDomParam* param)
263{
264    m_dataBuffer.append(result);
265    encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
266                         param,
267                         DoNotForceFlush);
268}
269
270void WebPageSerializerImpl::encodeAndFlushBuffer(
271    WebPageSerializerClient::PageSerializationStatus status,
272    SerializeDomParam* param,
273    FlushOption flushOption)
274{
275    // Data buffer is not full nor do we want to force flush.
276    if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
277        return;
278
279    String content = m_dataBuffer.toString();
280    m_dataBuffer.clear();
281
282    CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
283
284    // Send result to the client.
285    m_client->didSerializeDataForFrame(param->url,
286                                       WebCString(encodedContent.data(), encodedContent.length()),
287                                       status);
288}
289
290void WebPageSerializerImpl::openTagToString(Element* element,
291                                            SerializeDomParam* param)
292{
293    bool needSkip;
294    StringBuilder result;
295    // Do pre action for open tag.
296    result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
297    if (needSkip)
298        return;
299    // Add open tag
300    result.append('<');
301    result.append(element->nodeName().lower());
302    // Go through all attributes and serialize them.
303    AttributeCollection attributes = element->attributes();
304    AttributeCollection::iterator end = attributes.end();
305    for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {
306        result.append(' ');
307        // Add attribute pair
308        result.append(it->name().toString());
309        result.appendLiteral("=\"");
310        if (!it->value().isEmpty()) {
311            const String& attrValue = it->value();
312
313            // Check whether we need to replace some resource links
314            // with local resource paths.
315            const QualifiedName& attrName = it->name();
316            if (element->hasLegalLinkAttribute(attrName)) {
317                // For links start with "javascript:", we do not change it.
318                if (attrValue.startsWith("javascript:", false)) {
319                    result.append(attrValue);
320                } else {
321                    // Get the absolute link
322                    WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
323                    String completeURL = subFrame ? subFrame->frame()->document()->url() :
324                                                    param->document->completeURL(attrValue);
325                    // Check whether we have local files for those link.
326                    if (m_localLinks.contains(completeURL)) {
327                        if (!param->directoryName.isEmpty()) {
328                            result.appendLiteral("./");
329                            result.append(param->directoryName);
330                            result.append('/');
331                        }
332                        result.append(m_localLinks.get(completeURL));
333                    } else {
334                        result.append(completeURL);
335                    }
336                }
337            } else {
338                if (param->isHTMLDocument)
339                    result.append(m_htmlEntities.convertEntitiesInString(attrValue));
340                else
341                    result.append(m_xmlEntities.convertEntitiesInString(attrValue));
342            }
343        }
344        result.append('\"');
345    }
346
347    // Do post action for open tag.
348    String addedContents = postActionAfterSerializeOpenTag(element, param);
349    // Complete the open tag for element when it has child/children.
350    if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
351        result.append('>');
352    // Append the added contents generate in  post action of open tag.
353    result.append(addedContents);
354    // Save the result to data buffer.
355    saveHTMLContentToBuffer(result.toString(), param);
356}
357
358// Serialize end tag of an specified element.
359void WebPageSerializerImpl::endTagToString(Element* element,
360                                           SerializeDomParam* param)
361{
362    bool needSkip;
363    StringBuilder result;
364    // Do pre action for end tag.
365    result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
366    if (needSkip)
367        return;
368    // Write end tag when element has child/children.
369    if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
370        result.appendLiteral("</");
371        result.append(element->nodeName().lower());
372        result.append('>');
373    } else {
374        // Check whether we have to write end tag for empty element.
375        if (param->isHTMLDocument) {
376            result.append('>');
377            // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
378            if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
379                // We need to write end tag when it is required.
380                result.appendLiteral("</");
381                result.append(element->nodeName().lower());
382                result.append('>');
383            }
384        } else {
385            // For xml base document.
386            result.appendLiteral(" />");
387        }
388    }
389    // Do post action for end tag.
390    result.append(postActionAfterSerializeEndTag(element, param));
391    // Save the result to data buffer.
392    saveHTMLContentToBuffer(result.toString(), param);
393}
394
395void WebPageSerializerImpl::buildContentForNode(Node* node,
396                                                SerializeDomParam* param)
397{
398    switch (node->nodeType()) {
399    case Node::ELEMENT_NODE:
400        // Process open tag of element.
401        openTagToString(toElement(node), param);
402        // Walk through the children nodes and process it.
403        for (Node *child = node->firstChild(); child; child = child->nextSibling())
404            buildContentForNode(child, param);
405        // Process end tag of element.
406        endTagToString(toElement(node), param);
407        break;
408    case Node::TEXT_NODE:
409        saveHTMLContentToBuffer(createMarkup(node), param);
410        break;
411    case Node::ATTRIBUTE_NODE:
412    case Node::DOCUMENT_NODE:
413    case Node::DOCUMENT_FRAGMENT_NODE:
414        // Should not exist.
415        ASSERT_NOT_REACHED();
416        break;
417    // Document type node can be in DOM?
418    case Node::DOCUMENT_TYPE_NODE:
419        param->haveSeenDocType = true;
420    default:
421        // For other type node, call default action.
422        saveHTMLContentToBuffer(createMarkup(node), param);
423        break;
424    }
425}
426
427WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
428                                             bool recursiveSerialization,
429                                             WebPageSerializerClient* client,
430                                             const WebVector<WebURL>& links,
431                                             const WebVector<WebString>& localPaths,
432                                             const WebString& localDirectoryName)
433    : m_client(client)
434    , m_recursiveSerialization(recursiveSerialization)
435    , m_framesCollected(false)
436    , m_localDirectoryName(localDirectoryName)
437    , m_htmlEntities(false)
438    , m_xmlEntities(true)
439{
440    // Must specify available webframe.
441    ASSERT(frame);
442    m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
443    // Make sure we have non 0 client.
444    ASSERT(client);
445    // Build local resources map.
446    ASSERT(links.size() == localPaths.size());
447    for (size_t i = 0; i < links.size(); i++) {
448        KURL url = links[i];
449        ASSERT(!m_localLinks.contains(url.string()));
450        m_localLinks.set(url.string(), localPaths[i]);
451    }
452
453    ASSERT(m_dataBuffer.isEmpty());
454}
455
456void WebPageSerializerImpl::collectTargetFrames()
457{
458    ASSERT(!m_framesCollected);
459    m_framesCollected = true;
460
461    // First, process main frame.
462    m_frames.append(m_specifiedWebLocalFrameImpl);
463    // Return now if user only needs to serialize specified frame, not including
464    // all sub-frames.
465    if (!m_recursiveSerialization)
466        return;
467    // Collect all frames inside the specified frame.
468    for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
469        WebLocalFrameImpl* currentFrame = m_frames[i];
470        // Get current using document.
471        Document* currentDoc = currentFrame->frame()->document();
472        // Go through sub-frames.
473        RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();
474
475        for (unsigned i = 0; Element* element = all->item(i); ++i) {
476            if (!element->isHTMLElement())
477                continue;
478            WebLocalFrameImpl* webFrame =
479                WebLocalFrameImpl::fromFrameOwnerElement(element);
480            if (webFrame)
481                m_frames.append(webFrame);
482        }
483    }
484}
485
486bool WebPageSerializerImpl::serialize()
487{
488    if (!m_framesCollected)
489        collectTargetFrames();
490
491    bool didSerialization = false;
492    KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();
493
494    for (unsigned i = 0; i < m_frames.size(); ++i) {
495        WebLocalFrameImpl* webFrame = m_frames[i];
496        Document* document = webFrame->frame()->document();
497        const KURL& url = document->url();
498
499        if (!url.isValid() || !m_localLinks.contains(url.string()))
500            continue;
501
502        didSerialization = true;
503
504        const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
505        String directoryName = url == mainURL ? m_localDirectoryName : "";
506
507        SerializeDomParam param(url, textEncoding, document, directoryName);
508
509        Element* documentElement = document->documentElement();
510        if (documentElement)
511            buildContentForNode(documentElement, &param);
512
513        encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
514    }
515
516    ASSERT(m_dataBuffer.isEmpty());
517    m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
518    return didSerialization;
519}
520
521}  // namespace blink
522