1/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
// How to handle the base tag better.
// Current status:
// At present, the normal way we handle the base tag is:
34// a) For those links which have corresponding local saved files, such as
35// savable CSS, JavaScript files, they will be written to relative URLs which
36// point to local saved file. Why those links can not be resolved as absolute
37// file URLs, because if they are resolved as absolute URLs, after moving the
38// file location from one directory to another directory, the file URLs will
39// be dead links.
40// b) For those links which have not corresponding local saved files, such as
41// links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
43// FireFox also uses above way to handle base tag.
44//
45// Problem:
46// This way can not handle the following situation:
47// the base tag is written by JavaScript.
48// For example. The page "www.yahoo.com" use
49// "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
50// of page when loading page. So when saving page as completed-HTML, we assume
51// that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
52// completed-HTML page, then the JavaScript will insert a base tag
53// <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
54// local saved resource files will be resolved as
55// "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
56// files can not be loaded correctly. Also the page will be rendered ugly since
57// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
58// files can not be fetched.
59// Now FireFox, IE and WebKit based Browser all have this problem.
60//
61// Solution:
62// My solution is that we comment old base tag and write new base tag:
63// <base href="." ...> after the previous commented base tag. In WebKit, it
64// always uses the latest "href" attribute of base tag to set document's base
65// URL. Based on this behavior, when we encounter a base tag, we comment it and
66// write a new base tag <base href="."> after the previous commented base tag.
67// The new added base tag can help engine to locate correct base URL for
68// correctly loading local saved resource files. Also I think we need to inherit
69// the base target value from document object when appending new base tag.
70// If there are multiple base tags in original document, we will comment all old
71// base tags and append new base tag after each old base tag because we do not
72// know those old base tags are original content or added by JavaScript. If
73// they are added by JavaScript, it means when loading saved page, the script(s)
// will still insert base tag(s) into the DOM, so the newly added base tag(s)
// can override the incorrect base URL and make sure we always load the
// correct locally saved resource files.
77
78#include "config.h"
79#include "WebPageSerializerImpl.h"
80
81#include "Document.h"
82#include "DocumentLoader.h"
83#include "DocumentType.h"
84#include "Element.h"
85#include "FrameLoader.h"
86#include "HTMLAllCollection.h"
87#include "HTMLElement.h"
88#include "HTMLFormElement.h"
89#include "HTMLMetaElement.h"
90#include "HTMLNames.h"
91#include "KURL.h"
92#include "TextEncoding.h"
93#include "markup.h"
94
95#include "DOMUtilitiesPrivate.h"
96#include "WebFrameImpl.h"
97#include "WebURL.h"
98#include "WebVector.h"
99
100using namespace WebCore;
101
102namespace WebKit {
103
// Maximum length of the data buffer that temporarily holds generated HTML
// content before it is encoded and handed to the client. This is a soft
// limit: the buffer is only checked after a whole string has been appended,
// so a single very large contiguous string in the page can exceed it.
static const unsigned dataBufferCapacity = 65536;
108
// Bundles the per-frame state used while serializing one document.
// |url|, |textEncoding|, |document| and |directoryName| are stored as given;
// |isHTMLDocument| is cached from the document, and every progress flag
// starts out cleared (no doctype seen, no charset declaration or XML
// processing directive added yet, no META element to skip, not inside a
// SCRIPT/STYLE element).
// |document| is dereferenced here, so it must not be null.
WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
                                                            const TextEncoding& textEncoding,
                                                            Document* document,
                                                            const String& directoryName)
    : url(url)
    , textEncoding(textEncoding)
    , document(document)
    , directoryName(directoryName)
    , isHTMLDocument(document->isHTMLDocument())
    , haveSeenDocType(false)
    , haveAddedCharsetDeclaration(false)
    , skipMetaElement(0)
    , isInScriptOrStyleTag(false)
    , haveAddedXMLProcessingDirective(false)
    , haveAddedContentsBeforeEnd(false)
{
}
126
127String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
128    const Element* element, SerializeDomParam* param, bool* needSkip)
129{
130    StringBuilder result;
131
132    *needSkip = false;
133    if (param->isHTMLDocument) {
134        // Skip the open tag of original META tag which declare charset since we
135        // have overrided the META which have correct charset declaration after
136        // serializing open tag of HEAD element.
137        if (element->hasTagName(HTMLNames::metaTag)) {
138            const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element);
139            // Check whether the META tag has declared charset or not.
140            String equiv = meta->httpEquiv();
141            if (equalIgnoringCase(equiv, "content-type")) {
142                String content = meta->content();
143                if (content.length() && content.contains("charset", false)) {
144                    // Find META tag declared charset, we need to skip it when
145                    // serializing DOM.
146                    param->skipMetaElement = element;
147                    *needSkip = true;
148                }
149            }
150        } else if (element->hasTagName(HTMLNames::htmlTag)) {
151            // Check something before processing the open tag of HEAD element.
152            // First we add doc type declaration if original document has it.
153            if (!param->haveSeenDocType) {
154                param->haveSeenDocType = true;
155                result.append(createMarkup(param->document->doctype()));
156            }
157
158            // Add MOTW declaration before html tag.
159            // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
160            result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
161        } else if (element->hasTagName(HTMLNames::baseTag)) {
162            // Comment the BASE tag when serializing dom.
163            result.append("<!--");
164        }
165    } else {
166        // Write XML declaration.
167        if (!param->haveAddedXMLProcessingDirective) {
168            param->haveAddedXMLProcessingDirective = true;
169            // Get encoding info.
170            String xmlEncoding = param->document->xmlEncoding();
171            if (xmlEncoding.isEmpty())
172                xmlEncoding = param->document->loader()->writer()->encoding();
173            if (xmlEncoding.isEmpty())
174                xmlEncoding = UTF8Encoding().name();
175            result.append("<?xml version=\"");
176            result.append(param->document->xmlVersion());
177            result.append("\" encoding=\"");
178            result.append(xmlEncoding);
179            if (param->document->xmlStandalone())
180                result.append("\" standalone=\"yes");
181            result.append("\"?>\n");
182        }
183        // Add doc type declaration if original document has it.
184        if (!param->haveSeenDocType) {
185            param->haveSeenDocType = true;
186            result.append(createMarkup(param->document->doctype()));
187        }
188    }
189    return result.toString();
190}
191
192String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
193    const Element* element, SerializeDomParam* param)
194{
195    StringBuilder result;
196
197    param->haveAddedContentsBeforeEnd = false;
198    if (!param->isHTMLDocument)
199        return result.toString();
200    // Check after processing the open tag of HEAD element
201    if (!param->haveAddedCharsetDeclaration
202        && element->hasTagName(HTMLNames::headTag)) {
203        param->haveAddedCharsetDeclaration = true;
204        // Check meta element. WebKit only pre-parse the first 512 bytes
205        // of the document. If the whole <HEAD> is larger and meta is the
206        // end of head part, then this kind of pages aren't decoded correctly
207        // because of this issue. So when we serialize the DOM, we need to
208        // make sure the meta will in first child of head tag.
209        // See http://bugs.webkit.org/show_bug.cgi?id=16621.
210        // First we generate new content for writing correct META element.
211        result.append(WebPageSerializer::generateMetaCharsetDeclaration(
212            String(param->textEncoding.name())));
213
214        param->haveAddedContentsBeforeEnd = true;
215        // Will search each META which has charset declaration, and skip them all
216        // in PreActionBeforeSerializeOpenTag.
217    } else if (element->hasTagName(HTMLNames::scriptTag)
218               || element->hasTagName(HTMLNames::styleTag)) {
219        param->isInScriptOrStyleTag = true;
220    }
221
222    return result.toString();
223}
224
225String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
226    const Element* element, SerializeDomParam* param, bool* needSkip)
227{
228    String result;
229
230    *needSkip = false;
231    if (!param->isHTMLDocument)
232        return result;
233    // Skip the end tag of original META tag which declare charset.
234    // Need not to check whether it's META tag since we guarantee
235    // skipMetaElement is definitely META tag if it's not 0.
236    if (param->skipMetaElement == element)
237        *needSkip = true;
238    else if (element->hasTagName(HTMLNames::scriptTag)
239             || element->hasTagName(HTMLNames::styleTag)) {
240        ASSERT(param->isInScriptOrStyleTag);
241        param->isInScriptOrStyleTag = false;
242    }
243
244    return result;
245}
246
247// After we finish serializing end tag of a element, we give the target
248// element a chance to do some post work to add some additional data.
249String WebPageSerializerImpl::postActionAfterSerializeEndTag(
250    const Element* element, SerializeDomParam* param)
251{
252    StringBuilder result;
253
254    if (!param->isHTMLDocument)
255        return result.toString();
256    // Comment the BASE tag when serializing DOM.
257    if (element->hasTagName(HTMLNames::baseTag)) {
258        result.append("-->");
259        // Append a new base tag declaration.
260        result.append(WebPageSerializer::generateBaseTagDeclaration(
261            param->document->baseTarget()));
262    }
263
264    return result.toString();
265}
266
267void WebPageSerializerImpl::saveHTMLContentToBuffer(
268    const String& result, SerializeDomParam* param)
269{
270    m_dataBuffer.append(result);
271    encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
272                         param,
273                         DoNotForceFlush);
274}
275
276void WebPageSerializerImpl::encodeAndFlushBuffer(
277    WebPageSerializerClient::PageSerializationStatus status,
278    SerializeDomParam* param,
279    FlushOption flushOption)
280{
281    // Data buffer is not full nor do we want to force flush.
282    if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
283        return;
284
285    String content = m_dataBuffer.toString();
286    m_dataBuffer = StringBuilder();
287
288    // Convert the unicode content to target encoding
289    CString encodedContent = param->textEncoding.encode(
290        content.characters(), content.length(), EntitiesForUnencodables);
291
292    // Send result to the client.
293    m_client->didSerializeDataForFrame(param->url,
294                                       WebCString(encodedContent.data(), encodedContent.length()),
295                                       status);
296}
297
298void WebPageSerializerImpl::openTagToString(Element* element,
299                                            SerializeDomParam* param)
300{
301    // FIXME: use StringBuilder instead of String.
302    bool needSkip;
303    // Do pre action for open tag.
304    String result = preActionBeforeSerializeOpenTag(element, param, &needSkip);
305    if (needSkip)
306        return;
307    // Add open tag
308    result += "<" + element->nodeName().lower();
309    // Go through all attributes and serialize them.
310    const NamedNodeMap *attrMap = element->attributes(true);
311    if (attrMap) {
312        unsigned numAttrs = attrMap->length();
313        for (unsigned i = 0; i < numAttrs; i++) {
314            result += " ";
315            // Add attribute pair
316            const Attribute *attribute = attrMap->attributeItem(i);
317            result += attribute->name().toString();
318            result += "=\"";
319            if (!attribute->value().isEmpty()) {
320                const String& attrValue = attribute->value();
321
322                // Check whether we need to replace some resource links
323                // with local resource paths.
324                const QualifiedName& attrName = attribute->name();
325                if (elementHasLegalLinkAttribute(element, attrName)) {
326                    // For links start with "javascript:", we do not change it.
327                    if (attrValue.startsWith("javascript:", false))
328                        result += attrValue;
329                    else {
330                        // Get the absolute link
331                        WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerElement(element);
332                        String completeURL = subFrame ? subFrame->frame()->document()->url() :
333                                                        param->document->completeURL(attrValue);
334                        // Check whether we have local files for those link.
335                        if (m_localLinks.contains(completeURL)) {
336                            if (!param->directoryName.isEmpty())
337                                result += "./" + param->directoryName + "/";
338                            result += m_localLinks.get(completeURL);
339                        } else
340                            result += completeURL;
341                    }
342                } else {
343                    if (param->isHTMLDocument)
344                        result += m_htmlEntities.convertEntitiesInString(attrValue);
345                    else
346                        result += m_xmlEntities.convertEntitiesInString(attrValue);
347                }
348            }
349            result += "\"";
350        }
351    }
352
353    // Do post action for open tag.
354    String addedContents = postActionAfterSerializeOpenTag(element, param);
355    // Complete the open tag for element when it has child/children.
356    if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd)
357        result += ">";
358    // Append the added contents generate in  post action of open tag.
359    result += addedContents;
360    // Save the result to data buffer.
361    saveHTMLContentToBuffer(result, param);
362}
363
364// Serialize end tag of an specified element.
365void WebPageSerializerImpl::endTagToString(Element* element,
366                                           SerializeDomParam* param)
367{
368    bool needSkip;
369    // Do pre action for end tag.
370    String result = preActionBeforeSerializeEndTag(element,
371                                                   param,
372                                                   &needSkip);
373    if (needSkip)
374        return;
375    // Write end tag when element has child/children.
376    if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) {
377        result += "</";
378        result += element->nodeName().lower();
379        result += ">";
380    } else {
381        // Check whether we have to write end tag for empty element.
382        if (param->isHTMLDocument) {
383            result += ">";
384            // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
385            if (!static_cast<const HTMLElement*>(element)->ieForbidsInsertHTML()) {
386                // We need to write end tag when it is required.
387                result += "</";
388                result += element->nodeName().lower();
389                result += ">";
390            }
391        } else {
392            // For xml base document.
393            result += " />";
394        }
395    }
396    // Do post action for end tag.
397    result += postActionAfterSerializeEndTag(element, param);
398    // Save the result to data buffer.
399    saveHTMLContentToBuffer(result, param);
400}
401
402void WebPageSerializerImpl::buildContentForNode(Node* node,
403                                                SerializeDomParam* param)
404{
405    switch (node->nodeType()) {
406    case Node::ELEMENT_NODE:
407        // Process open tag of element.
408        openTagToString(static_cast<Element*>(node), param);
409        // Walk through the children nodes and process it.
410        for (Node *child = node->firstChild(); child; child = child->nextSibling())
411            buildContentForNode(child, param);
412        // Process end tag of element.
413        endTagToString(static_cast<Element*>(node), param);
414        break;
415    case Node::TEXT_NODE:
416        saveHTMLContentToBuffer(createMarkup(node), param);
417        break;
418    case Node::ATTRIBUTE_NODE:
419    case Node::DOCUMENT_NODE:
420    case Node::DOCUMENT_FRAGMENT_NODE:
421        // Should not exist.
422        ASSERT_NOT_REACHED();
423        break;
424    // Document type node can be in DOM?
425    case Node::DOCUMENT_TYPE_NODE:
426        param->haveSeenDocType = true;
427    default:
428        // For other type node, call default action.
429        saveHTMLContentToBuffer(createMarkup(node), param);
430        break;
431    }
432}
433
434WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
435                                             bool recursiveSerialization,
436                                             WebPageSerializerClient* client,
437                                             const WebVector<WebURL>& links,
438                                             const WebVector<WebString>& localPaths,
439                                             const WebString& localDirectoryName)
440    : m_client(client)
441    , m_recursiveSerialization(recursiveSerialization)
442    , m_framesCollected(false)
443    , m_localDirectoryName(localDirectoryName)
444    , m_htmlEntities(false)
445    , m_xmlEntities(true)
446{
447    // Must specify available webframe.
448    ASSERT(frame);
449    m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);
450    // Make sure we have non 0 client.
451    ASSERT(client);
452    // Build local resources map.
453    ASSERT(links.size() == localPaths.size());
454    for (size_t i = 0; i < links.size(); i++) {
455        KURL url = links[i];
456        ASSERT(!m_localLinks.contains(url.string()));
457        m_localLinks.set(url.string(), localPaths[i]);
458    }
459
460    ASSERT(m_dataBuffer.isEmpty());
461}
462
463void WebPageSerializerImpl::collectTargetFrames()
464{
465    ASSERT(!m_framesCollected);
466    m_framesCollected = true;
467
468    // First, process main frame.
469    m_frames.append(m_specifiedWebFrameImpl);
470    // Return now if user only needs to serialize specified frame, not including
471    // all sub-frames.
472    if (!m_recursiveSerialization)
473        return;
474    // Collect all frames inside the specified frame.
475    for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
476        WebFrameImpl* currentFrame = m_frames[i];
477        // Get current using document.
478        Document* currentDoc = currentFrame->frame()->document();
479        // Go through sub-frames.
480        RefPtr<HTMLAllCollection> all = currentDoc->all();
481        for (Node* node = all->firstItem(); node; node = all->nextItem()) {
482            if (!node->isHTMLElement())
483                continue;
484            Element* element = static_cast<Element*>(node);
485            WebFrameImpl* webFrame =
486                WebFrameImpl::fromFrameOwnerElement(element);
487            if (webFrame)
488                m_frames.append(webFrame);
489        }
490    }
491}
492
493bool WebPageSerializerImpl::serialize()
494{
495    if (!m_framesCollected)
496        collectTargetFrames();
497
498    bool didSerialization = false;
499    KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url();
500
501    for (unsigned i = 0; i < m_frames.size(); ++i) {
502        WebFrameImpl* webFrame = m_frames[i];
503        Document* document = webFrame->frame()->document();
504        const KURL& url = document->url();
505
506        if (!url.isValid() || !m_localLinks.contains(url.string()))
507            continue;
508
509        didSerialization = true;
510
511        String encoding = document->loader()->writer()->encoding();
512        const TextEncoding& textEncoding = encoding.isEmpty() ? UTF8Encoding() : TextEncoding(encoding);
513        String directoryName = url == mainURL ? m_localDirectoryName : "";
514
515        SerializeDomParam param(url, textEncoding, document, directoryName);
516
517        Element* documentElement = document->documentElement();
518        if (documentElement)
519            buildContentForNode(documentElement, &param);
520
521        encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
522    }
523
524    ASSERT(m_dataBuffer.isEmpty());
525    m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
526    return didSerialization;
527}
528
529}  // namespace WebKit
530