1/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "WebPageSerializer.h"
33
34#include "DocumentLoader.h"
35#include "Element.h"
36#include "Frame.h"
37#include "HTMLAllCollection.h"
38#include "HTMLFrameOwnerElement.h"
39#include "HTMLInputElement.h"
40#include "HTMLNames.h"
41#include "KURL.h"
42#include "Vector.h"
43
44#include "WebCString.h"
45#include "WebFrame.h"
46#include "WebFrameImpl.h"
47#include "WebPageSerializerClient.h"
48#include "WebPageSerializerImpl.h"
49#include "WebString.h"
50#include "WebURL.h"
51#include "WebVector.h"
52#include "WebView.h"
53
54#include <wtf/text/StringConcatenate.h>
55
56using namespace WebCore;
57
58namespace {
59
60KURL getSubResourceURLFromElement(Element* element)
61{
62    ASSERT(element);
63    const QualifiedName* attributeName = 0;
64    if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
65        attributeName = &HTMLNames::srcAttr;
66    else if (element->hasTagName(HTMLNames::inputTag)) {
67        HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
68        if (input->isImageButton())
69            attributeName = &HTMLNames::srcAttr;
70    } else if (element->hasTagName(HTMLNames::bodyTag)
71               || element->hasTagName(HTMLNames::tableTag)
72               || element->hasTagName(HTMLNames::trTag)
73               || element->hasTagName(HTMLNames::tdTag))
74        attributeName = &HTMLNames::backgroundAttr;
75    else if (element->hasTagName(HTMLNames::blockquoteTag)
76             || element->hasTagName(HTMLNames::qTag)
77             || element->hasTagName(HTMLNames::delTag)
78             || element->hasTagName(HTMLNames::insTag))
79        attributeName = &HTMLNames::citeAttr;
80    else if (element->hasTagName(HTMLNames::linkTag)) {
81        // If the link element is not css, ignore it.
82        if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
83            // FIXME: Add support for extracting links of sub-resources which
84            // are inside style-sheet such as @import, @font-face, url(), etc.
85            attributeName = &HTMLNames::hrefAttr;
86        }
87    } else if (element->hasTagName(HTMLNames::objectTag))
88        attributeName = &HTMLNames::dataAttr;
89    else if (element->hasTagName(HTMLNames::embedTag))
90        attributeName = &HTMLNames::srcAttr;
91
92    if (!attributeName)
93        return KURL();
94
95    String value = element->getAttribute(*attributeName);
96    // Ignore javascript content.
97    if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
98        return KURL();
99
100    return element->document()->completeURL(value);
101}
102
103void retrieveResourcesForElement(Element* element,
104                                 Vector<Frame*>* visitedFrames,
105                                 Vector<Frame*>* framesToVisit,
106                                 Vector<KURL>* frameURLs,
107                                 Vector<KURL>* resourceURLs)
108{
109    // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
110    if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
111        || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
112            && element->isFrameOwnerElement()) {
113        Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
114        if (frame) {
115            if (!visitedFrames->contains(frame))
116                framesToVisit->append(frame);
117            return;
118        }
119    }
120
121    KURL url = getSubResourceURLFromElement(element);
122    if (url.isEmpty() || !url.isValid())
123        return; // No subresource for this node.
124
125    // Ignore URLs that have a non-standard protocols. Since the FTP protocol
126    // does no have a cache mechanism, we skip it as well.
127    if (!url.protocolInHTTPFamily() && !url.isLocalFile())
128        return;
129
130    if (!resourceURLs->contains(url))
131        resourceURLs->append(url);
132}
133
134void retrieveResourcesForFrame(Frame* frame,
135                               const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
136                               Vector<Frame*>* visitedFrames,
137                               Vector<Frame*>* framesToVisit,
138                               Vector<KURL>* frameURLs,
139                               Vector<KURL>* resourceURLs)
140{
141    KURL frameURL = frame->loader()->documentLoader()->request().url();
142
143    // If the frame's URL is invalid, ignore it, it is not retrievable.
144    if (!frameURL.isValid())
145        return;
146
147    // Ignore frames from unsupported schemes.
148    bool isValidScheme = false;
149    for (size_t i = 0; i < supportedSchemes.size(); ++i) {
150        if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
151            isValidScheme = true;
152            break;
153        }
154    }
155    if (!isValidScheme)
156        return;
157
158    // If we have already seen that frame, ignore it.
159    if (visitedFrames->contains(frame))
160        return;
161    visitedFrames->append(frame);
162    if (!frameURLs->contains(frameURL))
163        frameURLs->append(frameURL);
164
165    // Now get the resources associated with each node of the document.
166    RefPtr<HTMLAllCollection> allNodes = frame->document()->all();
167    for (unsigned i = 0; i < allNodes->length(); ++i) {
168        Node* node = allNodes->item(i);
169        // We are only interested in HTML resources.
170        if (!node->isElementNode())
171            continue;
172        retrieveResourcesForElement(static_cast<Element*>(node),
173                                    visitedFrames, framesToVisit,
174                                    frameURLs, resourceURLs);
175    }
176}
177
178} // namespace
179
180namespace WebKit {
181
182bool WebPageSerializer::serialize(WebFrame* frame,
183                                  bool recursive,
184                                  WebPageSerializerClient* client,
185                                  const WebVector<WebURL>& links,
186                                  const WebVector<WebString>& localPaths,
187                                  const WebString& localDirectoryName)
188{
189    WebPageSerializerImpl serializerImpl(
190        frame, recursive, client, links, localPaths, localDirectoryName);
191    return serializerImpl.serialize();
192}
193
194bool WebPageSerializer::retrieveAllResources(WebView* view,
195                                             const WebVector<WebCString>& supportedSchemes,
196                                             WebVector<WebURL>* resourceURLs,
197                                             WebVector<WebURL>* frameURLs) {
198    WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
199    if (!mainFrame)
200        return false;
201
202    Vector<Frame*> framesToVisit;
203    Vector<Frame*> visitedFrames;
204    Vector<KURL> frameKURLs;
205    Vector<KURL> resourceKURLs;
206
207    // Let's retrieve the resources from every frame in this page.
208    framesToVisit.append(mainFrame->frame());
209    while (!framesToVisit.isEmpty()) {
210        Frame* frame = framesToVisit[0];
211        framesToVisit.remove(0);
212        retrieveResourcesForFrame(frame, supportedSchemes,
213                                  &visitedFrames, &framesToVisit,
214                                  &frameKURLs, &resourceKURLs);
215    }
216
217    // Converts the results to WebURLs.
218    WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
219    for (size_t i = 0; i < resourceKURLs.size(); ++i) {
220        resultResourceURLs[i] = resourceKURLs[i];
221        // A frame's src can point to the same URL as another resource, keep the
222        // resource URL only in such cases.
223        size_t index = frameKURLs.find(resourceKURLs[i]);
224        if (index != notFound)
225            frameKURLs.remove(index);
226    }
227    *resourceURLs = resultResourceURLs;
228    WebVector<WebURL> resultFrameURLs(frameKURLs.size());
229    for (size_t i = 0; i < frameKURLs.size(); ++i)
230        resultFrameURLs[i] = frameKURLs[i];
231    *frameURLs = resultFrameURLs;
232
233    return true;
234}
235
236WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
237{
238    return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">");
239}
240
241WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
242{
243    return String::format("\n<!-- saved from url=(%04d)%s -->\n",
244                          static_cast<int>(url.spec().length()),
245                          url.spec().data());
246}
247
248WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
249{
250    if (baseTarget.isEmpty())
251        return makeString("<base href=\".\">");
252    return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">");
253}
254
255} // namespace WebKit
256