1/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "public/web/WebPageSerializer.h"
33
34#include "core/HTMLNames.h"
35#include "core/dom/Document.h"
36#include "core/dom/Element.h"
37#include "core/frame/LocalFrame.h"
38#include "core/html/HTMLAllCollection.h"
39#include "core/html/HTMLFrameElementBase.h"
40#include "core/html/HTMLFrameOwnerElement.h"
41#include "core/html/HTMLInputElement.h"
42#include "core/html/HTMLTableElement.h"
43#include "core/loader/DocumentLoader.h"
44#include "core/page/Page.h"
45#include "core/page/PageSerializer.h"
46#include "platform/SerializedResource.h"
47#include "platform/mhtml/MHTMLArchive.h"
48#include "platform/weborigin/KURL.h"
49#include "public/platform/WebCString.h"
50#include "public/platform/WebString.h"
51#include "public/platform/WebURL.h"
52#include "public/platform/WebVector.h"
53#include "public/web/WebFrame.h"
54#include "public/web/WebPageSerializerClient.h"
55#include "public/web/WebView.h"
56#include "web/WebLocalFrameImpl.h"
57#include "web/WebPageSerializerImpl.h"
58#include "web/WebViewImpl.h"
59#include "wtf/Vector.h"
60#include "wtf/text/StringConcatenate.h"
61
62namespace blink {
63
64namespace {
65
66KURL getSubResourceURLFromElement(Element* element)
67{
68    ASSERT(element);
69    const QualifiedName& attributeName = element->subResourceAttributeName();
70    if (attributeName == QualifiedName::null())
71        return KURL();
72
73    String value = element->getAttribute(attributeName);
74    // Ignore javascript content.
75    if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
76        return KURL();
77
78    return element->document().completeURL(value);
79}
80
81void retrieveResourcesForElement(Element* element,
82                                 Vector<LocalFrame*>* visitedFrames,
83                                 Vector<LocalFrame*>* framesToVisit,
84                                 Vector<KURL>* frameURLs,
85                                 Vector<KURL>* resourceURLs)
86{
87    ASSERT(element);
88    // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
89    if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) {
90        Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame();
91        if (frame && frame->isLocalFrame()) {
92            if (!visitedFrames->contains(toLocalFrame(frame)))
93                framesToVisit->append(toLocalFrame(frame));
94            return;
95        }
96    }
97
98    KURL url = getSubResourceURLFromElement(element);
99    if (url.isEmpty() || !url.isValid())
100        return; // No subresource for this node.
101
102    // Ignore URLs that have a non-standard protocols. Since the FTP protocol
103    // does no have a cache mechanism, we skip it as well.
104    if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
105        return;
106
107    if (!resourceURLs->contains(url))
108        resourceURLs->append(url);
109}
110
111void retrieveResourcesForFrame(LocalFrame* frame,
112    const WebVector<WebCString>& supportedSchemes,
113    Vector<LocalFrame*>* visitedFrames,
114    Vector<LocalFrame*>* framesToVisit,
115    Vector<KURL>* frameURLs,
116    Vector<KURL>* resourceURLs)
117{
118    KURL frameURL = frame->loader().documentLoader()->request().url();
119
120    // If the frame's URL is invalid, ignore it, it is not retrievable.
121    if (!frameURL.isValid())
122        return;
123
124    // Ignore frames from unsupported schemes.
125    bool isValidScheme = false;
126    for (size_t i = 0; i < supportedSchemes.size(); ++i) {
127        if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
128            isValidScheme = true;
129            break;
130        }
131    }
132    if (!isValidScheme)
133        return;
134
135    // If we have already seen that frame, ignore it.
136    if (visitedFrames->contains(frame))
137        return;
138    visitedFrames->append(frame);
139    if (!frameURLs->contains(frameURL))
140        frameURLs->append(frameURL);
141
142    // Now get the resources associated with each node of the document.
143    RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all();
144    for (unsigned i = 0; i < allElements->length(); ++i) {
145        Element* element = allElements->item(i);
146        retrieveResourcesForElement(element,
147                                    visitedFrames, framesToVisit,
148                                    frameURLs, resourceURLs);
149    }
150}
151
152} // namespace
153
154void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
155{
156    Vector<SerializedResource> resources;
157    PageSerializer serializer(&resources);
158    serializer.serialize(toWebViewImpl(view)->page());
159
160    Vector<Resource> result;
161    for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
162        Resource resource;
163        resource.url = iter->url;
164        resource.mimeType = iter->mimeType.ascii();
165        // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData().
166        resource.data = WebCString(iter->data->data(), iter->data->size());
167        result.append(resource);
168    }
169
170    *resourcesParam = result;
171}
172
173static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy)
174{
175    Vector<SerializedResource> resources;
176    PageSerializer serializer(&resources);
177    serializer.serialize(page);
178    Document* document = page->deprecatedLocalMainFrame()->document();
179    return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType());
180}
181
182WebCString WebPageSerializer::serializeToMHTML(WebView* view)
183{
184    RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding);
185    // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
186    return WebCString(mhtml->data(), mhtml->size());
187}
188
189WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
190{
191    RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding);
192    // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
193    return WebCString(mhtml->data(), mhtml->size());
194}
195
196bool WebPageSerializer::serialize(WebLocalFrame* frame,
197                                  bool recursive,
198                                  WebPageSerializerClient* client,
199                                  const WebVector<WebURL>& links,
200                                  const WebVector<WebString>& localPaths,
201                                  const WebString& localDirectoryName)
202{
203    WebPageSerializerImpl serializerImpl(
204        frame, recursive, client, links, localPaths, localDirectoryName);
205    return serializerImpl.serialize();
206}
207
208bool WebPageSerializer::retrieveAllResources(WebView* view,
209                                             const WebVector<WebCString>& supportedSchemes,
210                                             WebVector<WebURL>* resourceURLs,
211                                             WebVector<WebURL>* frameURLs) {
212    WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame());
213    if (!mainFrame)
214        return false;
215
216    Vector<LocalFrame*> framesToVisit;
217    Vector<LocalFrame*> visitedFrames;
218    Vector<KURL> frameKURLs;
219    Vector<KURL> resourceKURLs;
220
221    // Let's retrieve the resources from every frame in this page.
222    framesToVisit.append(mainFrame->frame());
223    while (!framesToVisit.isEmpty()) {
224        LocalFrame* frame = framesToVisit[0];
225        framesToVisit.remove(0);
226        retrieveResourcesForFrame(frame, supportedSchemes,
227                                  &visitedFrames, &framesToVisit,
228                                  &frameKURLs, &resourceKURLs);
229    }
230
231    // Converts the results to WebURLs.
232    WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
233    for (size_t i = 0; i < resourceKURLs.size(); ++i) {
234        resultResourceURLs[i] = resourceKURLs[i];
235        // A frame's src can point to the same URL as another resource, keep the
236        // resource URL only in such cases.
237        size_t index = frameKURLs.find(resourceKURLs[i]);
238        if (index != kNotFound)
239            frameKURLs.remove(index);
240    }
241    *resourceURLs = resultResourceURLs;
242    WebVector<WebURL> resultFrameURLs(frameKURLs.size());
243    for (size_t i = 0; i < frameKURLs.size(); ++i)
244        resultFrameURLs[i] = frameKURLs[i];
245    *frameURLs = resultFrameURLs;
246
247    return true;
248}
249
250WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
251{
252    String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">";
253    return charsetString;
254}
255
256WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
257{
258    return String::format("\n<!-- saved from url=(%04d)%s -->\n",
259                          static_cast<int>(url.spec().length()),
260                          url.spec().data());
261}
262
263WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
264{
265    if (baseTarget.isEmpty())
266        return String("<base href=\".\">");
267    String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">";
268    return baseString;
269}
270
271} // namespace blink
272