1/* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32#include "WebPageSerializer.h" 33 34#include "DocumentLoader.h" 35#include "Element.h" 36#include "Frame.h" 37#include "HTMLAllCollection.h" 38#include "HTMLFrameOwnerElement.h" 39#include "HTMLInputElement.h" 40#include "HTMLNames.h" 41#include "KURL.h" 42#include "Vector.h" 43 44#include "WebCString.h" 45#include "WebFrame.h" 46#include "WebFrameImpl.h" 47#include "WebPageSerializerClient.h" 48#include "WebPageSerializerImpl.h" 49#include "WebString.h" 50#include "WebURL.h" 51#include "WebVector.h" 52#include "WebView.h" 53 54#include <wtf/text/StringConcatenate.h> 55 56using namespace WebCore; 57 58namespace { 59 60KURL getSubResourceURLFromElement(Element* element) 61{ 62 ASSERT(element); 63 const QualifiedName* attributeName = 0; 64 if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag)) 65 attributeName = &HTMLNames::srcAttr; 66 else if (element->hasTagName(HTMLNames::inputTag)) { 67 HTMLInputElement* input = static_cast<HTMLInputElement*>(element); 68 if (input->isImageButton()) 69 attributeName = &HTMLNames::srcAttr; 70 } else if (element->hasTagName(HTMLNames::bodyTag) 71 || element->hasTagName(HTMLNames::tableTag) 72 || element->hasTagName(HTMLNames::trTag) 73 || element->hasTagName(HTMLNames::tdTag)) 74 attributeName = &HTMLNames::backgroundAttr; 75 else if (element->hasTagName(HTMLNames::blockquoteTag) 76 || element->hasTagName(HTMLNames::qTag) 77 || element->hasTagName(HTMLNames::delTag) 78 || element->hasTagName(HTMLNames::insTag)) 79 attributeName = &HTMLNames::citeAttr; 80 else if (element->hasTagName(HTMLNames::linkTag)) { 81 // If the link element is not css, ignore it. 82 if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) { 83 // FIXME: Add support for extracting links of sub-resources which 84 // are inside style-sheet such as @import, @font-face, url(), etc. 85 attributeName = &HTMLNames::hrefAttr; 86 } 87 } else if (element->hasTagName(HTMLNames::objectTag)) 88 attributeName = &HTMLNames::dataAttr; 89 else if (element->hasTagName(HTMLNames::embedTag)) 90 attributeName = &HTMLNames::srcAttr; 91 92 if (!attributeName) 93 return KURL(); 94 95 String value = element->getAttribute(*attributeName); 96 // Ignore javascript content. 97 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) 98 return KURL(); 99 100 return element->document()->completeURL(value); 101} 102 103void retrieveResourcesForElement(Element* element, 104 Vector<Frame*>* visitedFrames, 105 Vector<Frame*>* framesToVisit, 106 Vector<KURL>* frameURLs, 107 Vector<KURL>* resourceURLs) 108{ 109 // If the node is a frame, we'll process it later in retrieveResourcesForFrame. 110 if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag) 111 || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag)) 112 && element->isFrameOwnerElement()) { 113 Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame(); 114 if (frame) { 115 if (!visitedFrames->contains(frame)) 116 framesToVisit->append(frame); 117 return; 118 } 119 } 120 121 KURL url = getSubResourceURLFromElement(element); 122 if (url.isEmpty() || !url.isValid()) 123 return; // No subresource for this node. 124 125 // Ignore URLs that have a non-standard protocols. Since the FTP protocol 126 // does no have a cache mechanism, we skip it as well. 127 if (!url.protocolInHTTPFamily() && !url.isLocalFile()) 128 return; 129 130 if (!resourceURLs->contains(url)) 131 resourceURLs->append(url); 132} 133 134void retrieveResourcesForFrame(Frame* frame, 135 const WebKit::WebVector<WebKit::WebCString>& supportedSchemes, 136 Vector<Frame*>* visitedFrames, 137 Vector<Frame*>* framesToVisit, 138 Vector<KURL>* frameURLs, 139 Vector<KURL>* resourceURLs) 140{ 141 KURL frameURL = frame->loader()->documentLoader()->request().url(); 142 143 // If the frame's URL is invalid, ignore it, it is not retrievable. 144 if (!frameURL.isValid()) 145 return; 146 147 // Ignore frames from unsupported schemes. 148 bool isValidScheme = false; 149 for (size_t i = 0; i < supportedSchemes.size(); ++i) { 150 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { 151 isValidScheme = true; 152 break; 153 } 154 } 155 if (!isValidScheme) 156 return; 157 158 // If we have already seen that frame, ignore it. 159 if (visitedFrames->contains(frame)) 160 return; 161 visitedFrames->append(frame); 162 if (!frameURLs->contains(frameURL)) 163 frameURLs->append(frameURL); 164 165 // Now get the resources associated with each node of the document. 166 RefPtr<HTMLAllCollection> allNodes = frame->document()->all(); 167 for (unsigned i = 0; i < allNodes->length(); ++i) { 168 Node* node = allNodes->item(i); 169 // We are only interested in HTML resources. 170 if (!node->isElementNode()) 171 continue; 172 retrieveResourcesForElement(static_cast<Element*>(node), 173 visitedFrames, framesToVisit, 174 frameURLs, resourceURLs); 175 } 176} 177 178} // namespace 179 180namespace WebKit { 181 182bool WebPageSerializer::serialize(WebFrame* frame, 183 bool recursive, 184 WebPageSerializerClient* client, 185 const WebVector<WebURL>& links, 186 const WebVector<WebString>& localPaths, 187 const WebString& localDirectoryName) 188{ 189 WebPageSerializerImpl serializerImpl( 190 frame, recursive, client, links, localPaths, localDirectoryName); 191 return serializerImpl.serialize(); 192} 193 194bool WebPageSerializer::retrieveAllResources(WebView* view, 195 const WebVector<WebCString>& supportedSchemes, 196 WebVector<WebURL>* resourceURLs, 197 WebVector<WebURL>* frameURLs) { 198 WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame()); 199 if (!mainFrame) 200 return false; 201 202 Vector<Frame*> framesToVisit; 203 Vector<Frame*> visitedFrames; 204 Vector<KURL> frameKURLs; 205 Vector<KURL> resourceKURLs; 206 207 // Let's retrieve the resources from every frame in this page. 208 framesToVisit.append(mainFrame->frame()); 209 while (!framesToVisit.isEmpty()) { 210 Frame* frame = framesToVisit[0]; 211 framesToVisit.remove(0); 212 retrieveResourcesForFrame(frame, supportedSchemes, 213 &visitedFrames, &framesToVisit, 214 &frameKURLs, &resourceKURLs); 215 } 216 217 // Converts the results to WebURLs. 218 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 219 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 220 resultResourceURLs[i] = resourceKURLs[i]; 221 // A frame's src can point to the same URL as another resource, keep the 222 // resource URL only in such cases. 223 size_t index = frameKURLs.find(resourceKURLs[i]); 224 if (index != notFound) 225 frameKURLs.remove(index); 226 } 227 *resourceURLs = resultResourceURLs; 228 WebVector<WebURL> resultFrameURLs(frameKURLs.size()); 229 for (size_t i = 0; i < frameKURLs.size(); ++i) 230 resultFrameURLs[i] = frameKURLs[i]; 231 *frameURLs = resultFrameURLs; 232 233 return true; 234} 235 236WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 237{ 238 return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">"); 239} 240 241WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 242{ 243 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 244 static_cast<int>(url.spec().length()), 245 url.spec().data()); 246} 247 248WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 249{ 250 if (baseTarget.isEmpty()) 251 return makeString("<base href=\".\">"); 252 return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">"); 253} 254 255} // namespace WebKit 256