1// Copyright (c) 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "content/renderer/savable_resources.h" 6 7#include <set> 8 9#include "base/compiler_specific.h" 10#include "base/logging.h" 11#include "base/strings/string_util.h" 12#include "third_party/WebKit/public/platform/WebString.h" 13#include "third_party/WebKit/public/platform/WebVector.h" 14#include "third_party/WebKit/public/web/WebDocument.h" 15#include "third_party/WebKit/public/web/WebElement.h" 16#include "third_party/WebKit/public/web/WebFrame.h" 17#include "third_party/WebKit/public/web/WebInputElement.h" 18#include "third_party/WebKit/public/web/WebNode.h" 19#include "third_party/WebKit/public/web/WebNodeCollection.h" 20#include "third_party/WebKit/public/web/WebNodeList.h" 21#include "third_party/WebKit/public/web/WebView.h" 22 23using blink::WebDocument; 24using blink::WebElement; 25using blink::WebFrame; 26using blink::WebInputElement; 27using blink::WebNode; 28using blink::WebNodeCollection; 29using blink::WebNodeList; 30using blink::WebString; 31using blink::WebVector; 32using blink::WebView; 33 34namespace content { 35namespace { 36 37// Structure for storage the unique set of all savable resource links for 38// making sure that no duplicated resource link in final result. The consumer 39// of the SavableResourcesUniqueCheck is responsible for keeping these pointers 40// valid for the lifetime of the SavableResourcesUniqueCheck instance. 41struct SavableResourcesUniqueCheck { 42 // Unique set of all sub resource links. 43 std::set<GURL>* resources_set; 44 // Unique set of all frame links. 45 std::set<GURL>* frames_set; 46 // Collection of all frames we go through when getting all savable resource 47 // links. 48 std::vector<WebFrame*>* frames; 49 50 SavableResourcesUniqueCheck() 51 : resources_set(NULL), 52 frames_set(NULL), 53 frames(NULL) {} 54 55 SavableResourcesUniqueCheck(std::set<GURL>* resources_set, 56 std::set<GURL>* frames_set, std::vector<WebFrame*>* frames) 57 : resources_set(resources_set), 58 frames_set(frames_set), 59 frames(frames) {} 60}; 61 62// Get all savable resource links from current element. One element might 63// have more than one resource link. It is possible to have some links 64// in one CSS stylesheet. 65void GetSavableResourceLinkForElement( 66 const WebElement& element, 67 const WebDocument& current_doc, 68 SavableResourcesUniqueCheck* unique_check, 69 SavableResourcesResult* result) { 70 71 // Handle frame and iframe tag. 72 if (element.hasTagName("iframe") || 73 element.hasTagName("frame")) { 74 WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element); 75 if (sub_frame) 76 unique_check->frames->push_back(sub_frame); 77 return; 78 } 79 80 // Check whether the node has sub resource URL or not. 81 WebString value = GetSubResourceLinkFromElement(element); 82 if (value.isNull()) 83 return; 84 // Get absolute URL. 85 GURL u = current_doc.completeURL(value); 86 // ignore invalid URL 87 if (!u.is_valid()) 88 return; 89 // Ignore those URLs which are not standard protocols. Because FTP 90 // protocol does no have cache mechanism, we will skip all 91 // sub-resources if they use FTP protocol. 92 if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file")) 93 return; 94 // Ignore duplicated resource link. 95 if (!unique_check->resources_set->insert(u).second) 96 return; 97 result->resources_list->push_back(u); 98 // Insert referrer for above new resource link. 99 result->referrer_urls_list->push_back(GURL()); 100 result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault); 101} 102 103// Get all savable resource links from current WebFrameImpl object pointer. 104void GetAllSavableResourceLinksForFrame(WebFrame* current_frame, 105 SavableResourcesUniqueCheck* unique_check, 106 SavableResourcesResult* result, 107 const char** savable_schemes) { 108 // Get current frame's URL. 109 GURL current_frame_url = current_frame->document().url(); 110 111 // If url of current frame is invalid, ignore it. 112 if (!current_frame_url.is_valid()) 113 return; 114 115 // If url of current frame is not a savable protocol, ignore it. 116 bool is_valid_protocol = false; 117 for (int i = 0; savable_schemes[i] != NULL; ++i) { 118 if (current_frame_url.SchemeIs(savable_schemes[i])) { 119 is_valid_protocol = true; 120 break; 121 } 122 } 123 if (!is_valid_protocol) 124 return; 125 126 // If find same frame we have recorded, ignore it. 127 if (!unique_check->frames_set->insert(current_frame_url).second) 128 return; 129 130 // Get current using document. 131 WebDocument current_doc = current_frame->document(); 132 // Go through all descent nodes. 133 WebNodeCollection all = current_doc.all(); 134 // Go through all node in this frame. 135 for (WebNode node = all.firstItem(); !node.isNull(); 136 node = all.nextItem()) { 137 // We only save HTML resources. 138 if (!node.isElementNode()) 139 continue; 140 WebElement element = node.to<WebElement>(); 141 GetSavableResourceLinkForElement(element, 142 current_doc, 143 unique_check, 144 result); 145 } 146} 147 148} // namespace 149 150WebString GetSubResourceLinkFromElement(const WebElement& element) { 151 const char* attribute_name = NULL; 152 if (element.hasHTMLTagName("img") || 153 element.hasHTMLTagName("script")) { 154 attribute_name = "src"; 155 } else if (element.hasHTMLTagName("input")) { 156 const WebInputElement input = element.toConst<WebInputElement>(); 157 if (input.isImageButton()) { 158 attribute_name = "src"; 159 } 160 } else if (element.hasHTMLTagName("body") || 161 element.hasHTMLTagName("table") || 162 element.hasHTMLTagName("tr") || 163 element.hasHTMLTagName("td")) { 164 attribute_name = "background"; 165 } else if (element.hasHTMLTagName("blockquote") || 166 element.hasHTMLTagName("q") || 167 element.hasHTMLTagName("del") || 168 element.hasHTMLTagName("ins")) { 169 attribute_name = "cite"; 170 } else if (element.hasHTMLTagName("link")) { 171 // If the link element is not linked to css, ignore it. 172 if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) { 173 // TODO(jnd): Add support for extracting links of sub-resources which 174 // are inside style-sheet such as @import, url(), etc. 175 // See bug: http://b/issue?id=1111667. 176 attribute_name = "href"; 177 } 178 } 179 if (!attribute_name) 180 return WebString(); 181 WebString value = element.getAttribute(WebString::fromUTF8(attribute_name)); 182 // If value has content and not start with "javascript:" then return it, 183 // otherwise return NULL. 184 if (!value.isNull() && !value.isEmpty() && 185 !StartsWithASCII(value.utf8(), "javascript:", false)) 186 return value; 187 188 return WebString(); 189} 190 191// Get all savable resource links from current webview, include main 192// frame and sub-frame 193bool GetAllSavableResourceLinksForCurrentPage(WebView* view, 194 const GURL& page_url, SavableResourcesResult* result, 195 const char** savable_schemes) { 196 WebFrame* main_frame = view->mainFrame(); 197 if (!main_frame) 198 return false; 199 200 std::set<GURL> resources_set; 201 std::set<GURL> frames_set; 202 std::vector<WebFrame*> frames; 203 SavableResourcesUniqueCheck unique_check(&resources_set, 204 &frames_set, 205 &frames); 206 207 GURL main_page_gurl(main_frame->document().url()); 208 209 // Make sure we are saving same page between embedder and webkit. 210 // If page has being navigated, embedder will get three empty vector, 211 // which will make the saving page job ended. 212 if (page_url != main_page_gurl) 213 return true; 214 215 // First, process main frame. 216 frames.push_back(main_frame); 217 218 // Check all resource in this page, include sub-frame. 219 for (int i = 0; i < static_cast<int>(frames.size()); ++i) { 220 // Get current frame's all savable resource links. 221 GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result, 222 savable_schemes); 223 } 224 225 // Since frame's src can also point to sub-resources link, so it is possible 226 // that some URLs in frames_list are also in resources_list. For those 227 // URLs, we will remove it from frame_list, only keep them in resources_list. 228 for (std::set<GURL>::iterator it = frames_set.begin(); 229 it != frames_set.end(); ++it) { 230 // Append unique frame source to savable frame list. 231 if (resources_set.find(*it) == resources_set.end()) 232 result->frames_list->push_back(*it); 233 } 234 235 return true; 236} 237 238} // namespace content 239