1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "content/renderer/savable_resources.h"
6
7#include <set>
8
9#include "base/compiler_specific.h"
10#include "base/logging.h"
11#include "base/strings/string_util.h"
12#include "third_party/WebKit/public/platform/WebString.h"
13#include "third_party/WebKit/public/platform/WebVector.h"
14#include "third_party/WebKit/public/web/WebDocument.h"
15#include "third_party/WebKit/public/web/WebElement.h"
16#include "third_party/WebKit/public/web/WebFrame.h"
17#include "third_party/WebKit/public/web/WebInputElement.h"
18#include "third_party/WebKit/public/web/WebNode.h"
19#include "third_party/WebKit/public/web/WebNodeCollection.h"
20#include "third_party/WebKit/public/web/WebNodeList.h"
21#include "third_party/WebKit/public/web/WebView.h"
22
23using blink::WebDocument;
24using blink::WebElement;
25using blink::WebFrame;
26using blink::WebInputElement;
27using blink::WebNode;
28using blink::WebNodeCollection;
29using blink::WebNodeList;
30using blink::WebString;
31using blink::WebVector;
32using blink::WebView;
33
34namespace content {
35namespace {
36
37// Structure for storage the unique set of all savable resource links for
38// making sure that no duplicated resource link in final result. The consumer
39// of the SavableResourcesUniqueCheck is responsible for keeping these pointers
40// valid for the lifetime of the SavableResourcesUniqueCheck instance.
41struct SavableResourcesUniqueCheck {
42  // Unique set of all sub resource links.
43  std::set<GURL>* resources_set;
44  // Unique set of all frame links.
45  std::set<GURL>* frames_set;
46  // Collection of all frames we go through when getting all savable resource
47  // links.
48  std::vector<WebFrame*>* frames;
49
50  SavableResourcesUniqueCheck()
51      : resources_set(NULL),
52        frames_set(NULL),
53        frames(NULL) {}
54
55  SavableResourcesUniqueCheck(std::set<GURL>* resources_set,
56      std::set<GURL>* frames_set, std::vector<WebFrame*>* frames)
57      : resources_set(resources_set),
58        frames_set(frames_set),
59        frames(frames) {}
60};
61
62// Get all savable resource links from current element. One element might
63// have more than one resource link. It is possible to have some links
64// in one CSS stylesheet.
65void GetSavableResourceLinkForElement(
66    const WebElement& element,
67    const WebDocument& current_doc,
68    SavableResourcesUniqueCheck* unique_check,
69    SavableResourcesResult* result) {
70
71  // Handle frame and iframe tag.
72  if (element.hasTagName("iframe") ||
73      element.hasTagName("frame")) {
74    WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
75    if (sub_frame)
76      unique_check->frames->push_back(sub_frame);
77    return;
78  }
79
80  // Check whether the node has sub resource URL or not.
81  WebString value = GetSubResourceLinkFromElement(element);
82  if (value.isNull())
83    return;
84  // Get absolute URL.
85  GURL u = current_doc.completeURL(value);
86  // ignore invalid URL
87  if (!u.is_valid())
88    return;
89  // Ignore those URLs which are not standard protocols. Because FTP
90  // protocol does no have cache mechanism, we will skip all
91  // sub-resources if they use FTP protocol.
92  if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file"))
93    return;
94  // Ignore duplicated resource link.
95  if (!unique_check->resources_set->insert(u).second)
96    return;
97  result->resources_list->push_back(u);
98  // Insert referrer for above new resource link.
99  result->referrer_urls_list->push_back(GURL());
100  result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault);
101}
102
103// Get all savable resource links from current WebFrameImpl object pointer.
104void GetAllSavableResourceLinksForFrame(WebFrame* current_frame,
105    SavableResourcesUniqueCheck* unique_check,
106    SavableResourcesResult* result,
107    const char** savable_schemes) {
108  // Get current frame's URL.
109  GURL current_frame_url = current_frame->document().url();
110
111  // If url of current frame is invalid, ignore it.
112  if (!current_frame_url.is_valid())
113    return;
114
115  // If url of current frame is not a savable protocol, ignore it.
116  bool is_valid_protocol = false;
117  for (int i = 0; savable_schemes[i] != NULL; ++i) {
118    if (current_frame_url.SchemeIs(savable_schemes[i])) {
119      is_valid_protocol = true;
120      break;
121    }
122  }
123  if (!is_valid_protocol)
124    return;
125
126  // If find same frame we have recorded, ignore it.
127  if (!unique_check->frames_set->insert(current_frame_url).second)
128    return;
129
130  // Get current using document.
131  WebDocument current_doc = current_frame->document();
132  // Go through all descent nodes.
133  WebNodeCollection all = current_doc.all();
134  // Go through all node in this frame.
135  for (WebNode node = all.firstItem(); !node.isNull();
136       node = all.nextItem()) {
137    // We only save HTML resources.
138    if (!node.isElementNode())
139      continue;
140    WebElement element = node.to<WebElement>();
141    GetSavableResourceLinkForElement(element,
142                                     current_doc,
143                                     unique_check,
144                                     result);
145  }
146}
147
148}  // namespace
149
150WebString GetSubResourceLinkFromElement(const WebElement& element) {
151  const char* attribute_name = NULL;
152  if (element.hasHTMLTagName("img") ||
153      element.hasHTMLTagName("script")) {
154    attribute_name = "src";
155  } else if (element.hasHTMLTagName("input")) {
156    const WebInputElement input = element.toConst<WebInputElement>();
157    if (input.isImageButton()) {
158      attribute_name = "src";
159    }
160  } else if (element.hasHTMLTagName("body") ||
161             element.hasHTMLTagName("table") ||
162             element.hasHTMLTagName("tr") ||
163             element.hasHTMLTagName("td")) {
164    attribute_name = "background";
165  } else if (element.hasHTMLTagName("blockquote") ||
166             element.hasHTMLTagName("q") ||
167             element.hasHTMLTagName("del") ||
168             element.hasHTMLTagName("ins")) {
169    attribute_name = "cite";
170  } else if (element.hasHTMLTagName("link")) {
171    // If the link element is not linked to css, ignore it.
172    if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) {
173      // TODO(jnd): Add support for extracting links of sub-resources which
174      // are inside style-sheet such as @import, url(), etc.
175      // See bug: http://b/issue?id=1111667.
176      attribute_name = "href";
177    }
178  }
179  if (!attribute_name)
180    return WebString();
181  WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
182  // If value has content and not start with "javascript:" then return it,
183  // otherwise return NULL.
184  if (!value.isNull() && !value.isEmpty() &&
185      !StartsWithASCII(value.utf8(), "javascript:", false))
186    return value;
187
188  return WebString();
189}
190
191// Get all savable resource links from current webview, include main
192// frame and sub-frame
193bool GetAllSavableResourceLinksForCurrentPage(WebView* view,
194    const GURL& page_url, SavableResourcesResult* result,
195    const char** savable_schemes) {
196  WebFrame* main_frame = view->mainFrame();
197  if (!main_frame)
198    return false;
199
200  std::set<GURL> resources_set;
201  std::set<GURL> frames_set;
202  std::vector<WebFrame*> frames;
203  SavableResourcesUniqueCheck unique_check(&resources_set,
204                                           &frames_set,
205                                           &frames);
206
207  GURL main_page_gurl(main_frame->document().url());
208
209  // Make sure we are saving same page between embedder and webkit.
210  // If page has being navigated, embedder will get three empty vector,
211  // which will make the saving page job ended.
212  if (page_url != main_page_gurl)
213    return true;
214
215  // First, process main frame.
216  frames.push_back(main_frame);
217
218  // Check all resource in this page, include sub-frame.
219  for (int i = 0; i < static_cast<int>(frames.size()); ++i) {
220    // Get current frame's all savable resource links.
221    GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result,
222                                       savable_schemes);
223  }
224
225  // Since frame's src can also point to sub-resources link, so it is possible
226  // that some URLs in frames_list are also in resources_list. For those
227  // URLs, we will remove it from frame_list, only keep them in resources_list.
228  for (std::set<GURL>::iterator it = frames_set.begin();
229       it != frames_set.end(); ++it) {
230    // Append unique frame source to savable frame list.
231    if (resources_set.find(*it) == resources_set.end())
232      result->frames_list->push_back(*it);
233  }
234
235  return true;
236}
237
238}  // namespace content
239