1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "content/renderer/savable_resources.h"
6
7#include <set>
8
9#include "base/compiler_specific.h"
10#include "base/logging.h"
11#include "base/strings/string_util.h"
12#include "third_party/WebKit/public/platform/WebString.h"
13#include "third_party/WebKit/public/platform/WebVector.h"
14#include "third_party/WebKit/public/web/WebDocument.h"
15#include "third_party/WebKit/public/web/WebElement.h"
16#include "third_party/WebKit/public/web/WebElementCollection.h"
17#include "third_party/WebKit/public/web/WebInputElement.h"
18#include "third_party/WebKit/public/web/WebLocalFrame.h"
19#include "third_party/WebKit/public/web/WebNode.h"
20#include "third_party/WebKit/public/web/WebNodeList.h"
21#include "third_party/WebKit/public/web/WebView.h"
22
23using blink::WebDocument;
24using blink::WebElement;
25using blink::WebElementCollection;
26using blink::WebFrame;
27using blink::WebInputElement;
28using blink::WebLocalFrame;
29using blink::WebNode;
30using blink::WebNodeList;
31using blink::WebString;
32using blink::WebVector;
33using blink::WebView;
34
35namespace content {
36namespace {
37
38// Structure for storage the unique set of all savable resource links for
39// making sure that no duplicated resource link in final result. The consumer
40// of the SavableResourcesUniqueCheck is responsible for keeping these pointers
41// valid for the lifetime of the SavableResourcesUniqueCheck instance.
42struct SavableResourcesUniqueCheck {
43  // Unique set of all sub resource links.
44  std::set<GURL>* resources_set;
45  // Unique set of all frame links.
46  std::set<GURL>* frames_set;
47  // Collection of all frames we go through when getting all savable resource
48  // links.
49  std::vector<WebFrame*>* frames;
50
51  SavableResourcesUniqueCheck()
52      : resources_set(NULL),
53        frames_set(NULL),
54        frames(NULL) {}
55
56  SavableResourcesUniqueCheck(std::set<GURL>* resources_set,
57      std::set<GURL>* frames_set, std::vector<WebFrame*>* frames)
58      : resources_set(resources_set),
59        frames_set(frames_set),
60        frames(frames) {}
61};
62
63// Get all savable resource links from current element. One element might
64// have more than one resource link. It is possible to have some links
65// in one CSS stylesheet.
66void GetSavableResourceLinkForElement(
67    const WebElement& element,
68    const WebDocument& current_doc,
69    SavableResourcesUniqueCheck* unique_check,
70    SavableResourcesResult* result) {
71
72  // Handle frame and iframe tag.
73  if (element.hasHTMLTagName("iframe") ||
74      element.hasHTMLTagName("frame")) {
75    WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element);
76    if (sub_frame)
77      unique_check->frames->push_back(sub_frame);
78    return;
79  }
80
81  // Check whether the node has sub resource URL or not.
82  WebString value = GetSubResourceLinkFromElement(element);
83  if (value.isNull())
84    return;
85  // Get absolute URL.
86  GURL u = current_doc.completeURL(value);
87  // ignore invalid URL
88  if (!u.is_valid())
89    return;
90  // Ignore those URLs which are not standard protocols. Because FTP
91  // protocol does no have cache mechanism, we will skip all
92  // sub-resources if they use FTP protocol.
93  if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs(url::kFileScheme))
94    return;
95  // Ignore duplicated resource link.
96  if (!unique_check->resources_set->insert(u).second)
97    return;
98  result->resources_list->push_back(u);
99  // Insert referrer for above new resource link.
100  result->referrer_urls_list->push_back(GURL());
101  result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault);
102}
103
104// Get all savable resource links from current WebFrameImpl object pointer.
105void GetAllSavableResourceLinksForFrame(WebFrame* current_frame,
106    SavableResourcesUniqueCheck* unique_check,
107    SavableResourcesResult* result,
108    const char** savable_schemes) {
109  // Get current frame's URL.
110  GURL current_frame_url = current_frame->document().url();
111
112  // If url of current frame is invalid, ignore it.
113  if (!current_frame_url.is_valid())
114    return;
115
116  // If url of current frame is not a savable protocol, ignore it.
117  bool is_valid_protocol = false;
118  for (int i = 0; savable_schemes[i] != NULL; ++i) {
119    if (current_frame_url.SchemeIs(savable_schemes[i])) {
120      is_valid_protocol = true;
121      break;
122    }
123  }
124  if (!is_valid_protocol)
125    return;
126
127  // If find same frame we have recorded, ignore it.
128  if (!unique_check->frames_set->insert(current_frame_url).second)
129    return;
130
131  // Get current using document.
132  WebDocument current_doc = current_frame->document();
133  // Go through all descent nodes.
134  WebElementCollection all = current_doc.all();
135  // Go through all elements in this frame.
136  for (WebElement element = all.firstItem(); !element.isNull();
137       element = all.nextItem()) {
138    GetSavableResourceLinkForElement(element,
139                                     current_doc,
140                                     unique_check,
141                                     result);
142  }
143}
144
145}  // namespace
146
147WebString GetSubResourceLinkFromElement(const WebElement& element) {
148  const char* attribute_name = NULL;
149  if (element.hasHTMLTagName("img") ||
150      element.hasHTMLTagName("script")) {
151    attribute_name = "src";
152  } else if (element.hasHTMLTagName("input")) {
153    const WebInputElement input = element.toConst<WebInputElement>();
154    if (input.isImageButton()) {
155      attribute_name = "src";
156    }
157  } else if (element.hasHTMLTagName("body") ||
158             element.hasHTMLTagName("table") ||
159             element.hasHTMLTagName("tr") ||
160             element.hasHTMLTagName("td")) {
161    attribute_name = "background";
162  } else if (element.hasHTMLTagName("blockquote") ||
163             element.hasHTMLTagName("q") ||
164             element.hasHTMLTagName("del") ||
165             element.hasHTMLTagName("ins")) {
166    attribute_name = "cite";
167  } else if (element.hasHTMLTagName("link")) {
168    // If the link element is not linked to css, ignore it.
169    if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) {
170      // TODO(jnd): Add support for extracting links of sub-resources which
171      // are inside style-sheet such as @import, url(), etc.
172      // See bug: http://b/issue?id=1111667.
173      attribute_name = "href";
174    }
175  }
176  if (!attribute_name)
177    return WebString();
178  WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
179  // If value has content and not start with "javascript:" then return it,
180  // otherwise return NULL.
181  if (!value.isNull() && !value.isEmpty() &&
182      !StartsWithASCII(value.utf8(), "javascript:", false))
183    return value;
184
185  return WebString();
186}
187
188// Get all savable resource links from current webview, include main
189// frame and sub-frame
190bool GetAllSavableResourceLinksForCurrentPage(WebView* view,
191    const GURL& page_url, SavableResourcesResult* result,
192    const char** savable_schemes) {
193  WebFrame* main_frame = view->mainFrame();
194  if (!main_frame)
195    return false;
196
197  std::set<GURL> resources_set;
198  std::set<GURL> frames_set;
199  std::vector<WebFrame*> frames;
200  SavableResourcesUniqueCheck unique_check(&resources_set,
201                                           &frames_set,
202                                           &frames);
203
204  GURL main_page_gurl(main_frame->document().url());
205
206  // Make sure we are saving same page between embedder and webkit.
207  // If page has being navigated, embedder will get three empty vector,
208  // which will make the saving page job ended.
209  if (page_url != main_page_gurl)
210    return true;
211
212  // First, process main frame.
213  frames.push_back(main_frame);
214
215  // Check all resource in this page, include sub-frame.
216  for (int i = 0; i < static_cast<int>(frames.size()); ++i) {
217    // Get current frame's all savable resource links.
218    GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result,
219                                       savable_schemes);
220  }
221
222  // Since frame's src can also point to sub-resources link, so it is possible
223  // that some URLs in frames_list are also in resources_list. For those
224  // URLs, we will remove it from frame_list, only keep them in resources_list.
225  for (std::set<GURL>::iterator it = frames_set.begin();
226       it != frames_set.end(); ++it) {
227    // Append unique frame source to savable frame list.
228    if (resources_set.find(*it) == resources_set.end())
229      result->frames_list->push_back(*it);
230  }
231
232  return true;
233}
234
235}  // namespace content
236