// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// BrowserFeatureExtractor computes various browser features for client-side
// phishing detection.  For now it does a bunch of lookups in the history
// service to see whether a particular URL has been visited before by the
// user.
9
10#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
11#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
12
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "base/basictypes.h"
#include "base/callback.h"
#include "base/containers/hash_tables.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "base/task/cancelable_task_tracker.h"
#include "base/time/time.h"
#include "chrome/browser/safe_browsing/safe_browsing_service.h"
#include "chrome/browser/safe_browsing/ui_manager.h"
#include "components/history/core/browser/history_types.h"
#include "content/public/common/resource_type.h"
#include "url/gurl.h"
30
31
// Forward declarations.  This header only refers to these types through
// pointers, so their full definitions are not needed here.
class HistoryService;

namespace content {
class WebContents;
}  // namespace content
37
38namespace safe_browsing {
// Forward declarations of the request and host types that the declarations
// in this header refer to.
class ClientMalwareRequest;
class ClientPhishingRequest;
class ClientSideDetectionHost;
42
43struct IPUrlInfo {
44  // The url on the bad IP address.
45  std::string url;
46  std::string method;
47  std::string referrer;
48  content::ResourceType resource_type;
49
50  IPUrlInfo(const std::string& url,
51            const std::string& method,
52            const std::string& referrer,
53            const content::ResourceType& resource_type);
54  ~IPUrlInfo();
55};
56
57typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap;
58
59struct BrowseInfo {
60  // The URL we're currently browsing.
61  GURL url;
62
63  // List of IPv4 and IPv6 addresses from which content was requested
64  // together with the hosts on it, while browsing to the |url|.
65  IPUrlMap ips;
66
67  // If a SafeBrowsing interstitial was shown for the current URL
68  // this will contain the UnsafeResource struct for that URL.
69  scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
70
71  // List of redirects that lead to the first page on the current host and
72  // the current url respectively. These may be the same if the current url
73  // is the first page on its host.
74  std::vector<GURL> host_redirects;
75  std::vector<GURL> url_redirects;
76
77  // URL of the referrer of this URL load.
78  GURL referrer;
79
80  // The HTTP status code from this navigation.
81  int http_status_code;
82
83  // The page ID of the navigation.  This comes from FrameNavigateParams.
84  int32 page_id;
85
86  BrowseInfo();
87  ~BrowseInfo();
88};
89
90// All methods of this class must be called on the UI thread (including
91// the constructor).
92class BrowserFeatureExtractor {
93 public:
94  // Called when feature extraction is done.  The first argument will be
95  // true iff feature extraction succeeded.  The second argument is the
96  // phishing request which was modified by the feature extractor.  The
97  // DoneCallback takes ownership of the request object.
98  typedef base::Callback<void(bool, scoped_ptr<ClientPhishingRequest>)>
99      DoneCallback;
100  typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)>
101      MalwareDoneCallback;
102
103  // The caller keeps ownership of the tab and host objects and is
104  // responsible for ensuring that they stay valid for the entire
105  // lifetime of this object.
106  BrowserFeatureExtractor(content::WebContents* tab,
107                          ClientSideDetectionHost* host);
108
109  // The destructor will cancel any pending requests.
110  virtual ~BrowserFeatureExtractor();
111
112  // Begins extraction of the browser features.  We take ownership
113  // of the request object until |callback| is called (see DoneCallback above)
114  // and will write the extracted features to the feature map.  Once the
115  // feature extraction is complete, |callback| is run on the UI thread.  We
116  // take ownership of the |callback| object.  |info| may not be valid after
117  // ExtractFeatures returns.  This method must run on the UI thread.
118  virtual void ExtractFeatures(const BrowseInfo* info,
119                               ClientPhishingRequest* request,
120                               const DoneCallback& callback);
121
122  // Begins extraction of the malware related features.  We take ownership
123  // of the request object until |callback| is called.  Once feature extraction
124  // is complete, |callback| will run on the UI thread.  |info| is not expected
125  // to stay valid after ExtractMalwareFeatures returns.  All IPs stored in
126  // |info| will be cleared by calling this function.
127  virtual void ExtractMalwareFeatures(BrowseInfo* info,
128                                      ClientMalwareRequest* request,
129                                      const MalwareDoneCallback& callback);
130
131 private:
132  // Synchronous browser feature extraction.
133  void ExtractBrowseInfoFeatures(const BrowseInfo& info,
134                                 ClientPhishingRequest* request);
135
136  // Actually starts feature extraction (does the real work).
137  void StartExtractFeatures(scoped_ptr<ClientPhishingRequest> request,
138                            const DoneCallback& callback);
139
140  // HistoryService callback which is called when we're done querying URL visits
141  // in the history.
142  void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request,
143                           const DoneCallback& callback,
144                           bool success,
145                           const history::URLRow& row,
146                           const history::VisitVector& visits);
147
148  // HistoryService callback which is called when we're done querying HTTP host
149  // visits in the history.
150  void QueryHttpHostVisitsDone(scoped_ptr<ClientPhishingRequest> request,
151                               const DoneCallback& callback,
152                               bool success,
153                               int num_visits,
154                               base::Time first_visit);
155
156  // HistoryService callback which is called when we're done querying HTTPS host
157  // visits in the history.
158  void QueryHttpsHostVisitsDone(scoped_ptr<ClientPhishingRequest> request,
159                                const DoneCallback& callback,
160                                bool success,
161                                int num_visits,
162                                base::Time first_visit);
163
164  // Helper function which sets the host history features given the
165  // number of host visits and the time of the fist host visit.  Set
166  // |is_http_query| to true if the URL scheme is HTTP and to false if
167  // the scheme is HTTPS.
168  void SetHostVisitsFeatures(int num_visits,
169                             base::Time first_visit,
170                             bool is_http_query,
171                             ClientPhishingRequest* request);
172
173  // Helper function which gets the history server if possible.  If the pointer
174  // is set it will return true and false otherwise.
175  bool GetHistoryService(HistoryService** history);
176
177  // Helper function which is called when we're done filtering out benign IPs
178  // on the IO thread.  This function is called on the UI thread.
179  void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips,
180                                    MalwareDoneCallback callback,
181                                    scoped_ptr<ClientMalwareRequest> request);
182
183  content::WebContents* tab_;
184  ClientSideDetectionHost* host_;
185  base::CancelableTaskTracker cancelable_task_tracker_;
186  base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
187
188  DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
189};
190
191}  // namespace safe_browsing
192#endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
193