1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// BrowserFeatureExtractor computes various browser features for client-side
6// phishing detection.  For now it does a bunch of lookups in the history
7// service to see whether a particular URL has been visited before by the
8// user.
9
10#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
11#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
12
13#include <map>
14#include <set>
15#include <string>
16#include <utility>
17#include <vector>
18
19#include "base/basictypes.h"
20#include "base/callback.h"
21#include "base/containers/hash_tables.h"
22#include "base/memory/scoped_ptr.h"
23#include "base/sequenced_task_runner_helpers.h"
24#include "base/time/time.h"
25#include "chrome/browser/common/cancelable_request.h"
26#include "chrome/browser/history/history_types.h"
27#include "chrome/browser/safe_browsing/safe_browsing_service.h"
28#include "chrome/browser/safe_browsing/ui_manager.h"
29#include "url/gurl.h"
30#include "webkit/common/resource_type.h"
31
32
33class HistoryService;
34
35namespace content {
36class WebContents;
37}
38
39namespace safe_browsing {
40class ClientMalwareRequest;
41class ClientPhishingRequest;
42class ClientSideDetectionHost;
43
44struct IPUrlInfo {
45  // The url on the bad IP address.
46  std::string url;
47  std::string method;
48  std::string referrer;
49  ResourceType::Type resource_type;
50
51  IPUrlInfo(const std::string& url,
52            const std::string& method,
53            const std::string& referrer,
54            const ResourceType::Type& resource_type);
55  ~IPUrlInfo();
56};
57
58typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap;
59
60struct BrowseInfo {
61  // List of IPv4 and IPv6 addresses from which content was requested
62  // together with the hosts on it, while browsing to the |url|.
63  IPUrlMap ips;
64
65  // If a SafeBrowsing interstitial was shown for the current URL
66  // this will contain the UnsafeResource struct for that URL.
67  scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
68
69  // List of redirects that lead to the first page on the current host and
70  // the current url respectively. These may be the same if the current url
71  // is the first page on its host.
72  std::vector<GURL> host_redirects;
73  std::vector<GURL> url_redirects;
74
75  // URL of the referrer of this URL load.
76  GURL referrer;
77
78  // The HTTP status code from this navigation.
79  int http_status_code;
80
81  BrowseInfo();
82  ~BrowseInfo();
83};
84
85// All methods of this class must be called on the UI thread (including
86// the constructor).
87class BrowserFeatureExtractor {
88 public:
89  // Called when feature extraction is done.  The first argument will be
90  // true iff feature extraction succeeded.  The second argument is the
91  // phishing request which was modified by the feature extractor.  The
92  // DoneCallback takes ownership of the request object.
93  typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
94  typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)>
95      MalwareDoneCallback;
96
97  // The caller keeps ownership of the tab and host objects and is
98  // responsible for ensuring that they stay valid for the entire
99  // lifetime of this object.
100  BrowserFeatureExtractor(content::WebContents* tab,
101                          ClientSideDetectionHost* host);
102
103  // The destructor will cancel any pending requests.
104  virtual ~BrowserFeatureExtractor();
105
106  // Begins extraction of the browser features.  We take ownership
107  // of the request object until |callback| is called (see DoneCallback above)
108  // and will write the extracted features to the feature map.  Once the
109  // feature extraction is complete, |callback| is run on the UI thread.  We
110  // take ownership of the |callback| object.  |info| may not be valid after
111  // ExtractFeatures returns.  This method must run on the UI thread.
112  virtual void ExtractFeatures(const BrowseInfo* info,
113                               ClientPhishingRequest* request,
114                               const DoneCallback& callback);
115
116  // Begins extraction of the malware related features.  We take ownership
117  // of the request object until |callback| is called.  Once feature extraction
118  // is complete, |callback| will run on the UI thread.  |info| is not expected
119  // to stay valid after ExtractMalwareFeatures returns.  All IPs stored in
120  // |info| will be cleared by calling this function.
121  virtual void ExtractMalwareFeatures(BrowseInfo* info,
122                                      ClientMalwareRequest* request,
123                                      const MalwareDoneCallback& callback);
124
125 private:
126  friend class base::DeleteHelper<BrowserFeatureExtractor>;
127  typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
128  typedef std::map<CancelableRequestProvider::Handle,
129                   ExtractionData> PendingQueriesMap;
130
131  // Synchronous browser feature extraction.
132  void ExtractBrowseInfoFeatures(const BrowseInfo& info,
133                                 ClientPhishingRequest* request);
134
135  // Actually starts feature extraction (does the real work).
136  void StartExtractFeatures(ClientPhishingRequest* request,
137                            const DoneCallback& callback);
138
139  // HistoryService callback which is called when we're done querying URL visits
140  // in the history.
141  void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle,
142                           bool success,
143                           const history::URLRow* row,
144                           history::VisitVector* visits);
145
146  // HistoryService callback which is called when we're done querying HTTP host
147  // visits in the history.
148  void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
149                               bool success,
150                               int num_visits,
151                               base::Time first_visit);
152
153  // HistoryService callback which is called when we're done querying HTTPS host
154  // visits in the history.
155  void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
156                                bool success,
157                                int num_visits,
158                                base::Time first_visit);
159
160  // Helper function which sets the host history features given the
161  // number of host visits and the time of the fist host visit.  Set
162  // |is_http_query| to true if the URL scheme is HTTP and to false if
163  // the scheme is HTTPS.
164  void SetHostVisitsFeatures(int num_visits,
165                             base::Time first_visit,
166                             bool is_http_query,
167                             ClientPhishingRequest* request);
168
169  // Helper function which stores the request and callback while the history
170  // query is being processed.
171  void StorePendingQuery(CancelableRequestProvider::Handle handle,
172                         ClientPhishingRequest* request,
173                         const DoneCallback& callback);
174
175  // Helper function which is the counterpart of StorePendingQuery.  If there
176  // is a pending query for the given handle it will return false and set both
177  // the request and cb pointers.  Otherwise, it will return false.
178  bool GetPendingQuery(CancelableRequestProvider::Handle handle,
179                       ClientPhishingRequest** request,
180                       DoneCallback* callback);
181
182  // Helper function which gets the history server if possible.  If the pointer
183  // is set it will return true and false otherwise.
184  bool GetHistoryService(HistoryService** history);
185
186  // Helper function which is called when we're done filtering out benign IPs
187  // on the IO thread.  This function is called on the UI thread.
188  void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips,
189                                    MalwareDoneCallback callback,
190                                    scoped_ptr<ClientMalwareRequest> request);
191
192  content::WebContents* tab_;
193  ClientSideDetectionHost* host_;
194  CancelableRequestConsumer request_consumer_;
195  base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
196
197  // Set of pending extractions (i.e. extractions for which ExtractFeatures was
198  // called but not StartExtractFeatures).
199  std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;
200
201  // Set of pending queries (i.e., where history->Query...() was called but
202  // the history callback hasn't been invoked yet).
203  PendingQueriesMap pending_queries_;
204
205  DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
206};
207
208}  // namespace safe_browsing
209#endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
210