1// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// BrowserFeatureExtractor computes various browser features for client-side 6// phishing detection. For now it does a bunch of lookups in the history 7// service to see whether a particular URL has been visited before by the 8// user. 9 10#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 11#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 12 13#include <map> 14#include <set> 15#include <string> 16#include <utility> 17#include <vector> 18 19#include "base/basictypes.h" 20#include "base/callback.h" 21#include "base/containers/hash_tables.h" 22#include "base/memory/scoped_ptr.h" 23#include "base/task/cancelable_task_tracker.h" 24#include "base/time/time.h" 25#include "chrome/browser/safe_browsing/safe_browsing_service.h" 26#include "chrome/browser/safe_browsing/ui_manager.h" 27#include "components/history/core/browser/history_types.h" 28#include "content/public/common/resource_type.h" 29#include "url/gurl.h" 30 31 32class HistoryService; 33 34namespace content { 35class WebContents; 36} 37 38namespace safe_browsing { 39class ClientMalwareRequest; 40class ClientPhishingRequest; 41class ClientSideDetectionHost; 42 43struct IPUrlInfo { 44 // The url on the bad IP address. 45 std::string url; 46 std::string method; 47 std::string referrer; 48 content::ResourceType resource_type; 49 50 IPUrlInfo(const std::string& url, 51 const std::string& method, 52 const std::string& referrer, 53 const content::ResourceType& resource_type); 54 ~IPUrlInfo(); 55}; 56 57typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap; 58 59struct BrowseInfo { 60 // The URL we're currently browsing. 61 GURL url; 62 63 // List of IPv4 and IPv6 addresses from which content was requested 64 // together with the hosts on it, while browsing to the |url|. 65 IPUrlMap ips; 66 67 // If a SafeBrowsing interstitial was shown for the current URL 68 // this will contain the UnsafeResource struct for that URL. 69 scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource; 70 71 // List of redirects that lead to the first page on the current host and 72 // the current url respectively. These may be the same if the current url 73 // is the first page on its host. 74 std::vector<GURL> host_redirects; 75 std::vector<GURL> url_redirects; 76 77 // URL of the referrer of this URL load. 78 GURL referrer; 79 80 // The HTTP status code from this navigation. 81 int http_status_code; 82 83 // The page ID of the navigation. This comes from FrameNavigateParams. 84 int32 page_id; 85 86 BrowseInfo(); 87 ~BrowseInfo(); 88}; 89 90// All methods of this class must be called on the UI thread (including 91// the constructor). 92class BrowserFeatureExtractor { 93 public: 94 // Called when feature extraction is done. The first argument will be 95 // true iff feature extraction succeeded. The second argument is the 96 // phishing request which was modified by the feature extractor. The 97 // DoneCallback takes ownership of the request object. 98 typedef base::Callback<void(bool, scoped_ptr<ClientPhishingRequest>)> 99 DoneCallback; 100 typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)> 101 MalwareDoneCallback; 102 103 // The caller keeps ownership of the tab and host objects and is 104 // responsible for ensuring that they stay valid for the entire 105 // lifetime of this object. 106 BrowserFeatureExtractor(content::WebContents* tab, 107 ClientSideDetectionHost* host); 108 109 // The destructor will cancel any pending requests. 110 virtual ~BrowserFeatureExtractor(); 111 112 // Begins extraction of the browser features. We take ownership 113 // of the request object until |callback| is called (see DoneCallback above) 114 // and will write the extracted features to the feature map. Once the 115 // feature extraction is complete, |callback| is run on the UI thread. We 116 // take ownership of the |callback| object. |info| may not be valid after 117 // ExtractFeatures returns. This method must run on the UI thread. 118 virtual void ExtractFeatures(const BrowseInfo* info, 119 ClientPhishingRequest* request, 120 const DoneCallback& callback); 121 122 // Begins extraction of the malware related features. We take ownership 123 // of the request object until |callback| is called. Once feature extraction 124 // is complete, |callback| will run on the UI thread. |info| is not expected 125 // to stay valid after ExtractMalwareFeatures returns. All IPs stored in 126 // |info| will be cleared by calling this function. 127 virtual void ExtractMalwareFeatures(BrowseInfo* info, 128 ClientMalwareRequest* request, 129 const MalwareDoneCallback& callback); 130 131 private: 132 // Synchronous browser feature extraction. 133 void ExtractBrowseInfoFeatures(const BrowseInfo& info, 134 ClientPhishingRequest* request); 135 136 // Actually starts feature extraction (does the real work). 137 void StartExtractFeatures(scoped_ptr<ClientPhishingRequest> request, 138 const DoneCallback& callback); 139 140 // HistoryService callback which is called when we're done querying URL visits 141 // in the history. 142 void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request, 143 const DoneCallback& callback, 144 bool success, 145 const history::URLRow& row, 146 const history::VisitVector& visits); 147 148 // HistoryService callback which is called when we're done querying HTTP host 149 // visits in the history. 150 void QueryHttpHostVisitsDone(scoped_ptr<ClientPhishingRequest> request, 151 const DoneCallback& callback, 152 bool success, 153 int num_visits, 154 base::Time first_visit); 155 156 // HistoryService callback which is called when we're done querying HTTPS host 157 // visits in the history. 158 void QueryHttpsHostVisitsDone(scoped_ptr<ClientPhishingRequest> request, 159 const DoneCallback& callback, 160 bool success, 161 int num_visits, 162 base::Time first_visit); 163 164 // Helper function which sets the host history features given the 165 // number of host visits and the time of the fist host visit. Set 166 // |is_http_query| to true if the URL scheme is HTTP and to false if 167 // the scheme is HTTPS. 168 void SetHostVisitsFeatures(int num_visits, 169 base::Time first_visit, 170 bool is_http_query, 171 ClientPhishingRequest* request); 172 173 // Helper function which gets the history server if possible. If the pointer 174 // is set it will return true and false otherwise. 175 bool GetHistoryService(HistoryService** history); 176 177 // Helper function which is called when we're done filtering out benign IPs 178 // on the IO thread. This function is called on the UI thread. 179 void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips, 180 MalwareDoneCallback callback, 181 scoped_ptr<ClientMalwareRequest> request); 182 183 content::WebContents* tab_; 184 ClientSideDetectionHost* host_; 185 base::CancelableTaskTracker cancelable_task_tracker_; 186 base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 187 188 DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 189}; 190 191} // namespace safe_browsing 192#endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 193