browser_feature_extractor.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BrowserFeatureExtractor computes various browser features for client-side 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// phishing detection. For now it does a bunch of lookups in the history 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// service to see whether a particular URL has been visited before by the 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// user. 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map> 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <utility> 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h" 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/sequenced_task_runner_helpers.h" 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/time.h" 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/common/cancelable_request.h" 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/history_types.h" 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_service.h" 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "googleurl/src/gurl.h" 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class HistoryService; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content { 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class WebContents; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientPhishingRequest; 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientSideDetectionService; 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct BrowseInfo { 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // List of IPv4 and IPv6 addresses from which content was requested 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // while browsing to the |url|. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::set<std::string> ips; 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If a SafeBrowsing interstitial was shown for the current URL 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this will contain the UnsafeResource struct for that URL. 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<SafeBrowsingService::UnsafeResource> unsafe_resource; 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // List of redirects that lead to the first page on the current host and 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the current url respectively. These may be the same if the current url 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is the first page on its host. 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<GURL> host_redirects; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<GURL> url_redirects; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The HTTP status code from this navigation. 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int http_status_code; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BrowseInfo(); 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~BrowseInfo(); 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All methods of this class must be called on the UI thread (including 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the constructor). 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BrowserFeatureExtractor { 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called when feature extraction is done. The first argument will be 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // true iff feature extraction succeeded. The second argument is the 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // phishing request which was modified by the feature extractor. The 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // DoneCallback takes ownership of the request object. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The caller keeps ownership of the tab and service objects and is 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // responsible for ensuring that they stay valid for the entire 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // lifetime of this object. 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BrowserFeatureExtractor(content::WebContents* tab, 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideDetectionService* service); 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The destructor will cancel any pending requests. 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~BrowserFeatureExtractor(); 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begins extraction of the browser features. We take ownership 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the request object until |callback| is called (see DoneCallback above) 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and will write the extracted features to the feature map. Once the 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // feature extraction is complete, |callback| is run on the UI thread. We 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // take ownership of the |callback| object. |info| may not be valid after 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ExtractFeatures returns. This method must run on the UI thread. 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void ExtractFeatures(const BrowseInfo* info, 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request, 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) friend class base::DeleteHelper<BrowserFeatureExtractor>; 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData; 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef std::map<CancelableRequestProvider::Handle, 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExtractionData> PendingQueriesMap; 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Synchronous browser feature extraction. 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ExtractBrowseInfoFeatures(const BrowseInfo& info, 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request); 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Actually starts feature extraction (does the real work). 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void StartExtractFeatures(ClientPhishingRequest* request, 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying URL visits 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the history. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle, 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const history::URLRow* row, 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) history::VisitVector* visits); 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying HTTP host 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visits in the history. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle, 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_visits, 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit); 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying HTTPS host 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visits in the history. 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle, 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_visits, 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit); 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which sets the host history features given the 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // number of host visits and the time of the fist host visit. Set 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |is_http_query| to true if the URL scheme is HTTP and to false if 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the scheme is HTTPS. 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void SetHostVisitsFeatures(int num_visits, 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit, 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_http_query, 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request); 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which stores the request and callback while the history 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // query is being processed. 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void StorePendingQuery(CancelableRequestProvider::Handle handle, 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request, 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which is the counterpart of StorePendingQuery. If there 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is a pending query for the given handle it will return false and set both 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the request and cb pointers. Otherwise, it will return false. 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetPendingQuery(CancelableRequestProvider::Handle handle, 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest** request, 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoneCallback* callback); 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which gets the history server if possible. If the pointer 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is set it will return true and false otherwise. 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetHistoryService(HistoryService** history); 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) content::WebContents* tab_; 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideDetectionService* service_; 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CancelableRequestConsumer request_consumer_; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Set of pending extractions (i.e. extractions for which ExtractFeatures was 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // called but not StartExtractFeatures). 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_; 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Set of pending queries (i.e., where history->Query...() was called but 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the history callback hasn't been invoked yet). 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PendingQueriesMap pending_queries_; 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 169