browser_feature_extractor.h revision eb525c5499e34cc9c4b825d6d9e75bb07cc06ace
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BrowserFeatureExtractor computes various browser features for client-side 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// phishing detection. For now it does a bunch of lookups in the history 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// service to see whether a particular URL has been visited before by the 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// user. 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map> 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <utility> 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h" 217d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/sequenced_task_runner_helpers.h" 24eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "base/time/time.h" 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/common/cancelable_request.h" 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/history_types.h" 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_service.h" 282a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "chrome/browser/safe_browsing/ui_manager.h" 29eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "url/gurl.h" 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class HistoryService; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content { 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class WebContents; 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 382a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)class ClientMalwareRequest; 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientPhishingRequest; 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientSideDetectionService; 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 422a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)typedef std::map<std::string, std::set<std::string> > IPHostMap; 432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct BrowseInfo { 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // List of IPv4 and IPv6 addresses from which content was requested 462a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // together with the hosts on it, while browsing to the |url|. 472a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) IPHostMap ips; 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If a SafeBrowsing interstitial was shown for the current URL 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this will contain the UnsafeResource struct for that URL. 512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // List of redirects that lead to the first page on the current host and 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the current url respectively. These may be the same if the current url 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is the first page on its host. 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<GURL> host_redirects; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<GURL> url_redirects; 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The HTTP status code from this navigation. 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int http_status_code; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BrowseInfo(); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~BrowseInfo(); 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All methods of this class must be called on the UI thread (including 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the constructor). 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BrowserFeatureExtractor { 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called when feature extraction is done. The first argument will be 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // true iff feature extraction succeeded. The second argument is the 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // phishing request which was modified by the feature extractor. The 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // DoneCallback takes ownership of the request object. 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback; 752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) typedef base::Callback<void(bool, ClientMalwareRequest*)> MalwareDoneCallback; 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The caller keeps ownership of the tab and service objects and is 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // responsible for ensuring that they stay valid for the entire 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // lifetime of this object. 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BrowserFeatureExtractor(content::WebContents* tab, 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideDetectionService* service); 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The destructor will cancel any pending requests. 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~BrowserFeatureExtractor(); 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begins extraction of the browser features. We take ownership 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the request object until |callback| is called (see DoneCallback above) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and will write the extracted features to the feature map. Once the 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // feature extraction is complete, |callback| is run on the UI thread. We 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // take ownership of the |callback| object. |info| may not be valid after 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ExtractFeatures returns. This method must run on the UI thread. 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void ExtractFeatures(const BrowseInfo* info, 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request, 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 962a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Extract the malware related features. The request object is owned by the 972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // caller. 982a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) virtual void ExtractMalwareFeatures(const BrowseInfo* info, 992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) ClientMalwareRequest* request); 1002a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) friend class base::DeleteHelper<BrowserFeatureExtractor>; 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData; 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef std::map<CancelableRequestProvider::Handle, 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExtractionData> PendingQueriesMap; 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Synchronous browser feature extraction. 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ExtractBrowseInfoFeatures(const BrowseInfo& info, 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request); 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Actually starts feature extraction (does the real work). 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void StartExtractFeatures(ClientPhishingRequest* request, 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying URL visits 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the history. 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle, 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const history::URLRow* row, 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) history::VisitVector* visits); 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying HTTP host 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visits in the history. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle, 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_visits, 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit); 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying HTTPS host 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visits in the history. 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle, 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_visits, 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which sets the host history features given the 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // number of host visits and the time of the fist host visit. Set 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |is_http_query| to true if the URL scheme is HTTP and to false if 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the scheme is HTTPS. 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void SetHostVisitsFeatures(int num_visits, 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit, 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_http_query, 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which stores the request and callback while the history 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // query is being processed. 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void StorePendingQuery(CancelableRequestProvider::Handle handle, 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request, 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which is the counterpart of StorePendingQuery. If there 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is a pending query for the given handle it will return false and set both 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the request and cb pointers. Otherwise, it will return false. 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetPendingQuery(CancelableRequestProvider::Handle handle, 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest** request, 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoneCallback* callback); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which gets the history server if possible. If the pointer 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is set it will return true and false otherwise. 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetHistoryService(HistoryService** history); 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) content::WebContents* tab_; 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideDetectionService* service_; 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CancelableRequestConsumer request_consumer_; 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Set of pending extractions (i.e. extractions for which ExtractFeatures was 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // called but not StartExtractFeatures). 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_; 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Set of pending queries (i.e., where history->Query...() was called but 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the history callback hasn't been invoked yet). 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PendingQueriesMap pending_queries_; 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 180