15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BrowserFeatureExtractor computes various browser features for client-side 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// phishing detection. For now it does a bunch of lookups in the history 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// service to see whether a particular URL has been visited before by the 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// user. 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map> 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <utility> 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h" 217d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 23f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "base/task/cancelable_task_tracker.h" 24eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "base/time/time.h" 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_service.h" 262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "chrome/browser/safe_browsing/ui_manager.h" 271320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "components/history/core/browser/history_types.h" 28116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch#include "content/public/common/resource_type.h" 29eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "url/gurl.h" 30a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class HistoryService; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content { 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class WebContents; 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 392a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)class ClientMalwareRequest; 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientPhishingRequest; 41f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)class ClientSideDetectionHost; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 43a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)struct IPUrlInfo { 44a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // The url on the bad IP address. 45a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) std::string url; 46a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) std::string method; 47a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) std::string referrer; 485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) content::ResourceType resource_type; 49a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 50a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) IPUrlInfo(const std::string& url, 51a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const std::string& method, 52a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const std::string& referrer, 535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const content::ResourceType& resource_type); 54a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) ~IPUrlInfo(); 55a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)}; 56a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 57a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap; 582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct BrowseInfo { 60effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch // The URL we're currently browsing. 61effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch GURL url; 62effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // List of IPv4 and IPv6 addresses from which content was requested 642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // together with the hosts on it, while browsing to the |url|. 65a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles) IPUrlMap ips; 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If a SafeBrowsing interstitial was shown for the current URL 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this will contain the UnsafeResource struct for that URL. 692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // List of redirects that lead to the first page on the current host and 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the current url respectively. These may be the same if the current url 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is the first page on its host. 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<GURL> host_redirects; 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<GURL> url_redirects; 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 77f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // URL of the referrer of this URL load. 78f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) GURL referrer; 79f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The HTTP status code from this navigation. 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int http_status_code; 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 83effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch // The page ID of the navigation. This comes from FrameNavigateParams. 84effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch int32 page_id; 85effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BrowseInfo(); 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~BrowseInfo(); 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All methods of this class must be called on the UI thread (including 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the constructor). 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BrowserFeatureExtractor { 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called when feature extraction is done. The first argument will be 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // true iff feature extraction succeeded. The second argument is the 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // phishing request which was modified by the feature extractor. The 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // DoneCallback takes ownership of the request object. 98116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch typedef base::Callback<void(bool, scoped_ptr<ClientPhishingRequest>)> 99116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch DoneCallback; 100f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)> 101f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MalwareDoneCallback; 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 103f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // The caller keeps ownership of the tab and host objects and is 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // responsible for ensuring that they stay valid for the entire 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // lifetime of this object. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BrowserFeatureExtractor(content::WebContents* tab, 107f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) ClientSideDetectionHost* host); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The destructor will cancel any pending requests. 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~BrowserFeatureExtractor(); 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begins extraction of the browser features. We take ownership 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the request object until |callback| is called (see DoneCallback above) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and will write the extracted features to the feature map. Once the 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // feature extraction is complete, |callback| is run on the UI thread. We 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // take ownership of the |callback| object. |info| may not be valid after 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ExtractFeatures returns. This method must run on the UI thread. 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void ExtractFeatures(const BrowseInfo* info, 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request, 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 122f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Begins extraction of the malware related features. We take ownership 123f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // of the request object until |callback| is called. Once feature extraction 124f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // is complete, |callback| will run on the UI thread. |info| is not expected 125f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // to stay valid after ExtractMalwareFeatures returns. All IPs stored in 126f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // |info| will be cleared by calling this function. 127f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) virtual void ExtractMalwareFeatures(BrowseInfo* info, 128f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) ClientMalwareRequest* request, 129f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const MalwareDoneCallback& callback); 1302a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Synchronous browser feature extraction. 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ExtractBrowseInfoFeatures(const BrowseInfo& info, 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Actually starts feature extraction (does the real work). 137116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch void StartExtractFeatures(scoped_ptr<ClientPhishingRequest> request, 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& callback); 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying URL visits 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the history. 142f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request, 143f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const DoneCallback& callback, 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 145f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const history::URLRow& row, 146f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const history::VisitVector& visits); 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying HTTP host 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visits in the history. 150116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch void QueryHttpHostVisitsDone(scoped_ptr<ClientPhishingRequest> request, 151116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch const DoneCallback& callback, 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_visits, 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit); 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // HistoryService callback which is called when we're done querying HTTPS host 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visits in the history. 158116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch void QueryHttpsHostVisitsDone(scoped_ptr<ClientPhishingRequest> request, 159116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch const DoneCallback& callback, 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success, 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_visits, 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit); 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which sets the host history features given the 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // number of host visits and the time of the fist host visit. Set 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |is_http_query| to true if the URL scheme is HTTP and to false if 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the scheme is HTTPS. 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void SetHostVisitsFeatures(int num_visits, 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Time first_visit, 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_http_query, 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientPhishingRequest* request); 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper function which gets the history server if possible. If the pointer 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is set it will return true and false otherwise. 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetHistoryService(HistoryService** history); 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 177f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Helper function which is called when we're done filtering out benign IPs 178f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // on the IO thread. This function is called on the UI thread. 179f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips, 180f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MalwareDoneCallback callback, 181f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) scoped_ptr<ClientMalwareRequest> request); 182f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) content::WebContents* tab_; 184f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) ClientSideDetectionHost* host_; 185f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) base::CancelableTaskTracker cancelable_task_tracker_; 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 193