browser_feature_extractor.h revision 7d4cd473f85ac64c3747c96c277f9e506a0d2246
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// BrowserFeatureExtractor computes various browser features for client-side
// phishing detection.  For now it does a bunch of lookups in the history
// service to see whether a particular URL has been visited before by the
// user.
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map>
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <utility>
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h"
217d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/sequenced_task_runner_helpers.h"
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/time.h"
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/common/cancelable_request.h"
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/history_types.h"
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_service.h"
282a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "chrome/browser/safe_browsing/ui_manager.h"
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "googleurl/src/gurl.h"
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Forward declarations: this header only refers to these types through
// pointers, so their full definitions are not needed here.
class HistoryService;

namespace content {
class WebContents;
}  // namespace content
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
// Request and service types that this header only names in pointer
// parameters, callback signatures and typedefs.
class ClientMalwareRequest;
class ClientPhishingRequest;
class ClientSideDetectionService;

// String-keyed map whose values are sets of strings.  BrowseInfo::ips uses it
// to record IP addresses together with the hosts seen on them.
typedef std::map<std::string, std::set<std::string> > IPHostMap;
432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct BrowseInfo {
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // List of IPv4 and IPv6 addresses from which content was requested
462a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // together with the hosts on it, while browsing to the |url|.
472a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  IPHostMap ips;
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If a SafeBrowsing interstitial was shown for the current URL
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this will contain the UnsafeResource struct for that URL.
512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // List of redirects that lead to the first page on the current host and
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the current url respectively. These may be the same if the current url
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is the first page on its host.
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<GURL> host_redirects;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<GURL> url_redirects;
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The HTTP status code from this navigation.
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int http_status_code;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BrowseInfo();
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~BrowseInfo();
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All methods of this class must be called on the UI thread (including
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the constructor).
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BrowserFeatureExtractor {
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called when feature extraction is done.  The first argument will be
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // true iff feature extraction succeeded.  The second argument is the
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // phishing request which was modified by the feature extractor.  The
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // DoneCallback takes ownership of the request object.
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  typedef base::Callback<void(bool, ClientMalwareRequest*)> MalwareDoneCallback;
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The caller keeps ownership of the tab and service objects and is
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // responsible for ensuring that they stay valid for the entire
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // lifetime of this object.
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BrowserFeatureExtractor(content::WebContents* tab,
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          ClientSideDetectionService* service);
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The destructor will cancel any pending requests.
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~BrowserFeatureExtractor();
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Begins extraction of the browser features.  We take ownership
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of the request object until |callback| is called (see DoneCallback above)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and will write the extracted features to the feature map.  Once the
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // feature extraction is complete, |callback| is run on the UI thread.  We
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // take ownership of the |callback| object.  |info| may not be valid after
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ExtractFeatures returns.  This method must run on the UI thread.
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void ExtractFeatures(const BrowseInfo* info,
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               ClientPhishingRequest* request,
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               const DoneCallback& callback);
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
962a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // Extract the malware related features. The request object is owned by the
972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // caller.
982a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  virtual void ExtractMalwareFeatures(const BrowseInfo* info,
992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)                                      ClientMalwareRequest* request);
1002a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class base::DeleteHelper<BrowserFeatureExtractor>;
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef std::map<CancelableRequestProvider::Handle,
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   ExtractionData> PendingQueriesMap;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Synchronous browser feature extraction.
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ExtractBrowseInfoFeatures(const BrowseInfo& info,
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 ClientPhishingRequest* request);
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Actually starts feature extraction (does the real work).
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void StartExtractFeatures(ClientPhishingRequest* request,
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            const DoneCallback& callback);
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying URL visits
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in the history.
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle,
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           bool success,
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           const history::URLRow* row,
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           history::VisitVector* visits);
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying HTTP host
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visits in the history.
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               bool success,
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               int num_visits,
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               base::Time first_visit);
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying HTTPS host
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visits in the history.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                bool success,
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                int num_visits,
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                base::Time first_visit);
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which sets the host history features given the
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // number of host visits and the time of the fist host visit.  Set
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |is_http_query| to true if the URL scheme is HTTP and to false if
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the scheme is HTTPS.
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void SetHostVisitsFeatures(int num_visits,
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             base::Time first_visit,
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             bool is_http_query,
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             ClientPhishingRequest* request);
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which stores the request and callback while the history
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // query is being processed.
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void StorePendingQuery(CancelableRequestProvider::Handle handle,
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         ClientPhishingRequest* request,
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         const DoneCallback& callback);
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which is the counterpart of StorePendingQuery.  If there
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is a pending query for the given handle it will return false and set both
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the request and cb pointers.  Otherwise, it will return false.
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetPendingQuery(CancelableRequestProvider::Handle handle,
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       ClientPhishingRequest** request,
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       DoneCallback* callback);
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which gets the history server if possible.  If the pointer
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is set it will return true and false otherwise.
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetHistoryService(HistoryService** history);
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  content::WebContents* tab_;
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ClientSideDetectionService* service_;
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CancelableRequestConsumer request_consumer_;
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set of pending extractions (i.e. extractions for which ExtractFeatures was
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // called but not StartExtractFeatures).
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set of pending queries (i.e., where history->Query...() was called but
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the history callback hasn't been invoked yet).
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PendingQueriesMap pending_queries_;
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
180