browser_feature_extractor.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// BrowserFeatureExtractor computes various browser features for client-side
// phishing detection.  For now it does a bunch of lookups in the history
// service to see whether a particular URL has been visited before by the
// user.
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map>
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <utility>
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/sequenced_task_runner_helpers.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/time.h"
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/common/cancelable_request.h"
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/history_types.h"
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_service.h"
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "googleurl/src/gurl.h"
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class HistoryService;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class WebContents;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientPhishingRequest;
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ClientSideDetectionService;
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct BrowseInfo {
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // List of IPv4 and IPv6 addresses from which content was requested
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // while browsing to the |url|.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::set<std::string> ips;
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If a SafeBrowsing interstitial was shown for the current URL
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this will contain the UnsafeResource struct for that URL.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<SafeBrowsingService::UnsafeResource> unsafe_resource;
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // List of redirects that lead to the first page on the current host and
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the current url respectively. These may be the same if the current url
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is the first page on its host.
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<GURL> host_redirects;
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<GURL> url_redirects;
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The HTTP status code from this navigation.
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int http_status_code;
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BrowseInfo();
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~BrowseInfo();
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All methods of this class must be called on the UI thread (including
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the constructor).
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BrowserFeatureExtractor {
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called when feature extraction is done.  The first argument will be
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // true iff feature extraction succeeded.  The second argument is the
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // phishing request which was modified by the feature extractor.  The
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // DoneCallback takes ownership of the request object.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The caller keeps ownership of the tab and service objects and is
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // responsible for ensuring that they stay valid for the entire
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // lifetime of this object.
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BrowserFeatureExtractor(content::WebContents* tab,
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          ClientSideDetectionService* service);
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The destructor will cancel any pending requests.
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~BrowserFeatureExtractor();
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Begins extraction of the browser features.  We take ownership
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of the request object until |callback| is called (see DoneCallback above)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and will write the extracted features to the feature map.  Once the
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // feature extraction is complete, |callback| is run on the UI thread.  We
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // take ownership of the |callback| object.  |info| may not be valid after
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ExtractFeatures returns.  This method must run on the UI thread.
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void ExtractFeatures(const BrowseInfo* info,
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               ClientPhishingRequest* request,
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               const DoneCallback& callback);
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class base::DeleteHelper<BrowserFeatureExtractor>;
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef std::map<CancelableRequestProvider::Handle,
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   ExtractionData> PendingQueriesMap;
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Synchronous browser feature extraction.
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ExtractBrowseInfoFeatures(const BrowseInfo& info,
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 ClientPhishingRequest* request);
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Actually starts feature extraction (does the real work).
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void StartExtractFeatures(ClientPhishingRequest* request,
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            const DoneCallback& callback);
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying URL visits
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in the history.
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle,
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           bool success,
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           const history::URLRow* row,
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           history::VisitVector* visits);
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying HTTP host
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visits in the history.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               bool success,
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               int num_visits,
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               base::Time first_visit);
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying HTTPS host
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visits in the history.
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                bool success,
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                int num_visits,
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                base::Time first_visit);
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which sets the host history features given the
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // number of host visits and the time of the fist host visit.  Set
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |is_http_query| to true if the URL scheme is HTTP and to false if
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the scheme is HTTPS.
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void SetHostVisitsFeatures(int num_visits,
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             base::Time first_visit,
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             bool is_http_query,
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             ClientPhishingRequest* request);
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which stores the request and callback while the history
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // query is being processed.
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void StorePendingQuery(CancelableRequestProvider::Handle handle,
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         ClientPhishingRequest* request,
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         const DoneCallback& callback);
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which is the counterpart of StorePendingQuery.  If there
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is a pending query for the given handle it will return false and set both
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the request and cb pointers.  Otherwise, it will return false.
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetPendingQuery(CancelableRequestProvider::Handle handle,
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       ClientPhishingRequest** request,
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       DoneCallback* callback);
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which gets the history server if possible.  If the pointer
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is set it will return true and false otherwise.
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetHistoryService(HistoryService** history);
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  content::WebContents* tab_;
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ClientSideDetectionService* service_;
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CancelableRequestConsumer request_consumer_;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set of pending extractions (i.e. extractions for which ExtractFeatures was
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // called but not StartExtractFeatures).
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set of pending queries (i.e., where history->Query...() was called but
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the history callback hasn't been invoked yet).
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PendingQueriesMap pending_queries_;
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
169