// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// BrowserFeatureExtractor computes various browser features for client-side
// phishing detection.  For now it does a bunch of lookups in the history
// service to see whether a particular URL has been visited before by the
// user.
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map>
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <utility>
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h"
217d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
23f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "base/task/cancelable_task_tracker.h"
24eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "base/time/time.h"
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_service.h"
262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "chrome/browser/safe_browsing/ui_manager.h"
271320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "components/history/core/browser/history_types.h"
28116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch#include "content/public/common/resource_type.h"
29eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "url/gurl.h"
30a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Forward declarations: this header only refers to these two types through
// pointers, so their full definitions are not needed here.
class HistoryService;

namespace content {
class WebContents;
}  // namespace content
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
// Forward declarations of types that this header only names as pointers or
// as scoped_ptr<> template arguments.
class ClientMalwareRequest;
class ClientPhishingRequest;
class ClientSideDetectionHost;
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
43a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)struct IPUrlInfo {
44a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // The url on the bad IP address.
45a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  std::string url;
46a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  std::string method;
47a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  std::string referrer;
485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  content::ResourceType resource_type;
49a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
50a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  IPUrlInfo(const std::string& url,
51a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)            const std::string& method,
52a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)            const std::string& referrer,
535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            const content::ResourceType& resource_type);
54a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  ~IPUrlInfo();
55a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)};
56a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
57a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap;
582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct BrowseInfo {
60effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch  // The URL we're currently browsing.
61effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch  GURL url;
62effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // List of IPv4 and IPv6 addresses from which content was requested
642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // together with the hosts on it, while browsing to the |url|.
65a36e5920737c6adbddd3e43b760e5de8431db6e0Torne (Richard Coles)  IPUrlMap ips;
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If a SafeBrowsing interstitial was shown for the current URL
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this will contain the UnsafeResource struct for that URL.
692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // List of redirects that lead to the first page on the current host and
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the current url respectively. These may be the same if the current url
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is the first page on its host.
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<GURL> host_redirects;
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<GURL> url_redirects;
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
77f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // URL of the referrer of this URL load.
78f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  GURL referrer;
79f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The HTTP status code from this navigation.
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int http_status_code;
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
83effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch  // The page ID of the navigation.  This comes from FrameNavigateParams.
84effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch  int32 page_id;
85effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BrowseInfo();
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~BrowseInfo();
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All methods of this class must be called on the UI thread (including
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the constructor).
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BrowserFeatureExtractor {
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called when feature extraction is done.  The first argument will be
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // true iff feature extraction succeeded.  The second argument is the
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // phishing request which was modified by the feature extractor.  The
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // DoneCallback takes ownership of the request object.
98116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  typedef base::Callback<void(bool, scoped_ptr<ClientPhishingRequest>)>
99116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      DoneCallback;
100f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)>
101f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      MalwareDoneCallback;
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
103f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // The caller keeps ownership of the tab and host objects and is
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // responsible for ensuring that they stay valid for the entire
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // lifetime of this object.
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BrowserFeatureExtractor(content::WebContents* tab,
107f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                          ClientSideDetectionHost* host);
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The destructor will cancel any pending requests.
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~BrowserFeatureExtractor();
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Begins extraction of the browser features.  We take ownership
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of the request object until |callback| is called (see DoneCallback above)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and will write the extracted features to the feature map.  Once the
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // feature extraction is complete, |callback| is run on the UI thread.  We
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // take ownership of the |callback| object.  |info| may not be valid after
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ExtractFeatures returns.  This method must run on the UI thread.
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void ExtractFeatures(const BrowseInfo* info,
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               ClientPhishingRequest* request,
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               const DoneCallback& callback);
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
122f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Begins extraction of the malware related features.  We take ownership
123f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // of the request object until |callback| is called.  Once feature extraction
124f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // is complete, |callback| will run on the UI thread.  |info| is not expected
125f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // to stay valid after ExtractMalwareFeatures returns.  All IPs stored in
126f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // |info| will be cleared by calling this function.
127f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  virtual void ExtractMalwareFeatures(BrowseInfo* info,
128f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                      ClientMalwareRequest* request,
129f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                      const MalwareDoneCallback& callback);
1302a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Synchronous browser feature extraction.
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ExtractBrowseInfoFeatures(const BrowseInfo& info,
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 ClientPhishingRequest* request);
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Actually starts feature extraction (does the real work).
137116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  void StartExtractFeatures(scoped_ptr<ClientPhishingRequest> request,
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            const DoneCallback& callback);
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying URL visits
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in the history.
142f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request,
143f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                           const DoneCallback& callback,
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           bool success,
145f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                           const history::URLRow& row,
146f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                           const history::VisitVector& visits);
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying HTTP host
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visits in the history.
150116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  void QueryHttpHostVisitsDone(scoped_ptr<ClientPhishingRequest> request,
151116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                               const DoneCallback& callback,
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               bool success,
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               int num_visits,
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               base::Time first_visit);
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // HistoryService callback which is called when we're done querying HTTPS host
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visits in the history.
158116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  void QueryHttpsHostVisitsDone(scoped_ptr<ClientPhishingRequest> request,
159116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                                const DoneCallback& callback,
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                bool success,
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                int num_visits,
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                base::Time first_visit);
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which sets the host history features given the
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // number of host visits and the time of the fist host visit.  Set
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |is_http_query| to true if the URL scheme is HTTP and to false if
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the scheme is HTTPS.
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void SetHostVisitsFeatures(int num_visits,
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             base::Time first_visit,
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             bool is_http_query,
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             ClientPhishingRequest* request);
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper function which gets the history server if possible.  If the pointer
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is set it will return true and false otherwise.
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetHistoryService(HistoryService** history);
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
177f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Helper function which is called when we're done filtering out benign IPs
178f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // on the IO thread.  This function is called on the UI thread.
179f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips,
180f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                    MalwareDoneCallback callback,
181f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                    scoped_ptr<ClientMalwareRequest> request);
182f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  content::WebContents* tab_;
184f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  ClientSideDetectionHost* host_;
185f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  base::CancelableTaskTracker cancelable_task_tracker_;
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
193