// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingTermFeatureExtractor handles computing term features from the text
// of a web page for the client-side phishing detection model.  To do this, it
// takes a list of terms that appear in the model, and scans through the page
// text looking for them.  Any terms that appear will cause a corresponding
// features::kPageTerm feature to be added to the FeatureMap.
//
// To make it harder for a phisher to enumerate all of the relevant terms in
// the model, the terms are provided as SHA-256 hashes, rather than plain text.
//
// There is one PhishingTermFeatureExtractor per RenderView.

16#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
17#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
18
19#include <set>
20#include <string>
21
22#include "base/basictypes.h"
23#include "base/callback.h"
24#include "base/containers/hash_tables.h"
25#include "base/memory/scoped_ptr.h"
26#include "base/memory/weak_ptr.h"
27#include "base/strings/string16.h"
28#include "base/strings/string_piece.h"
29
30namespace safe_browsing {
31class FeatureExtractorClock;
32class FeatureMap;
33
34class PhishingTermFeatureExtractor {
35 public:
36  // Callback to be run when feature extraction finishes.  The callback
37  // argument is true if extraction was successful, false otherwise.
38  typedef base::Callback<void(bool)> DoneCallback;
39
40  // Creates a PhishingTermFeatureExtractor which will extract features for
41  // all of the terms whose SHA-256 hashes are in |page_term_hashes|.  These
42  // terms may be multi-word n-grams, with at most |max_words_per_term| words.
43  //
44  // |page_word_hashes| contains the murmur3 hashes for all of the individual
45  // words that make up the terms.  Both sets of strings are UTF-8 encoded and
46  // lowercased prior to hashing.  The caller owns both sets of strings, and
47  // must ensure that they are valid until the PhishingTermFeatureExtractor is
48  // destroyed.
49  //
50  // In addition to extracting page terms, we will also extract text shingling
51  // sketch, which consists of hashes of N-gram-words (referred to as shingles)
52  // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
53  // the maximum number of unique shingle hashes we extracted per page.
54  //
55  // |clock| is used for timing feature extractor operations, and may be mocked
56  // for testing.  The caller keeps ownership of the clock.
57  PhishingTermFeatureExtractor(
58      const base::hash_set<std::string>* page_term_hashes,
59      const base::hash_set<uint32>* page_word_hashes,
60      size_t max_words_per_term,
61      uint32 murmurhash3_seed,
62      size_t max_shingles_per_page,
63      size_t shingle_size,
64      FeatureExtractorClock* clock);
65  ~PhishingTermFeatureExtractor();
66
67  // Begins extracting features from |page_text| into the given FeatureMap.
68  // |page_text| should contain the plain text of a web page, including any
69  // subframes, as returned by RenderView::CaptureText().
70  //
71  // To avoid blocking the render thread for too long, the feature extractor
72  // may run in several chunks of work, posting a task to the current
73  // MessageLoop to continue processing.  Once feature extraction is complete,
74  // |done_callback| is run on the current thread.
75  // PhishingTermFeatureExtractor takes ownership of the callback.
76  //
77  // |page_text|, |features|, and |shingle_hashes| are owned by the caller,
78  // and must not be destroyed until either |done_callback| is run or
79  // CancelPendingExtraction() is called.
80  void ExtractFeatures(const base::string16* page_text,
81                       FeatureMap* features,
82                       std::set<uint32>* shingle_hashes,
83                       const DoneCallback& done_callback);
84
85  // Cancels any pending feature extraction.  The DoneCallback will not be run.
86  // Must be called if there is a feature extraction in progress when the page
87  // is unloaded or the PhishingTermFeatureExtractor is destroyed.
88  void CancelPendingExtraction();
89
90 private:
91  struct ExtractionState;
92
93  // The maximum amount of wall time that we will spend on a single extraction
94  // iteration before pausing to let other MessageLoop tasks run.
95  static const int kMaxTimePerChunkMs;
96
97  // The number of words that we will process before checking to see whether
98  // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
99  // slow, we don't do this on every word processed.
100  static const int kClockCheckGranularity;
101
102  // The maximum total amount of time that the feature extractor will run
103  // before giving up on the current page.
104  static const int kMaxTotalTimeMs;
105
106  // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
107  // until a predefined maximum amount of time has elapsed, then posts a task
108  // to the current MessageLoop to continue extraction.  When extraction
109  // finishes, calls RunCallback().
110  void ExtractFeaturesWithTimeout();
111
112  // Handles a single word in the page text.
113  void HandleWord(const base::StringPiece16& word);
114
115  // Helper to verify that there is no pending feature extraction.  Dies in
116  // debug builds if the state is not as expected.  This is a no-op in release
117  // builds.
118  void CheckNoPendingExtraction();
119
120  // Runs |done_callback_| and then clears all internal state.
121  void RunCallback(bool success);
122
123  // Clears all internal feature extraction state.
124  void Clear();
125
126  // All of the term hashes that we are looking for in the page.
127  const base::hash_set<std::string>* page_term_hashes_;
128
129  // Murmur3 hashes of all the individual words in page_term_hashes_.  If
130  // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
131  // would contain (hashed) "one" and "two".  We do this so that we can have a
132  // quick out in the common case that the current word we are processing
133  // doesn't contain any part of one of our terms.
134  const base::hash_set<uint32>* page_word_hashes_;
135
136  // The maximum number of words in an n-gram.
137  const size_t max_words_per_term_;
138
139  // The seed for murmurhash3.
140  const uint32 murmurhash3_seed_;
141
142  // The maximum number of unique shingle hashes we extract in a page.
143  const size_t max_shingles_per_page_;
144
145  // The number of words in a shingle.
146  const size_t shingle_size_;
147
148  // Non-owned pointer to our clock.
149  FeatureExtractorClock* clock_;
150
151  // The output parameters from the most recent call to ExtractFeatures().
152  const base::string16* page_text_;  // The caller keeps ownership of this.
153  FeatureMap* features_;  // The caller keeps ownership of this.
154  std::set<uint32>* shingle_hashes_;
155  DoneCallback done_callback_;
156
157  // Stores the current state of term extraction from |page_text_|.
158  scoped_ptr<ExtractionState> state_;
159
160  // Used in scheduling ExtractFeaturesWithTimeout tasks.
161  // These pointers are invalidated if extraction is cancelled.
162  base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
163
164  DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
165};
166
167}  // namespace safe_browsing
168
169#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
170