1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7#include <string>
8
9#include "base/bind.h"
10#include "base/callback.h"
11#include "base/containers/hash_tables.h"
12#include "base/memory/scoped_ptr.h"
13#include "base/message_loop/message_loop.h"
14#include "base/strings/string16.h"
15#include "base/strings/stringprintf.h"
16#include "base/strings/utf_string_conversions.h"
17#include "base/time/time.h"
18#include "chrome/renderer/safe_browsing/features.h"
19#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21#include "chrome/renderer/safe_browsing/test_utils.h"
22#include "crypto/sha2.h"
23#include "testing/gmock/include/gmock/gmock.h"
24#include "testing/gtest/include/gtest/gtest.h"
25
26using base::ASCIIToUTF16;
27using ::testing::Return;
28
29
30static const uint32 kMurmurHash3Seed = 2777808611U;
31
32namespace safe_browsing {
33
34class PhishingTermFeatureExtractorTest : public ::testing::Test {
35 protected:
36  virtual void SetUp() {
37    base::hash_set<std::string> terms;
38    terms.insert("one");
39    terms.insert("one one");
40    terms.insert("two");
41    terms.insert("multi word test");
42    terms.insert("capitalization");
43    terms.insert("space");
44    terms.insert("separator");
45    terms.insert("punctuation");
46    // Chinese (translation of "hello")
47    terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
48    // Chinese (translation of "goodbye")
49    terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
50
51    for (base::hash_set<std::string>::iterator it = terms.begin();
52         it != terms.end(); ++it) {
53      term_hashes_.insert(crypto::SHA256HashString(*it));
54    }
55
56    base::hash_set<std::string> words;
57    words.insert("one");
58    words.insert("two");
59    words.insert("multi");
60    words.insert("word");
61    words.insert("test");
62    words.insert("capitalization");
63    words.insert("space");
64    words.insert("separator");
65    words.insert("punctuation");
66    words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
67    words.insert("\xe5\x86\x8d\xe8\xa7\x81");
68
69    for (base::hash_set<std::string>::iterator it = words.begin();
70         it != words.end(); ++it) {
71      word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
72    }
73
74    ResetExtractor(3 /* max shingles per page */);
75  }
76
77  void ResetExtractor(size_t max_shingles_per_page) {
78    extractor_.reset(new PhishingTermFeatureExtractor(
79        &term_hashes_,
80        &word_hashes_,
81        3 /* max_words_per_term */,
82        kMurmurHash3Seed,
83        max_shingles_per_page,
84        4 /* shingle_size */,
85        &clock_));
86  }
87
88  // Runs the TermFeatureExtractor on |page_text|, waiting for the
89  // completion callback.  Returns the success boolean from the callback.
90  bool ExtractFeatures(const base::string16* page_text,
91                       FeatureMap* features,
92                       std::set<uint32>* shingle_hashes) {
93    success_ = false;
94    extractor_->ExtractFeatures(
95        page_text,
96        features,
97        shingle_hashes,
98        base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
99                   base::Unretained(this)));
100    msg_loop_.Run();
101    return success_;
102  }
103
104  void PartialExtractFeatures(const base::string16* page_text,
105                              FeatureMap* features,
106                              std::set<uint32>* shingle_hashes) {
107    extractor_->ExtractFeatures(
108        page_text,
109        features,
110        shingle_hashes,
111        base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
112                   base::Unretained(this)));
113    msg_loop_.PostTask(
114        FROM_HERE,
115        base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
116                   base::Unretained(this)));
117    msg_loop_.RunUntilIdle();
118  }
119
120  // Completion callback for feature extraction.
121  void ExtractionDone(bool success) {
122    success_ = success;
123    msg_loop_.Quit();
124  }
125
126  void QuitExtraction() {
127    extractor_->CancelPendingExtraction();
128    msg_loop_.Quit();
129  }
130
131  base::MessageLoop msg_loop_;
132  MockFeatureExtractorClock clock_;
133  scoped_ptr<PhishingTermFeatureExtractor> extractor_;
134  base::hash_set<std::string> term_hashes_;
135  base::hash_set<uint32> word_hashes_;
136  bool success_;  // holds the success value from ExtractFeatures
137};
138
139TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
140  // This test doesn't exercise the extraction timing.
141  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
142
143  base::string16 page_text = ASCIIToUTF16("blah");
144  FeatureMap expected_features;  // initially empty
145  std::set<uint32> expected_shingle_hashes;
146
147  FeatureMap features;
148  std::set<uint32> shingle_hashes;
149  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
150  ExpectFeatureMapsAreEqual(features, expected_features);
151  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
152
153  page_text = ASCIIToUTF16("one one");
154  expected_features.Clear();
155  expected_features.AddBooleanFeature(features::kPageTerm +
156                                      std::string("one"));
157  expected_features.AddBooleanFeature(features::kPageTerm +
158                                      std::string("one one"));
159  expected_shingle_hashes.clear();
160
161  features.Clear();
162  shingle_hashes.clear();
163  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
164  ExpectFeatureMapsAreEqual(features, expected_features);
165  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
166
167  page_text = ASCIIToUTF16("bla bla multi word test bla");
168  expected_features.Clear();
169  expected_features.AddBooleanFeature(features::kPageTerm +
170                                      std::string("multi word test"));
171  expected_shingle_hashes.clear();
172  expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
173                                                   kMurmurHash3Seed));
174  expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
175                                                   kMurmurHash3Seed));
176  expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
177                                                   kMurmurHash3Seed));
178
179  features.Clear();
180  shingle_hashes.clear();
181  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
182  ExpectFeatureMapsAreEqual(features, expected_features);
183  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
184
185  // This text has all of the words for one of the terms, but they are
186  // not in the correct order.
187  page_text = ASCIIToUTF16("bla bla test word multi bla");
188  expected_features.Clear();
189  expected_shingle_hashes.clear();
190  expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
191                                                   kMurmurHash3Seed));
192  expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
193                                                   kMurmurHash3Seed));
194  expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
195                                                   kMurmurHash3Seed));
196
197  features.Clear();
198  shingle_hashes.clear();
199  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
200  ExpectFeatureMapsAreEqual(features, expected_features);
201  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
202
203  // Test various separators.
204  page_text = ASCIIToUTF16("Capitalization plus non-space\n"
205                           "separator... punctuation!");
206  expected_features.Clear();
207  expected_features.AddBooleanFeature(features::kPageTerm +
208                                      std::string("capitalization"));
209  expected_features.AddBooleanFeature(features::kPageTerm +
210                                      std::string("space"));
211  expected_features.AddBooleanFeature(features::kPageTerm +
212                                      std::string("separator"));
213  expected_features.AddBooleanFeature(features::kPageTerm +
214                                      std::string("punctuation"));
215  expected_shingle_hashes.clear();
216  expected_shingle_hashes.insert(
217      MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
218  expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
219                                                   kMurmurHash3Seed));
220  expected_shingle_hashes.insert(
221      MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
222
223  features.Clear();
224  shingle_hashes.clear();
225  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
226  ExpectFeatureMapsAreEqual(features, expected_features);
227  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
228
229  // Test a page with too many words and we should only 3 minimum hashes.
230  page_text = ASCIIToUTF16("This page has way too many words.");
231  expected_features.Clear();
232  expected_shingle_hashes.clear();
233  expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
234                                                   kMurmurHash3Seed));
235  expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
236                                                   kMurmurHash3Seed));
237  expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
238                                                   kMurmurHash3Seed));
239  expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
240                                                   kMurmurHash3Seed));
241  std::set<uint32>::iterator it = expected_shingle_hashes.end();
242  expected_shingle_hashes.erase(--it);
243
244  features.Clear();
245  shingle_hashes.clear();
246  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
247  ExpectFeatureMapsAreEqual(features, expected_features);
248  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
249
250  // Test with empty page text.
251  page_text = base::string16();
252  expected_features.Clear();
253  expected_shingle_hashes.clear();
254  features.Clear();
255  shingle_hashes.clear();
256  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
257  ExpectFeatureMapsAreEqual(features, expected_features);
258  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
259
260#if !defined(OS_ANDROID)
261  // The test code is disabled due to http://crbug.com/392234
262  // The client-side detection feature is not enabled on Android yet.
263  // If we decided to enable the feature, we need to fix the bug first.
264
265  // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
266  // that we can correctly separate terms in languages that don't use spaces.
267  page_text =
268      base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
269                        "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
270  expected_features.Clear();
271  expected_features.AddBooleanFeature(
272      features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
273  expected_features.AddBooleanFeature(
274      features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
275  expected_shingle_hashes.clear();
276  expected_shingle_hashes.insert(MurmurHash3String(
277      "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
278      "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
279
280  features.Clear();
281  shingle_hashes.clear();
282  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
283  ExpectFeatureMapsAreEqual(features, expected_features);
284  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
285#endif
286}
287
288TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
289  // For this test, we'll cause the feature extraction to run multiple
290  // iterations by incrementing the clock.
291  ResetExtractor(200 /* max shingles per page */);
292
293  // This page has a total of 30 words.  For the features to be computed
294  // correctly, the extractor has to process the entire string of text.
295  base::string16 page_text(ASCIIToUTF16("one "));
296  for (int i = 0; i < 28; ++i) {
297    page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
298  }
299  page_text.append(ASCIIToUTF16("two"));
300
301  // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
302  // Note that this assumes kClockCheckGranularity = 5 and
303  // kMaxTimePerChunkMs = 10.
304  base::TimeTicks now = base::TimeTicks::Now();
305  EXPECT_CALL(clock_, Now())
306      // Time check at the start of extraction.
307      .WillOnce(Return(now))
308      // Time check at the start of the first chunk of work.
309      .WillOnce(Return(now))
310      // Time check after the first 5 words.
311      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
312      // Time check after the next 5 words.
313      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
314      // Time check after the next 5 words.
315      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
316      // Time check after the next 5 words.  This is over the chunk
317      // time limit, so a continuation task will be posted.
318      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
319      // Time check at the start of the second chunk of work.
320      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
321      // Time check after the next 5 words.
322      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
323      // Time check after the next 5 words.
324      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
325      // A final check for the histograms.
326      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
327
328  FeatureMap expected_features;
329  expected_features.AddBooleanFeature(features::kPageTerm +
330                                      std::string("one"));
331  expected_features.AddBooleanFeature(features::kPageTerm +
332                                      std::string("two"));
333  std::set<uint32> expected_shingle_hashes;
334  expected_shingle_hashes.insert(
335      MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
336  expected_shingle_hashes.insert(
337      MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
338  expected_shingle_hashes.insert(
339      MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
340  expected_shingle_hashes.insert(
341      MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
342  expected_shingle_hashes.insert(
343      MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
344  expected_shingle_hashes.insert(
345      MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
346  expected_shingle_hashes.insert(
347      MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
348  expected_shingle_hashes.insert(
349      MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
350  expected_shingle_hashes.insert(
351      MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
352  expected_shingle_hashes.insert(
353      MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
354  expected_shingle_hashes.insert(
355      MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
356  expected_shingle_hashes.insert(
357      MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
358  expected_shingle_hashes.insert(
359      MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
360  expected_shingle_hashes.insert(
361      MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
362  expected_shingle_hashes.insert(
363      MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
364  expected_shingle_hashes.insert(
365      MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
366  expected_shingle_hashes.insert(
367      MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
368  expected_shingle_hashes.insert(
369      MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
370  expected_shingle_hashes.insert(
371      MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
372  expected_shingle_hashes.insert(
373      MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
374  expected_shingle_hashes.insert(
375      MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
376  expected_shingle_hashes.insert(
377      MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
378  expected_shingle_hashes.insert(
379      MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
380  expected_shingle_hashes.insert(
381      MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
382  expected_shingle_hashes.insert(
383      MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
384  expected_shingle_hashes.insert(
385      MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
386  expected_shingle_hashes.insert(
387      MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
388
389  FeatureMap features;
390  std::set<uint32> shingle_hashes;
391  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
392  ExpectFeatureMapsAreEqual(features, expected_features);
393  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
394  // Make sure none of the mock expectations carry over to the next test.
395  ::testing::Mock::VerifyAndClearExpectations(&clock_);
396
397  // Now repeat the test with the same text, but advance the clock faster so
398  // that the extraction time exceeds the maximum total time for the feature
399  // extractor.  Extraction should fail.  Note that this assumes
400  // kMaxTotalTimeMs = 500.
401  EXPECT_CALL(clock_, Now())
402      // Time check at the start of extraction.
403      .WillOnce(Return(now))
404      // Time check at the start of the first chunk of work.
405      .WillOnce(Return(now))
406      // Time check after the first 5 words,
407      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
408      // Time check at the start of the second chunk of work.
409      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
410      // Time check after the next 5 words.  This is over the limit.
411      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
412      // A final time check for the histograms.
413      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
414
415  features.Clear();
416  shingle_hashes.clear();
417  EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
418}
419
420TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
421  scoped_ptr<base::string16> page_text(
422      new base::string16(ASCIIToUTF16("one ")));
423  for (int i = 0; i < 28; ++i) {
424    page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
425  }
426
427  base::TimeTicks now = base::TimeTicks::Now();
428  EXPECT_CALL(clock_, Now())
429      // Time check at the start of extraction.
430      .WillOnce(Return(now))
431      // Time check at the start of the first chunk of work.
432      .WillOnce(Return(now))
433      // Time check after the first 5 words.
434      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
435      // Time check after the next 5 words. This should be greater than
436      // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
437      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
438
439  FeatureMap features;
440  std::set<uint32> shingle_hashes;
441  // Extract first 10 words then stop.
442  PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
443
444  page_text.reset(new base::string16());
445  for (int i = 30; i < 58; ++i) {
446    page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
447  }
448  page_text->append(ASCIIToUTF16("multi word test "));
449  features.Clear();
450  shingle_hashes.clear();
451
452  // This part doesn't exercise the extraction timing.
453  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
454
455  // Now extract normally and make sure nothing breaks.
456  EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
457
458  FeatureMap expected_features;
459  expected_features.AddBooleanFeature(features::kPageTerm +
460                                      std::string("multi word test"));
461  ExpectFeatureMapsAreEqual(features, expected_features);
462}
463
464}  // namespace safe_browsing
465