1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 6 7#include <string> 8 9#include "base/bind.h" 10#include "base/callback.h" 11#include "base/containers/hash_tables.h" 12#include "base/memory/scoped_ptr.h" 13#include "base/message_loop/message_loop.h" 14#include "base/strings/string16.h" 15#include "base/strings/stringprintf.h" 16#include "base/strings/utf_string_conversions.h" 17#include "base/time/time.h" 18#include "chrome/renderer/safe_browsing/features.h" 19#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" 20#include "chrome/renderer/safe_browsing/murmurhash3_util.h" 21#include "chrome/renderer/safe_browsing/test_utils.h" 22#include "crypto/sha2.h" 23#include "testing/gmock/include/gmock/gmock.h" 24#include "testing/gtest/include/gtest/gtest.h" 25 26using base::ASCIIToUTF16; 27using ::testing::Return; 28 29 30static const uint32 kMurmurHash3Seed = 2777808611U; 31 32namespace safe_browsing { 33 34class PhishingTermFeatureExtractorTest : public ::testing::Test { 35 protected: 36 virtual void SetUp() { 37 base::hash_set<std::string> terms; 38 terms.insert("one"); 39 terms.insert("one one"); 40 terms.insert("two"); 41 terms.insert("multi word test"); 42 terms.insert("capitalization"); 43 terms.insert("space"); 44 terms.insert("separator"); 45 terms.insert("punctuation"); 46 // Chinese (translation of "hello") 47 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); 48 // Chinese (translation of "goodbye") 49 terms.insert("\xe5\x86\x8d\xe8\xa7\x81"); 50 51 for (base::hash_set<std::string>::iterator it = terms.begin(); 52 it != terms.end(); ++it) { 53 term_hashes_.insert(crypto::SHA256HashString(*it)); 54 } 55 56 base::hash_set<std::string> words; 57 words.insert("one"); 58 words.insert("two"); 59 words.insert("multi"); 60 words.insert("word"); 61 words.insert("test"); 62 words.insert("capitalization"); 63 words.insert("space"); 64 words.insert("separator"); 65 words.insert("punctuation"); 66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); 67 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); 68 69 for (base::hash_set<std::string>::iterator it = words.begin(); 70 it != words.end(); ++it) { 71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); 72 } 73 74 ResetExtractor(3 /* max shingles per page */); 75 } 76 77 void ResetExtractor(size_t max_shingles_per_page) { 78 extractor_.reset(new PhishingTermFeatureExtractor( 79 &term_hashes_, 80 &word_hashes_, 81 3 /* max_words_per_term */, 82 kMurmurHash3Seed, 83 max_shingles_per_page, 84 4 /* shingle_size */, 85 &clock_)); 86 } 87 88 // Runs the TermFeatureExtractor on |page_text|, waiting for the 89 // completion callback. Returns the success boolean from the callback. 90 bool ExtractFeatures(const base::string16* page_text, 91 FeatureMap* features, 92 std::set<uint32>* shingle_hashes) { 93 success_ = false; 94 extractor_->ExtractFeatures( 95 page_text, 96 features, 97 shingle_hashes, 98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, 99 base::Unretained(this))); 100 msg_loop_.Run(); 101 return success_; 102 } 103 104 void PartialExtractFeatures(const base::string16* page_text, 105 FeatureMap* features, 106 std::set<uint32>* shingle_hashes) { 107 extractor_->ExtractFeatures( 108 page_text, 109 features, 110 shingle_hashes, 111 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, 112 base::Unretained(this))); 113 msg_loop_.PostTask( 114 FROM_HERE, 115 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, 116 base::Unretained(this))); 117 msg_loop_.RunUntilIdle(); 118 } 119 120 // Completion callback for feature extraction. 121 void ExtractionDone(bool success) { 122 success_ = success; 123 msg_loop_.Quit(); 124 } 125 126 void QuitExtraction() { 127 extractor_->CancelPendingExtraction(); 128 msg_loop_.Quit(); 129 } 130 131 base::MessageLoop msg_loop_; 132 MockFeatureExtractorClock clock_; 133 scoped_ptr<PhishingTermFeatureExtractor> extractor_; 134 base::hash_set<std::string> term_hashes_; 135 base::hash_set<uint32> word_hashes_; 136 bool success_; // holds the success value from ExtractFeatures 137}; 138 139TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { 140 // This test doesn't exercise the extraction timing. 141 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 142 143 base::string16 page_text = ASCIIToUTF16("blah"); 144 FeatureMap expected_features; // initially empty 145 std::set<uint32> expected_shingle_hashes; 146 147 FeatureMap features; 148 std::set<uint32> shingle_hashes; 149 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 150 ExpectFeatureMapsAreEqual(features, expected_features); 151 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 152 153 page_text = ASCIIToUTF16("one one"); 154 expected_features.Clear(); 155 expected_features.AddBooleanFeature(features::kPageTerm + 156 std::string("one")); 157 expected_features.AddBooleanFeature(features::kPageTerm + 158 std::string("one one")); 159 expected_shingle_hashes.clear(); 160 161 features.Clear(); 162 shingle_hashes.clear(); 163 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 164 ExpectFeatureMapsAreEqual(features, expected_features); 165 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 166 167 page_text = ASCIIToUTF16("bla bla multi word test bla"); 168 expected_features.Clear(); 169 expected_features.AddBooleanFeature(features::kPageTerm + 170 std::string("multi word test")); 171 expected_shingle_hashes.clear(); 172 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", 173 kMurmurHash3Seed)); 174 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", 175 kMurmurHash3Seed)); 176 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", 177 kMurmurHash3Seed)); 178 179 features.Clear(); 180 shingle_hashes.clear(); 181 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 182 ExpectFeatureMapsAreEqual(features, expected_features); 183 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 184 185 // This text has all of the words for one of the terms, but they are 186 // not in the correct order. 187 page_text = ASCIIToUTF16("bla bla test word multi bla"); 188 expected_features.Clear(); 189 expected_shingle_hashes.clear(); 190 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", 191 kMurmurHash3Seed)); 192 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", 193 kMurmurHash3Seed)); 194 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", 195 kMurmurHash3Seed)); 196 197 features.Clear(); 198 shingle_hashes.clear(); 199 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 200 ExpectFeatureMapsAreEqual(features, expected_features); 201 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 202 203 // Test various separators. 204 page_text = ASCIIToUTF16("Capitalization plus non-space\n" 205 "separator... punctuation!"); 206 expected_features.Clear(); 207 expected_features.AddBooleanFeature(features::kPageTerm + 208 std::string("capitalization")); 209 expected_features.AddBooleanFeature(features::kPageTerm + 210 std::string("space")); 211 expected_features.AddBooleanFeature(features::kPageTerm + 212 std::string("separator")); 213 expected_features.AddBooleanFeature(features::kPageTerm + 214 std::string("punctuation")); 215 expected_shingle_hashes.clear(); 216 expected_shingle_hashes.insert( 217 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); 218 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", 219 kMurmurHash3Seed)); 220 expected_shingle_hashes.insert( 221 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); 222 223 features.Clear(); 224 shingle_hashes.clear(); 225 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 226 ExpectFeatureMapsAreEqual(features, expected_features); 227 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 228 229 // Test a page with too many words and we should only 3 minimum hashes. 230 page_text = ASCIIToUTF16("This page has way too many words."); 231 expected_features.Clear(); 232 expected_shingle_hashes.clear(); 233 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", 234 kMurmurHash3Seed)); 235 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", 236 kMurmurHash3Seed)); 237 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", 238 kMurmurHash3Seed)); 239 expected_shingle_hashes.insert(MurmurHash3String("way too many words ", 240 kMurmurHash3Seed)); 241 std::set<uint32>::iterator it = expected_shingle_hashes.end(); 242 expected_shingle_hashes.erase(--it); 243 244 features.Clear(); 245 shingle_hashes.clear(); 246 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 247 ExpectFeatureMapsAreEqual(features, expected_features); 248 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 249 250 // Test with empty page text. 251 page_text = base::string16(); 252 expected_features.Clear(); 253 expected_shingle_hashes.clear(); 254 features.Clear(); 255 shingle_hashes.clear(); 256 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 257 ExpectFeatureMapsAreEqual(features, expected_features); 258 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 259 260#if !defined(OS_ANDROID) 261 // The test code is disabled due to http://crbug.com/392234 262 // The client-side detection feature is not enabled on Android yet. 263 // If we decided to enable the feature, we need to fix the bug first. 264 265 // Chinese translation of the phrase "hello goodbye hello goodbye". This tests 266 // that we can correctly separate terms in languages that don't use spaces. 267 page_text = 268 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81" 269 "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); 270 expected_features.Clear(); 271 expected_features.AddBooleanFeature( 272 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); 273 expected_features.AddBooleanFeature( 274 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); 275 expected_shingle_hashes.clear(); 276 expected_shingle_hashes.insert(MurmurHash3String( 277 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 " 278 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed)); 279 280 features.Clear(); 281 shingle_hashes.clear(); 282 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 283 ExpectFeatureMapsAreEqual(features, expected_features); 284 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 285#endif 286} 287 288TEST_F(PhishingTermFeatureExtractorTest, Continuation) { 289 // For this test, we'll cause the feature extraction to run multiple 290 // iterations by incrementing the clock. 291 ResetExtractor(200 /* max shingles per page */); 292 293 // This page has a total of 30 words. For the features to be computed 294 // correctly, the extractor has to process the entire string of text. 295 base::string16 page_text(ASCIIToUTF16("one ")); 296 for (int i = 0; i < 28; ++i) { 297 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 298 } 299 page_text.append(ASCIIToUTF16("two")); 300 301 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. 302 // Note that this assumes kClockCheckGranularity = 5 and 303 // kMaxTimePerChunkMs = 10. 304 base::TimeTicks now = base::TimeTicks::Now(); 305 EXPECT_CALL(clock_, Now()) 306 // Time check at the start of extraction. 307 .WillOnce(Return(now)) 308 // Time check at the start of the first chunk of work. 309 .WillOnce(Return(now)) 310 // Time check after the first 5 words. 311 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3))) 312 // Time check after the next 5 words. 313 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6))) 314 // Time check after the next 5 words. 315 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9))) 316 // Time check after the next 5 words. This is over the chunk 317 // time limit, so a continuation task will be posted. 318 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12))) 319 // Time check at the start of the second chunk of work. 320 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22))) 321 // Time check after the next 5 words. 322 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25))) 323 // Time check after the next 5 words. 324 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) 325 // A final check for the histograms. 326 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); 327 328 FeatureMap expected_features; 329 expected_features.AddBooleanFeature(features::kPageTerm + 330 std::string("one")); 331 expected_features.AddBooleanFeature(features::kPageTerm + 332 std::string("two")); 333 std::set<uint32> expected_shingle_hashes; 334 expected_shingle_hashes.insert( 335 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); 336 expected_shingle_hashes.insert( 337 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); 338 expected_shingle_hashes.insert( 339 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); 340 expected_shingle_hashes.insert( 341 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); 342 expected_shingle_hashes.insert( 343 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); 344 expected_shingle_hashes.insert( 345 MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed)); 346 expected_shingle_hashes.insert( 347 MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed)); 348 expected_shingle_hashes.insert( 349 MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed)); 350 expected_shingle_hashes.insert( 351 MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed)); 352 expected_shingle_hashes.insert( 353 MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed)); 354 expected_shingle_hashes.insert( 355 MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed)); 356 expected_shingle_hashes.insert( 357 MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed)); 358 expected_shingle_hashes.insert( 359 MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed)); 360 expected_shingle_hashes.insert( 361 MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed)); 362 expected_shingle_hashes.insert( 363 MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed)); 364 expected_shingle_hashes.insert( 365 MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed)); 366 expected_shingle_hashes.insert( 367 MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed)); 368 expected_shingle_hashes.insert( 369 MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed)); 370 expected_shingle_hashes.insert( 371 MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed)); 372 expected_shingle_hashes.insert( 373 MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed)); 374 expected_shingle_hashes.insert( 375 MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed)); 376 expected_shingle_hashes.insert( 377 MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed)); 378 expected_shingle_hashes.insert( 379 MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed)); 380 expected_shingle_hashes.insert( 381 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); 382 expected_shingle_hashes.insert( 383 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); 384 expected_shingle_hashes.insert( 385 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); 386 expected_shingle_hashes.insert( 387 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); 388 389 FeatureMap features; 390 std::set<uint32> shingle_hashes; 391 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 392 ExpectFeatureMapsAreEqual(features, expected_features); 393 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); 394 // Make sure none of the mock expectations carry over to the next test. 395 ::testing::Mock::VerifyAndClearExpectations(&clock_); 396 397 // Now repeat the test with the same text, but advance the clock faster so 398 // that the extraction time exceeds the maximum total time for the feature 399 // extractor. Extraction should fail. Note that this assumes 400 // kMaxTotalTimeMs = 500. 401 EXPECT_CALL(clock_, Now()) 402 // Time check at the start of extraction. 403 .WillOnce(Return(now)) 404 // Time check at the start of the first chunk of work. 405 .WillOnce(Return(now)) 406 // Time check after the first 5 words, 407 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) 408 // Time check at the start of the second chunk of work. 409 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) 410 // Time check after the next 5 words. This is over the limit. 411 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) 412 // A final time check for the histograms. 413 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); 414 415 features.Clear(); 416 shingle_hashes.clear(); 417 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); 418} 419 420TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { 421 scoped_ptr<base::string16> page_text( 422 new base::string16(ASCIIToUTF16("one "))); 423 for (int i = 0; i < 28; ++i) { 424 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 425 } 426 427 base::TimeTicks now = base::TimeTicks::Now(); 428 EXPECT_CALL(clock_, Now()) 429 // Time check at the start of extraction. 430 .WillOnce(Return(now)) 431 // Time check at the start of the first chunk of work. 432 .WillOnce(Return(now)) 433 // Time check after the first 5 words. 434 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) 435 // Time check after the next 5 words. This should be greater than 436 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. 437 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); 438 439 FeatureMap features; 440 std::set<uint32> shingle_hashes; 441 // Extract first 10 words then stop. 442 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); 443 444 page_text.reset(new base::string16()); 445 for (int i = 30; i < 58; ++i) { 446 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 447 } 448 page_text->append(ASCIIToUTF16("multi word test ")); 449 features.Clear(); 450 shingle_hashes.clear(); 451 452 // This part doesn't exercise the extraction timing. 453 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 454 455 // Now extract normally and make sure nothing breaks. 456 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); 457 458 FeatureMap expected_features; 459 expected_features.AddBooleanFeature(features::kPageTerm + 460 std::string("multi word test")); 461 ExpectFeatureMapsAreEqual(features, expected_features); 462} 463 464} // namespace safe_browsing 465