// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Note that although this is not a "browser" test, it runs as part of
// browser_tests. This is because WebKit does not work properly if it is
// shutdown and re-initialized. Since browser_tests runs each test in a
// new process, this avoids the problem.

#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"

#include <map>
#include <string>

#include "base/bind.h"
#include "base/callback.h"
#include "base/command_line.h"
#include "base/compiler_specific.h"
#include "base/memory/weak_ptr.h"
#include "base/message_loop/message_loop.h"
#include "base/strings/string_number_conversions.h"
#include "base/time/time.h"
#include "chrome/browser/ui/browser.h"
#include "chrome/browser/ui/tabs/tab_strip_model.h"
#include "chrome/common/chrome_switches.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/test_utils.h"
#include "chrome/test/base/in_process_browser_test.h"
#include "chrome/test/base/ui_test_utils.h"
#include "content/public/browser/interstitial_page.h"
#include "content/public/browser/web_contents.h"
#include "content/public/renderer/render_view.h"
#include "content/public/test/browser_test_utils.h"
#include "content/public/test/test_utils.h"
#include "net/dns/mock_host_resolver.h"
#include "net/test/embedded_test_server/embedded_test_server.h"
#include "net/test/embedded_test_server/http_request.h"
#include "net/test/embedded_test_server/http_response.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/web/WebFrame.h"
#include "third_party/WebKit/public/web/WebScriptSource.h"
#include "third_party/WebKit/public/web/WebView.h"

43using ::testing::DoAll; 44using ::testing::Invoke; 45using ::testing::Return; 46 47namespace { 48 49// The first RenderFrame is routing ID 1, and the first RenderView is 2. 50const int kRenderViewRoutingId = 2; 51 52} 53 54namespace safe_browsing { 55 56class PhishingDOMFeatureExtractorTest : public InProcessBrowserTest { 57 public: 58 content::WebContents* GetWebContents() { 59 return browser()->tab_strip_model()->GetActiveWebContents(); 60 } 61 62 // Helper for the SubframeRemoval test that posts a message to remove 63 // the iframe "frame1" from the document. 64 void ScheduleRemoveIframe() { 65 base::MessageLoop::current()->PostTask( 66 FROM_HERE, 67 base::Bind(&PhishingDOMFeatureExtractorTest::RemoveIframe, 68 weak_factory_.GetWeakPtr())); 69 } 70 71 protected: 72 PhishingDOMFeatureExtractorTest() : weak_factory_(this) {} 73 74 virtual ~PhishingDOMFeatureExtractorTest() {} 75 76 virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE { 77 command_line->AppendSwitch(switches::kSingleProcess); 78#if defined(OS_WIN) 79 // Don't want to try to create a GPU process. 80 command_line->AppendSwitch(switches::kDisableGpu); 81#endif 82 } 83 84 virtual void SetUpOnMainThread() OVERRIDE { 85 extractor_.reset(new PhishingDOMFeatureExtractor( 86 content::RenderView::FromRoutingID(kRenderViewRoutingId), &clock_)); 87 88 ASSERT_TRUE(StartTestServer()); 89 host_resolver()->AddRule("*", "127.0.0.1"); 90 } 91 92 // Runs the DOMFeatureExtractor on the RenderView, waiting for the 93 // completion callback. Returns the success boolean from the callback. 
94 bool ExtractFeatures(FeatureMap* features) { 95 success_ = false; 96 PostTaskToInProcessRendererAndWait( 97 base::Bind(&PhishingDOMFeatureExtractorTest::ExtractFeaturesInternal, 98 base::Unretained(this), 99 features)); 100 return success_; 101 } 102 103 void ExtractFeaturesInternal(FeatureMap* features) { 104 scoped_refptr<content::MessageLoopRunner> message_loop = 105 new content::MessageLoopRunner; 106 extractor_->ExtractFeatures( 107 features, 108 base::Bind(&PhishingDOMFeatureExtractorTest::ExtractionDone, 109 base::Unretained(this), 110 message_loop->QuitClosure())); 111 message_loop->Run(); 112 } 113 114 // Completion callback for feature extraction. 115 void ExtractionDone(const base::Closure& quit_closure, 116 bool success) { 117 success_ = success; 118 quit_closure.Run(); 119 } 120 121 // Does the actual work of removing the iframe "frame1" from the document. 122 void RemoveIframe() { 123 content::RenderView* render_view = 124 content::RenderView::FromRoutingID(kRenderViewRoutingId); 125 blink::WebFrame* main_frame = render_view->GetWebView()->mainFrame(); 126 ASSERT_TRUE(main_frame); 127 main_frame->executeScript( 128 blink::WebString( 129 "document.body.removeChild(document.getElementById('frame1'));")); 130 } 131 132 bool StartTestServer() { 133 CHECK(!embedded_test_server_); 134 embedded_test_server_.reset(new net::test_server::EmbeddedTestServer()); 135 embedded_test_server_->RegisterRequestHandler( 136 base::Bind(&PhishingDOMFeatureExtractorTest::HandleRequest, 137 base::Unretained(this))); 138 return embedded_test_server_->InitializeAndWaitUntilReady(); 139 } 140 141 scoped_ptr<net::test_server::HttpResponse> HandleRequest( 142 const net::test_server::HttpRequest& request) { 143 std::map<std::string, std::string>::const_iterator host_it = 144 request.headers.find("Host"); 145 if (host_it == request.headers.end()) 146 return scoped_ptr<net::test_server::HttpResponse>(); 147 148 std::string url = 149 std::string("http://") + host_it->second + 
request.relative_url; 150 std::map<std::string, std::string>::const_iterator it = 151 responses_.find(url); 152 if (it == responses_.end()) 153 return scoped_ptr<net::test_server::HttpResponse>(); 154 155 scoped_ptr<net::test_server::BasicHttpResponse> http_response( 156 new net::test_server::BasicHttpResponse()); 157 http_response->set_code(net::HTTP_OK); 158 http_response->set_content_type("text/html"); 159 http_response->set_content(it->second); 160 return http_response.PassAs<net::test_server::HttpResponse>(); 161 } 162 163 GURL GetURL(const std::string& host, const std::string& path) { 164 GURL::Replacements replace; 165 replace.SetHostStr(host); 166 replace.SetPathStr(path); 167 return embedded_test_server_->base_url().ReplaceComponents(replace); 168 } 169 170 // Returns the URL that was loaded. 171 GURL LoadHtml(const std::string& host, const std::string& content) { 172 GURL url(GetURL(host, "")); 173 responses_[url.spec()] = content; 174 ui_test_utils::NavigateToURL(browser(), url); 175 return url; 176 } 177 178 // Map of url -> response body for network requests from the renderer. 179 // Any urls not in this map are served a 404 error. 180 std::map<std::string, std::string> responses_; 181 182 scoped_ptr<net::test_server::EmbeddedTestServer> embedded_test_server_; 183 MockFeatureExtractorClock clock_; 184 scoped_ptr<PhishingDOMFeatureExtractor> extractor_; 185 bool success_; // holds the success value from ExtractFeatures 186 base::WeakPtrFactory<PhishingDOMFeatureExtractorTest> weak_factory_; 187}; 188 189IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) { 190 // This test doesn't exercise the extraction timing. 
191 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 192 193 FeatureMap expected_features; 194 expected_features.AddBooleanFeature(features::kPageHasForms); 195 expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.25); 196 expected_features.AddBooleanFeature(features::kPageHasTextInputs); 197 expected_features.AddBooleanFeature(features::kPageHasCheckInputs); 198 199 FeatureMap features; 200 LoadHtml( 201 "host.com", 202 "<html><head><body>" 203 "<form action=\"query\"><input type=text><input type=checkbox></form>" 204 "<form action=\"http://cgi.host.com/submit\"></form>" 205 "<form action=\"http://other.com/\"></form>" 206 "<form action=\"query\"></form>" 207 "<form></form></body></html>"); 208 ASSERT_TRUE(ExtractFeatures(&features)); 209 ExpectFeatureMapsAreEqual(features, expected_features); 210 211 expected_features.Clear(); 212 expected_features.AddBooleanFeature(features::kPageHasRadioInputs); 213 expected_features.AddBooleanFeature(features::kPageHasPswdInputs); 214 215 features.Clear(); 216 LoadHtml( 217 "host.com", 218 "<html><head><body>" 219 "<input type=\"radio\"><input type=password></body></html>"); 220 ASSERT_TRUE(ExtractFeatures(&features)); 221 ExpectFeatureMapsAreEqual(features, expected_features); 222 223 expected_features.Clear(); 224 expected_features.AddBooleanFeature(features::kPageHasTextInputs); 225 226 features.Clear(); 227 LoadHtml( 228 "host.com", 229 "<html><head><body><input></body></html>"); 230 ASSERT_TRUE(ExtractFeatures(&features)); 231 ExpectFeatureMapsAreEqual(features, expected_features); 232 233 expected_features.Clear(); 234 expected_features.AddBooleanFeature(features::kPageHasTextInputs); 235 236 features.Clear(); 237 LoadHtml( 238 "host.com", 239 "<html><head><body><input type=\"invalid\"></body></html>"); 240 ASSERT_TRUE(ExtractFeatures(&features)); 241 ExpectFeatureMapsAreEqual(features, expected_features); 242} 243 244IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, 
LinkFeatures) { 245 // This test doesn't exercise the extraction timing. 246 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 247 248 FeatureMap expected_features; 249 expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.5); 250 expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.0); 251 expected_features.AddBooleanFeature(features::kPageLinkDomain + 252 std::string("chromium.org")); 253 254 FeatureMap features; 255 LoadHtml( 256 "www.host.com", 257 "<html><head><body>" 258 "<a href=\"http://www2.host.com/abc\">link</a>" 259 "<a name=page_anchor></a>" 260 "<a href=\"http://www.chromium.org/\">chromium</a>" 261 "</body></html"); 262 ASSERT_TRUE(ExtractFeatures(&features)); 263 ExpectFeatureMapsAreEqual(features, expected_features); 264 265 expected_features.Clear(); 266 expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25); 267 expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.5); 268 expected_features.AddBooleanFeature(features::kPageLinkDomain + 269 std::string("chromium.org")); 270 271 net::SpawnedTestServer https_server( 272 net::SpawnedTestServer::TYPE_HTTPS, 273 net::SpawnedTestServer::kLocalhost, 274 base::FilePath(FILE_PATH_LITERAL("chrome/test/data"))); 275 ASSERT_TRUE(https_server.Start()); 276 277 // The PhishingDOMFeatureExtractor depends on URLs being domains and not IPs, 278 // so use a domain. 279 std::string url_str = "https://host.com:"; 280 url_str += base::IntToString(https_server.host_port_pair().port()); 281 url_str += "/files/safe_browsing/secure_link_features.html"; 282 ui_test_utils::NavigateToURL(browser(), GURL(url_str)); 283 284 // Click through the certificate error interstitial. 
285 content::InterstitialPage* interstitial_page = 286 GetWebContents()->GetInterstitialPage(); 287 interstitial_page->Proceed(); 288 content::WaitForLoadStop(GetWebContents()); 289 290 features.Clear(); 291 ASSERT_TRUE(ExtractFeatures(&features)); 292 ExpectFeatureMapsAreEqual(features, expected_features); 293} 294 295IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, 296 ScriptAndImageFeatures) { 297 // This test doesn't exercise the extraction timing. 298 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 299 300 FeatureMap expected_features; 301 expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne); 302 303 FeatureMap features; 304 LoadHtml( 305 "host.com", 306 "<html><head><script></script><script></script></head></html>"); 307 ASSERT_TRUE(ExtractFeatures(&features)); 308 ExpectFeatureMapsAreEqual(features, expected_features); 309 310 expected_features.Clear(); 311 expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne); 312 expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTSix); 313 expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 0.5); 314 315 features.Clear(); 316 net::SpawnedTestServer https_server( 317 net::SpawnedTestServer::TYPE_HTTPS, 318 net::SpawnedTestServer::kLocalhost, 319 base::FilePath(FILE_PATH_LITERAL("chrome/test/data"))); 320 ASSERT_TRUE(https_server.Start()); 321 322 // The PhishingDOMFeatureExtractor depends on URLs being domains and not IPs, 323 // so use a domain. 324 std::string url_str = "https://host.com:"; 325 url_str += base::IntToString(https_server.host_port_pair().port()); 326 url_str += "/files/safe_browsing/secure_script_and_image.html"; 327 ui_test_utils::NavigateToURL(browser(), GURL(url_str)); 328 329 // Click through the certificate error interstitial. 
330 content::InterstitialPage* interstitial_page = 331 GetWebContents()->GetInterstitialPage(); 332 interstitial_page->Proceed(); 333 content::WaitForLoadStop(GetWebContents()); 334 335 ASSERT_TRUE(ExtractFeatures(&features)); 336 ExpectFeatureMapsAreEqual(features, expected_features); 337} 338 339IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) { 340 // This test doesn't exercise the extraction timing. 341 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 342 343 // Test that features are aggregated across all frames. 344 345 std::string port = base::IntToString(embedded_test_server_->port()); 346 responses_[GetURL("host2.com", "").spec()] = 347 "<html><head><script></script><body>" 348 "<form action=\"http://host4.com/\"><input type=checkbox></form>" 349 "<form action=\"http://host2.com/submit\"></form>" 350 "<a href=\"http://www.host2.com/home\">link</a>" 351 "<iframe src=\"nested.html\"></iframe>" 352 "<body></html>"; 353 354 responses_[GetURL("host2.com", "nested.html").spec()] = 355 "<html><body><input type=password>" 356 "<a href=\"https://host4.com/\">link</a>" 357 "<a href=\"relative\">another</a>" 358 "</body></html>"; 359 360 responses_[GetURL("host3.com", "").spec()] = 361 "<html><head><script></script><body>" 362 "<img src=\"http://host.com/123.png\">" 363 "</body></html>"; 364 365 FeatureMap expected_features; 366 expected_features.AddBooleanFeature(features::kPageHasForms); 367 // Form action domains are compared to the URL of the document they're in, 368 // not the URL of the toplevel page. So http://host2.com/ has two form 369 // actions, one of which is external. 
370 expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5); 371 expected_features.AddBooleanFeature(features::kPageHasTextInputs); 372 expected_features.AddBooleanFeature(features::kPageHasPswdInputs); 373 expected_features.AddBooleanFeature(features::kPageHasCheckInputs); 374 expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25); 375 expected_features.AddBooleanFeature(features::kPageLinkDomain + 376 std::string("host4.com")); 377 expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.25); 378 expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne); 379 expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 1.0); 380 381 FeatureMap features; 382 std::string html( 383 "<html><body><input type=text><a href=\"info.html\">link</a>" 384 "<iframe src=\"http://host2.com:"); 385 html += port; 386 html += std::string( 387 "/\"></iframe>" 388 "<iframe src=\"http://host3.com:"); 389 html += port; 390 html += std::string("/\"></iframe></body></html>"); 391 392 LoadHtml("host.com", html); 393 ASSERT_TRUE(ExtractFeatures(&features)); 394 ExpectFeatureMapsAreEqual(features, expected_features); 395} 396 397IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, Continuation) { 398 // For this test, we'll cause the feature extraction to run multiple 399 // iterations by incrementing the clock. 400 401 // This page has a total of 50 elements. For the external forms feature to 402 // be computed correctly, the extractor has to examine the whole document. 403 // Note: the empty HEAD is important -- WebKit will synthesize a HEAD if 404 // there isn't one present, which can be confusing for the element counts. 
405 std::string response = "<html><head></head><body>" 406 "<form action=\"ondomain\"></form>"; 407 for (int i = 0; i < 45; ++i) { 408 response.append("<p>"); 409 } 410 response.append("<form action=\"http://host2.com/\"></form></body></html>"); 411 412 // Advance the clock 6 ms every 10 elements processed, 10 ms between chunks. 413 // Note that this assumes kClockCheckGranularity = 10 and 414 // kMaxTimePerChunkMs = 10. 415 base::TimeTicks now = base::TimeTicks::Now(); 416 EXPECT_CALL(clock_, Now()) 417 // Time check at the start of extraction. 418 .WillOnce(Return(now)) 419 // Time check at the start of the first chunk of work. 420 .WillOnce(Return(now)) 421 // Time check after the first 10 elements. 422 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6))) 423 // Time check after the next 10 elements. This is over the chunk 424 // time limit, so a continuation task will be posted. 425 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12))) 426 // Time check at the start of the second chunk of work. 427 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22))) 428 // Time check after resuming iteration for the second chunk. 429 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(24))) 430 // Time check after the next 10 elements. 431 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))) 432 // Time check after the next 10 elements. This will trigger another 433 // continuation task. 434 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(36))) 435 // Time check at the start of the third chunk of work. 436 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(46))) 437 // Time check after resuming iteration for the third chunk. 438 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(48))) 439 // Time check after the last 10 elements. 440 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(54))) 441 // A final time check for the histograms. 
442 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(56))); 443 444 FeatureMap expected_features; 445 expected_features.AddBooleanFeature(features::kPageHasForms); 446 expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5); 447 448 FeatureMap features; 449 LoadHtml("host.com", response); 450 ASSERT_TRUE(ExtractFeatures(&features)); 451 ExpectFeatureMapsAreEqual(features, expected_features); 452 // Make sure none of the mock expectations carry over to the next test. 453 ::testing::Mock::VerifyAndClearExpectations(&clock_); 454 455 // Now repeat the test with the same page, but advance the clock faster so 456 // that the extraction time exceeds the maximum total time for the feature 457 // extractor. Extraction should fail. Note that this assumes 458 // kMaxTotalTimeMs = 500. 459 EXPECT_CALL(clock_, Now()) 460 // Time check at the start of extraction. 461 .WillOnce(Return(now)) 462 // Time check at the start of the first chunk of work. 463 .WillOnce(Return(now)) 464 // Time check after the first 10 elements. 465 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) 466 // Time check at the start of the second chunk of work. 467 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) 468 // Time check after resuming iteration for the second chunk. 469 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(360))) 470 // Time check after the next 10 elements. This is over the limit. 471 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) 472 // A final time check for the histograms. 473 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); 474 475 features.Clear(); 476 EXPECT_FALSE(ExtractFeatures(&features)); 477} 478 479IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, SubframeRemoval) { 480 // In this test, we'll advance the feature extractor so that it is positioned 481 // inside an iframe, and have it pause due to exceeding the chunk time limit. 
482 // Then, prior to continuation, the iframe is removed from the document. 483 // As currently implemented, this should finish extraction from the removed 484 // iframe document. 485 responses_[GetURL("host.com", "frame.html").spec()] = 486 "<html><body><p><p><p><input type=password></body></html>"; 487 488 base::TimeTicks now = base::TimeTicks::Now(); 489 EXPECT_CALL(clock_, Now()) 490 // Time check at the start of extraction. 491 .WillOnce(Return(now)) 492 // Time check at the start of the first chunk of work. 493 .WillOnce(Return(now)) 494 // Time check after the first 10 elements. Enough time has passed 495 // to stop extraction. Schedule the iframe removal to happen as soon as 496 // the feature extractor returns control to the message loop. 497 .WillOnce(DoAll( 498 Invoke(this, &PhishingDOMFeatureExtractorTest::ScheduleRemoveIframe), 499 Return(now + base::TimeDelta::FromMilliseconds(21)))) 500 // Time check at the start of the second chunk of work. 501 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25))) 502 // Time check after resuming iteration for the second chunk. 503 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(27))) 504 // A final time check for the histograms. 505 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(33))); 506 507 FeatureMap expected_features; 508 expected_features.AddBooleanFeature(features::kPageHasForms); 509 expected_features.AddBooleanFeature(features::kPageHasPswdInputs); 510 511 FeatureMap features; 512 LoadHtml( 513 "host.com", 514 "<html><head></head><body>" 515 "<iframe src=\"frame.html\" id=\"frame1\"></iframe>" 516 "<form></form></body></html>"); 517 ASSERT_TRUE(ExtractFeatures(&features)); 518 ExpectFeatureMapsAreEqual(features, expected_features); 519} 520 521} // namespace safe_browsing 522