1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
// Note that although this is not a "browser" test, it runs as part of
// browser_tests.  This is because WebKit does not work properly if it is
// shut down and re-initialized.  Since browser_tests runs each test in a
// new process, this avoids the problem.
9
10#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
11
12#include "base/bind.h"
13#include "base/callback.h"
14#include "base/command_line.h"
15#include "base/compiler_specific.h"
16#include "base/memory/weak_ptr.h"
17#include "base/message_loop/message_loop.h"
18#include "base/strings/string_number_conversions.h"
19#include "base/time/time.h"
20#include "chrome/browser/ui/browser.h"
21#include "chrome/browser/ui/tabs/tab_strip_model.h"
22#include "chrome/common/chrome_switches.h"
23#include "chrome/renderer/safe_browsing/features.h"
24#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
25#include "chrome/renderer/safe_browsing/test_utils.h"
26#include "chrome/test/base/in_process_browser_test.h"
27#include "chrome/test/base/ui_test_utils.h"
28#include "content/public/browser/interstitial_page.h"
29#include "content/public/browser/web_contents.h"
30#include "content/public/renderer/render_view.h"
31#include "content/public/test/browser_test_utils.h"
32#include "content/public/test/test_utils.h"
33#include "net/dns/mock_host_resolver.h"
34#include "net/test/embedded_test_server/embedded_test_server.h"
35#include "net/test/embedded_test_server/http_request.h"
36#include "net/test/embedded_test_server/http_response.h"
37#include "testing/gmock/include/gmock/gmock.h"
38#include "third_party/WebKit/public/platform/WebString.h"
39#include "third_party/WebKit/public/web/WebFrame.h"
40#include "third_party/WebKit/public/web/WebScriptSource.h"
41#include "third_party/WebKit/public/web/WebView.h"
42
43using ::testing::DoAll;
44using ::testing::Invoke;
45using ::testing::Return;
46
namespace {

// Routing ID used to look up the RenderView under test via
// content::RenderView::FromRoutingID().  The first RenderFrame is routing
// ID 1, and the first RenderView is 2.
const int kRenderViewRoutingId = 2;

}  // namespace
53
54namespace safe_browsing {
55
56class PhishingDOMFeatureExtractorTest : public InProcessBrowserTest {
57 public:
58  content::WebContents* GetWebContents() {
59    return browser()->tab_strip_model()->GetActiveWebContents();
60  }
61
62  // Helper for the SubframeRemoval test that posts a message to remove
63  // the iframe "frame1" from the document.
64  void ScheduleRemoveIframe() {
65    base::MessageLoop::current()->PostTask(
66        FROM_HERE,
67        base::Bind(&PhishingDOMFeatureExtractorTest::RemoveIframe,
68                   weak_factory_.GetWeakPtr()));
69  }
70
71 protected:
72  PhishingDOMFeatureExtractorTest() : weak_factory_(this) {}
73
74  virtual ~PhishingDOMFeatureExtractorTest() {}
75
76  virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE {
77    command_line->AppendSwitch(switches::kSingleProcess);
78#if defined(OS_WIN)
79    // Don't want to try to create a GPU process.
80    command_line->AppendSwitch(switches::kDisableGpu);
81#endif
82  }
83
84  virtual void SetUpOnMainThread() OVERRIDE {
85    extractor_.reset(new PhishingDOMFeatureExtractor(
86        content::RenderView::FromRoutingID(kRenderViewRoutingId), &clock_));
87
88    ASSERT_TRUE(StartTestServer());
89    host_resolver()->AddRule("*", "127.0.0.1");
90  }
91
92  // Runs the DOMFeatureExtractor on the RenderView, waiting for the
93  // completion callback.  Returns the success boolean from the callback.
94  bool ExtractFeatures(FeatureMap* features) {
95    success_ = false;
96    PostTaskToInProcessRendererAndWait(
97        base::Bind(&PhishingDOMFeatureExtractorTest::ExtractFeaturesInternal,
98        base::Unretained(this),
99        features));
100    return success_;
101  }
102
103  void ExtractFeaturesInternal(FeatureMap* features) {
104    scoped_refptr<content::MessageLoopRunner> message_loop =
105        new content::MessageLoopRunner;
106    extractor_->ExtractFeatures(
107        features,
108        base::Bind(&PhishingDOMFeatureExtractorTest::ExtractionDone,
109                   base::Unretained(this),
110                   message_loop->QuitClosure()));
111    message_loop->Run();
112  }
113
114  // Completion callback for feature extraction.
115  void ExtractionDone(const base::Closure& quit_closure,
116                      bool success) {
117    success_ = success;
118    quit_closure.Run();
119  }
120
121  // Does the actual work of removing the iframe "frame1" from the document.
122  void RemoveIframe() {
123    content::RenderView* render_view =
124        content::RenderView::FromRoutingID(kRenderViewRoutingId);
125    blink::WebFrame* main_frame = render_view->GetWebView()->mainFrame();
126    ASSERT_TRUE(main_frame);
127    main_frame->executeScript(
128        blink::WebString(
129            "document.body.removeChild(document.getElementById('frame1'));"));
130  }
131
132  bool StartTestServer() {
133    CHECK(!embedded_test_server_);
134    embedded_test_server_.reset(new net::test_server::EmbeddedTestServer());
135    embedded_test_server_->RegisterRequestHandler(
136        base::Bind(&PhishingDOMFeatureExtractorTest::HandleRequest,
137                   base::Unretained(this)));
138    return embedded_test_server_->InitializeAndWaitUntilReady();
139  }
140
141  scoped_ptr<net::test_server::HttpResponse> HandleRequest(
142      const net::test_server::HttpRequest& request) {
143    std::map<std::string, std::string>::const_iterator host_it =
144        request.headers.find("Host");
145    if (host_it == request.headers.end())
146      return scoped_ptr<net::test_server::HttpResponse>();
147
148    std::string url =
149        std::string("http://") + host_it->second + request.relative_url;
150    std::map<std::string, std::string>::const_iterator it =
151        responses_.find(url);
152    if (it == responses_.end())
153      return scoped_ptr<net::test_server::HttpResponse>();
154
155    scoped_ptr<net::test_server::BasicHttpResponse> http_response(
156        new net::test_server::BasicHttpResponse());
157    http_response->set_code(net::HTTP_OK);
158    http_response->set_content_type("text/html");
159    http_response->set_content(it->second);
160    return http_response.PassAs<net::test_server::HttpResponse>();
161  }
162
163  GURL GetURL(const std::string& host, const std::string& path) {
164    GURL::Replacements replace;
165    replace.SetHostStr(host);
166    replace.SetPathStr(path);
167    return embedded_test_server_->base_url().ReplaceComponents(replace);
168  }
169
170  // Returns the URL that was loaded.
171  GURL LoadHtml(const std::string& host, const std::string& content) {
172    GURL url(GetURL(host, ""));
173    responses_[url.spec()] = content;
174    ui_test_utils::NavigateToURL(browser(), url);
175    return url;
176  }
177
178  // Map of url -> response body for network requests from the renderer.
179  // Any urls not in this map are served a 404 error.
180  std::map<std::string, std::string> responses_;
181
182  scoped_ptr<net::test_server::EmbeddedTestServer> embedded_test_server_;
183  MockFeatureExtractorClock clock_;
184  scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
185  bool success_;  // holds the success value from ExtractFeatures
186  base::WeakPtrFactory<PhishingDOMFeatureExtractorTest> weak_factory_;
187};
188
189IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
190  // This test doesn't exercise the extraction timing.
191  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
192
193  FeatureMap expected_features;
194  expected_features.AddBooleanFeature(features::kPageHasForms);
195  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.25);
196  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
197  expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
198
199  FeatureMap features;
200  LoadHtml(
201      "host.com",
202      "<html><head><body>"
203      "<form action=\"query\"><input type=text><input type=checkbox></form>"
204      "<form action=\"http://cgi.host.com/submit\"></form>"
205      "<form action=\"http://other.com/\"></form>"
206      "<form action=\"query\"></form>"
207      "<form></form></body></html>");
208  ASSERT_TRUE(ExtractFeatures(&features));
209  ExpectFeatureMapsAreEqual(features, expected_features);
210
211  expected_features.Clear();
212  expected_features.AddBooleanFeature(features::kPageHasRadioInputs);
213  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
214
215  features.Clear();
216  LoadHtml(
217      "host.com",
218      "<html><head><body>"
219      "<input type=\"radio\"><input type=password></body></html>");
220  ASSERT_TRUE(ExtractFeatures(&features));
221  ExpectFeatureMapsAreEqual(features, expected_features);
222
223  expected_features.Clear();
224  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
225
226  features.Clear();
227  LoadHtml(
228      "host.com",
229      "<html><head><body><input></body></html>");
230  ASSERT_TRUE(ExtractFeatures(&features));
231  ExpectFeatureMapsAreEqual(features, expected_features);
232
233  expected_features.Clear();
234  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
235
236  features.Clear();
237  LoadHtml(
238      "host.com",
239      "<html><head><body><input type=\"invalid\"></body></html>");
240  ASSERT_TRUE(ExtractFeatures(&features));
241  ExpectFeatureMapsAreEqual(features, expected_features);
242}
243
244IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
245  // This test doesn't exercise the extraction timing.
246  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
247
248  FeatureMap expected_features;
249  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.5);
250  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.0);
251  expected_features.AddBooleanFeature(features::kPageLinkDomain +
252                                      std::string("chromium.org"));
253
254  FeatureMap features;
255  LoadHtml(
256      "www.host.com",
257      "<html><head><body>"
258      "<a href=\"http://www2.host.com/abc\">link</a>"
259      "<a name=page_anchor></a>"
260      "<a href=\"http://www.chromium.org/\">chromium</a>"
261      "</body></html");
262  ASSERT_TRUE(ExtractFeatures(&features));
263  ExpectFeatureMapsAreEqual(features, expected_features);
264
265  expected_features.Clear();
266  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
267  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.5);
268  expected_features.AddBooleanFeature(features::kPageLinkDomain +
269                                      std::string("chromium.org"));
270
271  net::SpawnedTestServer https_server(
272      net::SpawnedTestServer::TYPE_HTTPS,
273      net::SpawnedTestServer::kLocalhost,
274      base::FilePath(FILE_PATH_LITERAL("chrome/test/data")));
275  ASSERT_TRUE(https_server.Start());
276
277  // The PhishingDOMFeatureExtractor depends on URLs being domains and not IPs,
278  // so use a domain.
279  std::string url_str = "https://host.com:";
280  url_str += base::IntToString(https_server.host_port_pair().port());
281  url_str += "/files/safe_browsing/secure_link_features.html";
282  ui_test_utils::NavigateToURL(browser(), GURL(url_str));
283
284  // Click through the certificate error interstitial.
285  content::InterstitialPage* interstitial_page =
286      GetWebContents()->GetInterstitialPage();
287  interstitial_page->Proceed();
288  content::WaitForLoadStop(GetWebContents());
289
290  features.Clear();
291  ASSERT_TRUE(ExtractFeatures(&features));
292  ExpectFeatureMapsAreEqual(features, expected_features);
293}
294
295IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest,
296                       ScriptAndImageFeatures) {
297  // This test doesn't exercise the extraction timing.
298  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
299
300  FeatureMap expected_features;
301  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
302
303  FeatureMap features;
304  LoadHtml(
305      "host.com",
306      "<html><head><script></script><script></script></head></html>");
307  ASSERT_TRUE(ExtractFeatures(&features));
308  ExpectFeatureMapsAreEqual(features, expected_features);
309
310  expected_features.Clear();
311  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
312  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTSix);
313  expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 0.5);
314
315  features.Clear();
316  net::SpawnedTestServer https_server(
317      net::SpawnedTestServer::TYPE_HTTPS,
318      net::SpawnedTestServer::kLocalhost,
319      base::FilePath(FILE_PATH_LITERAL("chrome/test/data")));
320  ASSERT_TRUE(https_server.Start());
321
322  // The PhishingDOMFeatureExtractor depends on URLs being domains and not IPs,
323  // so use a domain.
324  std::string url_str = "https://host.com:";
325  url_str += base::IntToString(https_server.host_port_pair().port());
326  url_str += "/files/safe_browsing/secure_script_and_image.html";
327  ui_test_utils::NavigateToURL(browser(), GURL(url_str));
328
329  // Click through the certificate error interstitial.
330  content::InterstitialPage* interstitial_page =
331      GetWebContents()->GetInterstitialPage();
332  interstitial_page->Proceed();
333  content::WaitForLoadStop(GetWebContents());
334
335  ASSERT_TRUE(ExtractFeatures(&features));
336  ExpectFeatureMapsAreEqual(features, expected_features);
337}
338
339IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
340  // This test doesn't exercise the extraction timing.
341  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
342
343  // Test that features are aggregated across all frames.
344
345  std::string port = base::IntToString(embedded_test_server_->port());
346  responses_[GetURL("host2.com", "").spec()] =
347      "<html><head><script></script><body>"
348      "<form action=\"http://host4.com/\"><input type=checkbox></form>"
349      "<form action=\"http://host2.com/submit\"></form>"
350      "<a href=\"http://www.host2.com/home\">link</a>"
351      "<iframe src=\"nested.html\"></iframe>"
352      "<body></html>";
353
354  responses_[GetURL("host2.com", "nested.html").spec()] =
355      "<html><body><input type=password>"
356      "<a href=\"https://host4.com/\">link</a>"
357      "<a href=\"relative\">another</a>"
358      "</body></html>";
359
360  responses_[GetURL("host3.com", "").spec()] =
361      "<html><head><script></script><body>"
362      "<img src=\"http://host.com/123.png\">"
363      "</body></html>";
364
365  FeatureMap expected_features;
366  expected_features.AddBooleanFeature(features::kPageHasForms);
367  // Form action domains are compared to the URL of the document they're in,
368  // not the URL of the toplevel page.  So http://host2.com/ has two form
369  // actions, one of which is external.
370  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
371  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
372  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
373  expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
374  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
375  expected_features.AddBooleanFeature(features::kPageLinkDomain +
376                                      std::string("host4.com"));
377  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.25);
378  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
379  expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 1.0);
380
381  FeatureMap features;
382  std::string html(
383      "<html><body><input type=text><a href=\"info.html\">link</a>"
384      "<iframe src=\"http://host2.com:");
385  html += port;
386  html += std::string(
387      "/\"></iframe>"
388      "<iframe src=\"http://host3.com:");
389  html += port;
390  html += std::string("/\"></iframe></body></html>");
391
392  LoadHtml("host.com", html);
393  ASSERT_TRUE(ExtractFeatures(&features));
394  ExpectFeatureMapsAreEqual(features, expected_features);
395}
396
397IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, Continuation) {
398  // For this test, we'll cause the feature extraction to run multiple
399  // iterations by incrementing the clock.
400
401  // This page has a total of 50 elements.  For the external forms feature to
402  // be computed correctly, the extractor has to examine the whole document.
403  // Note: the empty HEAD is important -- WebKit will synthesize a HEAD if
404  // there isn't one present, which can be confusing for the element counts.
405  std::string response = "<html><head></head><body>"
406      "<form action=\"ondomain\"></form>";
407  for (int i = 0; i < 45; ++i) {
408    response.append("<p>");
409  }
410  response.append("<form action=\"http://host2.com/\"></form></body></html>");
411
412  // Advance the clock 6 ms every 10 elements processed, 10 ms between chunks.
413  // Note that this assumes kClockCheckGranularity = 10 and
414  // kMaxTimePerChunkMs = 10.
415  base::TimeTicks now = base::TimeTicks::Now();
416  EXPECT_CALL(clock_, Now())
417      // Time check at the start of extraction.
418      .WillOnce(Return(now))
419      // Time check at the start of the first chunk of work.
420      .WillOnce(Return(now))
421      // Time check after the first 10 elements.
422      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
423      // Time check after the next 10 elements.  This is over the chunk
424      // time limit, so a continuation task will be posted.
425      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
426      // Time check at the start of the second chunk of work.
427      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
428      // Time check after resuming iteration for the second chunk.
429      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(24)))
430      // Time check after the next 10 elements.
431      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
432      // Time check after the next 10 elements.  This will trigger another
433      // continuation task.
434      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(36)))
435      // Time check at the start of the third chunk of work.
436      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(46)))
437      // Time check after resuming iteration for the third chunk.
438      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(48)))
439      // Time check after the last 10 elements.
440      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(54)))
441      // A final time check for the histograms.
442      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(56)));
443
444  FeatureMap expected_features;
445  expected_features.AddBooleanFeature(features::kPageHasForms);
446  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
447
448  FeatureMap features;
449  LoadHtml("host.com", response);
450  ASSERT_TRUE(ExtractFeatures(&features));
451  ExpectFeatureMapsAreEqual(features, expected_features);
452  // Make sure none of the mock expectations carry over to the next test.
453  ::testing::Mock::VerifyAndClearExpectations(&clock_);
454
455  // Now repeat the test with the same page, but advance the clock faster so
456  // that the extraction time exceeds the maximum total time for the feature
457  // extractor.  Extraction should fail.  Note that this assumes
458  // kMaxTotalTimeMs = 500.
459  EXPECT_CALL(clock_, Now())
460      // Time check at the start of extraction.
461      .WillOnce(Return(now))
462      // Time check at the start of the first chunk of work.
463      .WillOnce(Return(now))
464      // Time check after the first 10 elements.
465      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
466      // Time check at the start of the second chunk of work.
467      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
468      // Time check after resuming iteration for the second chunk.
469      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(360)))
470      // Time check after the next 10 elements.  This is over the limit.
471      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
472      // A final time check for the histograms.
473      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
474
475  features.Clear();
476  EXPECT_FALSE(ExtractFeatures(&features));
477}
478
479IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, SubframeRemoval) {
480  // In this test, we'll advance the feature extractor so that it is positioned
481  // inside an iframe, and have it pause due to exceeding the chunk time limit.
482  // Then, prior to continuation, the iframe is removed from the document.
483  // As currently implemented, this should finish extraction from the removed
484  // iframe document.
485  responses_[GetURL("host.com", "frame.html").spec()] =
486      "<html><body><p><p><p><input type=password></body></html>";
487
488  base::TimeTicks now = base::TimeTicks::Now();
489  EXPECT_CALL(clock_, Now())
490      // Time check at the start of extraction.
491      .WillOnce(Return(now))
492      // Time check at the start of the first chunk of work.
493      .WillOnce(Return(now))
494      // Time check after the first 10 elements.  Enough time has passed
495      // to stop extraction.  Schedule the iframe removal to happen as soon as
496      // the feature extractor returns control to the message loop.
497      .WillOnce(DoAll(
498          Invoke(this, &PhishingDOMFeatureExtractorTest::ScheduleRemoveIframe),
499          Return(now + base::TimeDelta::FromMilliseconds(21))))
500      // Time check at the start of the second chunk of work.
501      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
502      // Time check after resuming iteration for the second chunk.
503      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(27)))
504      // A final time check for the histograms.
505      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(33)));
506
507  FeatureMap expected_features;
508  expected_features.AddBooleanFeature(features::kPageHasForms);
509  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
510
511  FeatureMap features;
512  LoadHtml(
513      "host.com",
514      "<html><head></head><body>"
515      "<iframe src=\"frame.html\" id=\"frame1\"></iframe>"
516      "<form></form></body></html>");
517  ASSERT_TRUE(ExtractFeatures(&features));
518  ExpectFeatureMapsAreEqual(features, expected_features);
519}
520
521}  // namespace safe_browsing
522