1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be
3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file.
4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// This test performs a series false positive checks using a list of URLs
6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// against a known set of SafeBrowsing data.
7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// It uses a normal SafeBrowsing database to create a bloom filter where it
9c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// looks up all the URLs in the url file. A URL that has a prefix found in the
10c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// bloom filter and found in the database is considered a hit: a valid lookup
11c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// that will result in a gethash request. A URL that has a prefix found in the
12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// bloom filter but not in the database is a miss: a false positive lookup that
13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// will result in an unnecessary gethash request.
14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// By varying the size of the bloom filter and using a constant set of
16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// SafeBrowsing data, we can check a known set of URLs against the filter and
17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// determine the false positive rate.
18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// False positive calculation usage:
20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives
21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                      --filter-start=<integer>
22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                      --filter-steps=<integer>
23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                      --filter-verbose
24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//  --filter-start:   The filter multiplier to begin with. This represents the
26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    number of bits per prefix of memory to use in the filter.
27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    The default value is identical to the current SafeBrowsing
28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    database value.
29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//  --filter-steps:   The number of iterations to run, with each iteration
30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    increasing the filter multiplier by 1. The default value
31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    is 1.
32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//  --filter-verbose: Used to print out the hit / miss results per URL.
33c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//  --filter-csv:     The URL file contains information about the number of
34c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    unique views (the popularity) of each URL. See the format
35c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                    description below.
36c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
37c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
38c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Hash compute time usage:
39c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime
40c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                      --filter-num-checks=<integer>
41c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
42c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//  --filter-num-checks: The number of hash look ups to perform on the bloom
43c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//                       filter. The default is 10 million.
44c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
45c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Data files:
46c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//    chrome/test/data/safe_browsing/filter/database
47c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//    chrome/test/data/safe_browsing/filter/urls
48c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
49c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// database: A normal SafeBrowsing database.
50c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// urls:     A text file containing a list of URLs, one per line. If the option
51c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//           --filter-csv is specified, the format of each line in the file is
52c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//           <url>,<weight> where weight is an integer indicating the number of
53c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//           unique views for the URL.
54c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
5572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include <algorithm>
56c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <fstream>
57c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <vector>
58c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
59ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "app/sql/statement.h"
60c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/command_line.h"
61c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/file_path.h"
62c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/file_util.h"
63c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h"
64ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_ptr.h"
65c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/path_service.h"
66c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/rand_util.h"
673345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/string_number_conversions.h"
68c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h"
69c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/time.h"
70ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "crypto/sha2.h"
71c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/safe_browsing/bloom_filter.h"
72c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/safe_browsing/safe_browsing_util.h"
73c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/common/chrome_paths.h"
74c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/gurl.h"
75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "testing/gtest/include/gtest/gtest.h"
76c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochusing base::Time;
78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochusing base::TimeDelta;
79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace {
81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Command line flags.
833345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterVerbose[] = "filter-verbose";
843345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterStart[] = "filter-start";
853345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterSteps[] = "filter-steps";
863345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterCsv[] = "filter-csv";
873345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterNumChecks[] = "filter-num-checks";
88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Number of hash checks to make during performance testing.
90ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenconst int kNumHashChecks = 10000000;
91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns the path to the data used in this test, relative to the top of the
93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// source directory.
94c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochFilePath GetFullDataPath() {
95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath full_path;
96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path));
97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing"));
98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  full_path = full_path.Append(FILE_PATH_LITERAL("filter"));
99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  CHECK(file_util::PathExists(full_path));
100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return full_path;
101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Constructs a bloom filter of the appropriate size from the provided prefixes.
104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid BuildBloomFilter(int size_multiplier,
105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      const std::vector<SBPrefix>& prefixes,
106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      BloomFilter** bloom_filter) {
107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Create a BloomFilter with the specified size.
108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const int key_count = std::max(static_cast<int>(prefixes.size()),
109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 BloomFilter::kBloomFilterMinSize);
110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const int filter_size = key_count * size_multiplier;
111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  *bloom_filter = new BloomFilter(filter_size);
112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Add the prefixes to it.
114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < prefixes.size(); ++i)
115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    (*bloom_filter)->Insert(prefixes[i]);
116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::cout << "Bloom filter with prefixes: " << prefixes.size()
118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", multiplier: " << size_multiplier
119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", size (bytes): " << (*bloom_filter)->size()
120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << std::endl;
121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Reads the set of add prefixes contained in a SafeBrowsing database into a
124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// sorted array suitable for fast searching. This takes significantly less time
125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// to look up a given prefix than performing SQL queries.
126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) {
127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath database_file = path.Append(FILE_PATH_LITERAL("database"));
128ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  sql::Connection db;
129ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!db.Open(database_file))
130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Get the number of items in the add_prefix table.
133ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const char* query = "SELECT COUNT(*) FROM add_prefix";
134ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  sql::Statement count_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
135ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!count_statement)
136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
138ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!count_statement.Step())
139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
141ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const int count = count_statement.ColumnInt(0);
142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
143ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Load them into a prefix vector and sort.
144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  prefixes->reserve(count);
145ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  query = "SELECT prefix FROM add_prefix";
146ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  sql::Statement prefix_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
147ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!prefix_statement)
148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
150ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  while (prefix_statement.Step())
151ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    prefixes->push_back(prefix_statement.ColumnInt(0));
152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(static_cast<int>(prefixes->size()) == count);
154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  sort(prefixes->begin(), prefixes->end());
155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return true;
157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Generates all legal SafeBrowsing prefixes for the specified URL, and returns
160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// the set of Prefixes that exist in the bloom filter. The function returns the
161c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// number of host + path combinations checked.
162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint GeneratePrefixHits(const std::string url,
163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                       BloomFilter* bloom_filter,
164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                       std::vector<SBPrefix>* prefixes) {
165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  GURL url_check(url);
166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<std::string> hosts;
167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (url_check.HostIsIPAddress()) {
168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    hosts.push_back(url_check.host());
169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  } else {
170c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    safe_browsing_util::GenerateHostsToCheck(url_check, &hosts);
171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<std::string> paths;
174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  safe_browsing_util::GeneratePathsToCheck(url_check, &paths);
175c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < hosts.size(); ++i) {
177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    for (size_t j = 0; j < paths.size(); ++j) {
178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      SBPrefix prefix;
179ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      crypto::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix));
180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (bloom_filter->Exists(prefix))
181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        prefixes->push_back(prefix);
182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return hosts.size() * paths.size();
186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Binary search of sorted prefixes.
189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool IsPrefixInDatabase(SBPrefix prefix,
190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                        const std::vector<SBPrefix>& prefixes) {
191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (prefixes.empty())
192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int low = 0;
195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int high = prefixes.size() - 1;
196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while (low <= high) {
197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    int mid = ((unsigned int)low + (unsigned int)high) >> 1;
198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    SBPrefix prefix_mid = prefixes[mid];
199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (prefix_mid == prefix)
200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return true;
201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (prefix_mid < prefix)
202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      low = mid + 1;
203c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    else
204c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      high = mid - 1;
205c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
206c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
207c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return false;
208c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Construct a bloom filter with the given prefixes and multiplier, and test the
211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// false positive rate (misses) against a URL list.
212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CalculateBloomFilterFalsePositives(
213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    int size_multiplier,
214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const FilePath& data_dir,
215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const std::vector<SBPrefix>& prefix_list) {
216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  BloomFilter* bloom_filter = NULL;
217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter);
218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  scoped_refptr<BloomFilter> scoped_filter(bloom_filter);
219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Read in data file line at a time.
221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls"));
222dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen  std::ifstream url_stream(url_file.value().c_str());
223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Keep track of stats
225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int hits = 0;
226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int misses = 0;
227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int weighted_hits = 0;
228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int weighted_misses = 0;
229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int url_count = 0;
230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int prefix_count = 0;
231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Print out volumes of data (per URL hit and miss information).
233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose);
234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv);
235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string url;
237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while (std::getline(url_stream, url)) {
238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    ++url_count;
239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Handle a format that contains URLs weighted by unique views.
241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    int weight = 1;
242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (use_weights) {
243c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      std::string::size_type pos = url.find_last_of(",");
244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (pos != std::string::npos) {
245513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch        base::StringToInt(url.begin() + pos + 1, url.end(), &weight);
246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        url = url.substr(0, pos);
247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      }
248c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // See if the URL is in the bloom filter.
251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::vector<SBPrefix> prefixes;
252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes);
253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // See if the prefix is actually in the database (in-memory prefix list).
255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    for (size_t i = 0; i < prefixes.size(); ++i) {
256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (IsPrefixInDatabase(prefixes[i], prefix_list)) {
257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        ++hits;
258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        weighted_hits += weight;
259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        if (verbose) {
260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          std::cout << "Hit for URL: " << url
261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                    << " (prefix = "   << prefixes[i] << ")"
262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                    << std::endl;
263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        }
264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      } else {
265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        ++misses;
266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        weighted_misses += weight;
267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        if (verbose) {
268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          std::cout << "Miss for URL: " << url
269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                    << " (prefix = "    << prefixes[i] << ")"
270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                    << std::endl;
271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        }
272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      }
273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Print out the results for this test.
277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::cout << "URLs checked: "      << url_count
278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", prefix compares: " << prefix_count
279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", hits: "            << hits
280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", misses: "          << misses;
281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (use_weights) {
282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::cout << ", weighted hits: "   << weighted_hits
283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch              << ", weighted misses: " << weighted_misses;
284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::cout << std::endl;
286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}  // namespace
289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// This test can take several minutes to perform its calculations, so it should
291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// be disabled until you need to run it.
292c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochTEST(SafeBrowsingBloomFilter, FalsePositives) {
293c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<SBPrefix> prefix_list;
294c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath data_dir = GetFullDataPath();
295c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
296c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
2973345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
2983345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick
299c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int start = BloomFilter::kBloomFilterSizeRatio;
3003345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  if (cmd_line.HasSwitch(kFilterStart)) {
3013345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick    ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart),
3023345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick                                  &start));
303c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
304c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
305c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int steps = 1;
3063345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  if (cmd_line.HasSwitch(kFilterSteps)) {
3073345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick    ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps),
3083345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick                                  &steps));
309c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
310c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
311c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int stop = start + steps;
312c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
313c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (int multiplier = start; multiplier < stop; ++multiplier)
314c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list);
315c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
316c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
317c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Computes the time required for performing a number of look ups in a bloom
318c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// filter. This is useful for measuring the performance of new hash functions.
319c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochTEST(SafeBrowsingBloomFilter, HashTime) {
320c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Read the data from the database.
321c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<SBPrefix> prefix_list;
322c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath data_dir = GetFullDataPath();
323c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
324c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
3253345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
3263345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick
327c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int num_checks = kNumHashChecks;
3283345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  if (cmd_line.HasSwitch(kFilterNumChecks)) {
3293345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick    ASSERT_TRUE(
3303345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick        base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks),
3313345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick                          &num_checks));
332c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
333c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
334c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Populate the bloom filter and measure the time.
335c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  BloomFilter* bloom_filter = NULL;
336c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Time populate_before = Time::Now();
337c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio,
338c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                   prefix_list, &bloom_filter);
339c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TimeDelta populate = Time::Now() - populate_before;
340c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
341c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Check a large number of random prefixes against the filter.
342c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int hits = 0;
343c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Time check_before = Time::Now();
344c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (int i = 0; i < num_checks; ++i) {
345c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    uint32 prefix = static_cast<uint32>(base::RandUint64());
346c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (bloom_filter->Exists(prefix))
347c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      ++hits;
348c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
349c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TimeDelta check = Time::Now() - check_before;
350c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
351c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int64 time_per_insert = populate.InMicroseconds() /
352c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                          static_cast<int>(prefix_list.size());
353c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int64 time_per_check = check.InMicroseconds() / num_checks;
354c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
355c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::cout << "Time results for checks: " << num_checks
356c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", prefixes: "              << prefix_list.size()
357c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", populate time (ms): "    << populate.InMilliseconds()
358c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", check time (ms): "       << check.InMilliseconds()
359c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", hits: "                  << hits
360c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", per-populate (us): "     << time_per_insert
361c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << ", per-check (us): "        << time_per_check
362c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            << std::endl;
363c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
364