filter_false_positive_perftest.cc revision 513209b27ff55e2841eac0e4120199c23acce758
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// This test performs a series false positive checks using a list of URLs
6// against a known set of SafeBrowsing data.
7//
8// It uses a normal SafeBrowsing database to create a bloom filter where it
9// looks up all the URLs in the url file. A URL that has a prefix found in the
10// bloom filter and found in the database is considered a hit: a valid lookup
11// that will result in a gethash request. A URL that has a prefix found in the
12// bloom filter but not in the database is a miss: a false positive lookup that
13// will result in an unnecessary gethash request.
14//
15// By varying the size of the bloom filter and using a constant set of
16// SafeBrowsing data, we can check a known set of URLs against the filter and
17// determine the false positive rate.
18//
19// False positive calculation usage:
20//   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives
21//                      --filter-start=<integer>
22//                      --filter-steps=<integer>
23//                      --filter-verbose
24//
25//  --filter-start:   The filter multiplier to begin with. This represents the
26//                    number of bits per prefix of memory to use in the filter.
27//                    The default value is identical to the current SafeBrowsing
28//                    database value.
29//  --filter-steps:   The number of iterations to run, with each iteration
30//                    increasing the filter multiplier by 1. The default value
31//                    is 1.
32//  --filter-verbose: Used to print out the hit / miss results per URL.
33//  --filter-csv:     The URL file contains information about the number of
34//                    unique views (the popularity) of each URL. See the format
35//                    description below.
36//
37//
38// Hash compute time usage:
39//   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime
40//                      --filter-num-checks=<integer>
41//
42//  --filter-num-checks: The number of hash look ups to perform on the bloom
43//                       filter. The default is 10 million.
44//
45// Data files:
46//    chrome/test/data/safe_browsing/filter/database
47//    chrome/test/data/safe_browsing/filter/urls
48//
49// database: A normal SafeBrowsing database.
50// urls:     A text file containing a list of URLs, one per line. If the option
51//           --filter-csv is specified, the format of each line in the file is
52//           <url>,<weight> where weight is an integer indicating the number of
53//           unique views for the URL.
54
55#include <fstream>
56#include <vector>
57
58#include "base/command_line.h"
59#include "base/file_path.h"
60#include "base/file_util.h"
61#include "base/logging.h"
62#include "base/path_service.h"
63#include "base/rand_util.h"
64#include "base/scoped_ptr.h"
65#include "base/sha2.h"
66#include "base/string_number_conversions.h"
67#include "base/string_util.h"
68#include "base/time.h"
69#include "chrome/browser/safe_browsing/bloom_filter.h"
70#include "chrome/browser/safe_browsing/safe_browsing_util.h"
71#include "chrome/common/chrome_paths.h"
72#include "chrome/common/sqlite_compiled_statement.h"
73#include "chrome/common/sqlite_utils.h"
74#include "googleurl/src/gurl.h"
75#include "testing/gtest/include/gtest/gtest.h"
76
77using base::Time;
78using base::TimeDelta;
79
80namespace {
81
82// Ensures the SafeBrowsing database is closed properly.
83class ScopedPerfDatabase {
84 public:
85  explicit ScopedPerfDatabase(sqlite3* db) : db_(db) {}
86  ~ScopedPerfDatabase() {
87    sqlite3_close(db_);
88  }
89
90 private:
91  sqlite3* db_;
92
93  DISALLOW_COPY_AND_ASSIGN(ScopedPerfDatabase);
94};
95
96// Command line flags.
97const char kFilterVerbose[] = "filter-verbose";
98const char kFilterStart[] = "filter-start";
99const char kFilterSteps[] = "filter-steps";
100const char kFilterCsv[] = "filter-csv";
101const char kFilterNumChecks[] = "filter-num-checks";
102
103// Number of hash checks to make during performance testing.
104static const int kNumHashChecks = 10000000;
105
106// Returns the path to the data used in this test, relative to the top of the
107// source directory.
108FilePath GetFullDataPath() {
109  FilePath full_path;
110  CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path));
111  full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing"));
112  full_path = full_path.Append(FILE_PATH_LITERAL("filter"));
113  CHECK(file_util::PathExists(full_path));
114  return full_path;
115}
116
117// Constructs a bloom filter of the appropriate size from the provided prefixes.
118void BuildBloomFilter(int size_multiplier,
119                      const std::vector<SBPrefix>& prefixes,
120                      BloomFilter** bloom_filter) {
121  // Create a BloomFilter with the specified size.
122  const int key_count = std::max(static_cast<int>(prefixes.size()),
123                                 BloomFilter::kBloomFilterMinSize);
124  const int filter_size = key_count * size_multiplier;
125  *bloom_filter = new BloomFilter(filter_size);
126
127  // Add the prefixes to it.
128  for (size_t i = 0; i < prefixes.size(); ++i)
129    (*bloom_filter)->Insert(prefixes[i]);
130
131  std::cout << "Bloom filter with prefixes: " << prefixes.size()
132            << ", multiplier: " << size_multiplier
133            << ", size (bytes): " << (*bloom_filter)->size()
134            << std::endl;
135}
136
137// Reads the set of add prefixes contained in a SafeBrowsing database into a
138// sorted array suitable for fast searching. This takes significantly less time
139// to look up a given prefix than performing SQL queries.
140bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) {
141  FilePath database_file = path.Append(FILE_PATH_LITERAL("database"));
142  sqlite3* db = NULL;
143  if (sqlite_utils::OpenSqliteDb(database_file, &db) != SQLITE_OK) {
144    sqlite3_close(db);
145    return false;
146  }
147
148  ScopedPerfDatabase database(db);
149  scoped_ptr<SqliteStatementCache> sql_cache(new SqliteStatementCache(db));
150
151  // Get the number of items in the add_prefix table.
152  std::string sql = "SELECT COUNT(*) FROM add_prefix";
153  SQLITE_UNIQUE_STATEMENT(count_statement, *sql_cache, sql.c_str());
154  if (!count_statement.is_valid())
155    return false;
156
157  if (count_statement->step() != SQLITE_ROW)
158    return false;
159
160  const int count = count_statement->column_int(0);
161
162  // Load them into a prefix vector and sort
163  prefixes->reserve(count);
164  sql = "SELECT prefix FROM add_prefix";
165  SQLITE_UNIQUE_STATEMENT(prefix_statement, *sql_cache, sql.c_str());
166  if (!prefix_statement.is_valid())
167    return false;
168
169  while (prefix_statement->step() == SQLITE_ROW)
170    prefixes->push_back(prefix_statement->column_int(0));
171
172  DCHECK(static_cast<int>(prefixes->size()) == count);
173  sort(prefixes->begin(), prefixes->end());
174
175  return true;
176}
177
178// Generates all legal SafeBrowsing prefixes for the specified URL, and returns
179// the set of Prefixes that exist in the bloom filter. The function returns the
180// number of host + path combinations checked.
181int GeneratePrefixHits(const std::string url,
182                       BloomFilter* bloom_filter,
183                       std::vector<SBPrefix>* prefixes) {
184  GURL url_check(url);
185  std::vector<std::string> hosts;
186  if (url_check.HostIsIPAddress()) {
187    hosts.push_back(url_check.host());
188  } else {
189    safe_browsing_util::GenerateHostsToCheck(url_check, &hosts);
190  }
191
192  std::vector<std::string> paths;
193  safe_browsing_util::GeneratePathsToCheck(url_check, &paths);
194
195  for (size_t i = 0; i < hosts.size(); ++i) {
196    for (size_t j = 0; j < paths.size(); ++j) {
197      SBPrefix prefix;
198      base::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix));
199      if (bloom_filter->Exists(prefix))
200        prefixes->push_back(prefix);
201    }
202  }
203
204  return hosts.size() * paths.size();
205}
206
207// Binary search of sorted prefixes.
208bool IsPrefixInDatabase(SBPrefix prefix,
209                        const std::vector<SBPrefix>& prefixes) {
210  if (prefixes.empty())
211    return false;
212
213  int low = 0;
214  int high = prefixes.size() - 1;
215  while (low <= high) {
216    int mid = ((unsigned int)low + (unsigned int)high) >> 1;
217    SBPrefix prefix_mid = prefixes[mid];
218    if (prefix_mid == prefix)
219      return true;
220    if (prefix_mid < prefix)
221      low = mid + 1;
222    else
223      high = mid - 1;
224  }
225
226  return false;
227}
228
229// Construct a bloom filter with the given prefixes and multiplier, and test the
230// false positive rate (misses) against a URL list.
231void CalculateBloomFilterFalsePositives(
232    int size_multiplier,
233    const FilePath& data_dir,
234    const std::vector<SBPrefix>& prefix_list) {
235  BloomFilter* bloom_filter = NULL;
236  BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter);
237  scoped_refptr<BloomFilter> scoped_filter(bloom_filter);
238
239  // Read in data file line at a time.
240  FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls"));
241  std::ifstream url_stream(WideToASCII(url_file.ToWStringHack()).c_str());
242
243  // Keep track of stats
244  int hits = 0;
245  int misses = 0;
246  int weighted_hits = 0;
247  int weighted_misses = 0;
248  int url_count = 0;
249  int prefix_count = 0;
250
251  // Print out volumes of data (per URL hit and miss information).
252  bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose);
253  bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv);
254
255  std::string url;
256  while (std::getline(url_stream, url)) {
257    ++url_count;
258
259    // Handle a format that contains URLs weighted by unique views.
260    int weight = 1;
261    if (use_weights) {
262      std::string::size_type pos = url.find_last_of(",");
263      if (pos != std::string::npos) {
264        base::StringToInt(url.begin() + pos + 1, url.end(), &weight);
265        url = url.substr(0, pos);
266      }
267    }
268
269    // See if the URL is in the bloom filter.
270    std::vector<SBPrefix> prefixes;
271    prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes);
272
273    // See if the prefix is actually in the database (in-memory prefix list).
274    for (size_t i = 0; i < prefixes.size(); ++i) {
275      if (IsPrefixInDatabase(prefixes[i], prefix_list)) {
276        ++hits;
277        weighted_hits += weight;
278        if (verbose) {
279          std::cout << "Hit for URL: " << url
280                    << " (prefix = "   << prefixes[i] << ")"
281                    << std::endl;
282        }
283      } else {
284        ++misses;
285        weighted_misses += weight;
286        if (verbose) {
287          std::cout << "Miss for URL: " << url
288                    << " (prefix = "    << prefixes[i] << ")"
289                    << std::endl;
290        }
291      }
292    }
293  }
294
295  // Print out the results for this test.
296  std::cout << "URLs checked: "      << url_count
297            << ", prefix compares: " << prefix_count
298            << ", hits: "            << hits
299            << ", misses: "          << misses;
300  if (use_weights) {
301    std::cout << ", weighted hits: "   << weighted_hits
302              << ", weighted misses: " << weighted_misses;
303  }
304  std::cout << std::endl;
305}
306
307}  // namespace
308
309// This test can take several minutes to perform its calculations, so it should
310// be disabled until you need to run it.
311TEST(SafeBrowsingBloomFilter, FalsePositives) {
312  std::vector<SBPrefix> prefix_list;
313  FilePath data_dir = GetFullDataPath();
314  ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
315
316  const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
317
318  int start = BloomFilter::kBloomFilterSizeRatio;
319  if (cmd_line.HasSwitch(kFilterStart)) {
320    ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart),
321                                  &start));
322  }
323
324  int steps = 1;
325  if (cmd_line.HasSwitch(kFilterSteps)) {
326    ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps),
327                                  &steps));
328  }
329
330  int stop = start + steps;
331
332  for (int multiplier = start; multiplier < stop; ++multiplier)
333    CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list);
334}
335
336// Computes the time required for performing a number of look ups in a bloom
337// filter. This is useful for measuring the performance of new hash functions.
338TEST(SafeBrowsingBloomFilter, HashTime) {
339  // Read the data from the database.
340  std::vector<SBPrefix> prefix_list;
341  FilePath data_dir = GetFullDataPath();
342  ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
343
344  const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
345
346  int num_checks = kNumHashChecks;
347  if (cmd_line.HasSwitch(kFilterNumChecks)) {
348    ASSERT_TRUE(
349        base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks),
350                          &num_checks));
351  }
352
353  // Populate the bloom filter and measure the time.
354  BloomFilter* bloom_filter = NULL;
355  Time populate_before = Time::Now();
356  BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio,
357                   prefix_list, &bloom_filter);
358  TimeDelta populate = Time::Now() - populate_before;
359
360  // Check a large number of random prefixes against the filter.
361  int hits = 0;
362  Time check_before = Time::Now();
363  for (int i = 0; i < num_checks; ++i) {
364    uint32 prefix = static_cast<uint32>(base::RandUint64());
365    if (bloom_filter->Exists(prefix))
366      ++hits;
367  }
368  TimeDelta check = Time::Now() - check_before;
369
370  int64 time_per_insert = populate.InMicroseconds() /
371                          static_cast<int>(prefix_list.size());
372  int64 time_per_check = check.InMicroseconds() / num_checks;
373
374  std::cout << "Time results for checks: " << num_checks
375            << ", prefixes: "              << prefix_list.size()
376            << ", populate time (ms): "    << populate.InMilliseconds()
377            << ", check time (ms): "       << check.InMilliseconds()
378            << ", hits: "                  << hits
379            << ", per-populate (us): "     << time_per_insert
380            << ", per-check (us): "        << time_per_check
381            << std::endl;
382}
383
384