1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// This test performs a series false positive checks using a list of URLs
6// against a known set of SafeBrowsing data.
7//
8// It uses a normal SafeBrowsing database to create a bloom filter where it
9// looks up all the URLs in the url file. A URL that has a prefix found in the
10// bloom filter and found in the database is considered a hit: a valid lookup
11// that will result in a gethash request. A URL that has a prefix found in the
12// bloom filter but not in the database is a miss: a false positive lookup that
13// will result in an unnecessary gethash request.
14//
15// By varying the size of the bloom filter and using a constant set of
16// SafeBrowsing data, we can check a known set of URLs against the filter and
17// determine the false positive rate.
18//
19// False positive calculation usage:
20//   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives
21//                      --filter-start=<integer>
22//                      --filter-steps=<integer>
23//                      --filter-verbose
24//
25//  --filter-start:   The filter multiplier to begin with. This represents the
26//                    number of bits per prefix of memory to use in the filter.
27//                    The default value is identical to the current SafeBrowsing
28//                    database value.
29//  --filter-steps:   The number of iterations to run, with each iteration
30//                    increasing the filter multiplier by 1. The default value
31//                    is 1.
32//  --filter-verbose: Used to print out the hit / miss results per URL.
33//  --filter-csv:     The URL file contains information about the number of
34//                    unique views (the popularity) of each URL. See the format
35//                    description below.
36//
37//
38// Hash compute time usage:
39//   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime
40//                      --filter-num-checks=<integer>
41//
42//  --filter-num-checks: The number of hash look ups to perform on the bloom
43//                       filter. The default is 10 million.
44//
45// Data files:
46//    chrome/test/data/safe_browsing/filter/database
47//    chrome/test/data/safe_browsing/filter/urls
48//
49// database: A normal SafeBrowsing database.
50// urls:     A text file containing a list of URLs, one per line. If the option
51//           --filter-csv is specified, the format of each line in the file is
52//           <url>,<weight> where weight is an integer indicating the number of
53//           unique views for the URL.
54
55#include <algorithm>
56#include <fstream>
57#include <vector>
58
59#include "app/sql/statement.h"
60#include "base/command_line.h"
61#include "base/file_path.h"
62#include "base/file_util.h"
63#include "base/logging.h"
64#include "base/memory/scoped_ptr.h"
65#include "base/path_service.h"
66#include "base/rand_util.h"
67#include "base/string_number_conversions.h"
68#include "base/string_util.h"
69#include "base/time.h"
70#include "crypto/sha2.h"
71#include "chrome/browser/safe_browsing/bloom_filter.h"
72#include "chrome/browser/safe_browsing/safe_browsing_util.h"
73#include "chrome/common/chrome_paths.h"
74#include "googleurl/src/gurl.h"
75#include "testing/gtest/include/gtest/gtest.h"
76
77using base::Time;
78using base::TimeDelta;
79
80namespace {
81
82// Command line flags.
83const char kFilterVerbose[] = "filter-verbose";
84const char kFilterStart[] = "filter-start";
85const char kFilterSteps[] = "filter-steps";
86const char kFilterCsv[] = "filter-csv";
87const char kFilterNumChecks[] = "filter-num-checks";
88
89// Number of hash checks to make during performance testing.
90const int kNumHashChecks = 10000000;
91
92// Returns the path to the data used in this test, relative to the top of the
93// source directory.
94FilePath GetFullDataPath() {
95  FilePath full_path;
96  CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path));
97  full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing"));
98  full_path = full_path.Append(FILE_PATH_LITERAL("filter"));
99  CHECK(file_util::PathExists(full_path));
100  return full_path;
101}
102
103// Constructs a bloom filter of the appropriate size from the provided prefixes.
104void BuildBloomFilter(int size_multiplier,
105                      const std::vector<SBPrefix>& prefixes,
106                      BloomFilter** bloom_filter) {
107  // Create a BloomFilter with the specified size.
108  const int key_count = std::max(static_cast<int>(prefixes.size()),
109                                 BloomFilter::kBloomFilterMinSize);
110  const int filter_size = key_count * size_multiplier;
111  *bloom_filter = new BloomFilter(filter_size);
112
113  // Add the prefixes to it.
114  for (size_t i = 0; i < prefixes.size(); ++i)
115    (*bloom_filter)->Insert(prefixes[i]);
116
117  std::cout << "Bloom filter with prefixes: " << prefixes.size()
118            << ", multiplier: " << size_multiplier
119            << ", size (bytes): " << (*bloom_filter)->size()
120            << std::endl;
121}
122
123// Reads the set of add prefixes contained in a SafeBrowsing database into a
124// sorted array suitable for fast searching. This takes significantly less time
125// to look up a given prefix than performing SQL queries.
126bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) {
127  FilePath database_file = path.Append(FILE_PATH_LITERAL("database"));
128  sql::Connection db;
129  if (!db.Open(database_file))
130    return false;
131
132  // Get the number of items in the add_prefix table.
133  const char* query = "SELECT COUNT(*) FROM add_prefix";
134  sql::Statement count_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
135  if (!count_statement)
136    return false;
137
138  if (!count_statement.Step())
139    return false;
140
141  const int count = count_statement.ColumnInt(0);
142
143  // Load them into a prefix vector and sort.
144  prefixes->reserve(count);
145  query = "SELECT prefix FROM add_prefix";
146  sql::Statement prefix_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
147  if (!prefix_statement)
148    return false;
149
150  while (prefix_statement.Step())
151    prefixes->push_back(prefix_statement.ColumnInt(0));
152
153  DCHECK(static_cast<int>(prefixes->size()) == count);
154  sort(prefixes->begin(), prefixes->end());
155
156  return true;
157}
158
159// Generates all legal SafeBrowsing prefixes for the specified URL, and returns
160// the set of Prefixes that exist in the bloom filter. The function returns the
161// number of host + path combinations checked.
162int GeneratePrefixHits(const std::string url,
163                       BloomFilter* bloom_filter,
164                       std::vector<SBPrefix>* prefixes) {
165  GURL url_check(url);
166  std::vector<std::string> hosts;
167  if (url_check.HostIsIPAddress()) {
168    hosts.push_back(url_check.host());
169  } else {
170    safe_browsing_util::GenerateHostsToCheck(url_check, &hosts);
171  }
172
173  std::vector<std::string> paths;
174  safe_browsing_util::GeneratePathsToCheck(url_check, &paths);
175
176  for (size_t i = 0; i < hosts.size(); ++i) {
177    for (size_t j = 0; j < paths.size(); ++j) {
178      SBPrefix prefix;
179      crypto::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix));
180      if (bloom_filter->Exists(prefix))
181        prefixes->push_back(prefix);
182    }
183  }
184
185  return hosts.size() * paths.size();
186}
187
188// Binary search of sorted prefixes.
189bool IsPrefixInDatabase(SBPrefix prefix,
190                        const std::vector<SBPrefix>& prefixes) {
191  if (prefixes.empty())
192    return false;
193
194  int low = 0;
195  int high = prefixes.size() - 1;
196  while (low <= high) {
197    int mid = ((unsigned int)low + (unsigned int)high) >> 1;
198    SBPrefix prefix_mid = prefixes[mid];
199    if (prefix_mid == prefix)
200      return true;
201    if (prefix_mid < prefix)
202      low = mid + 1;
203    else
204      high = mid - 1;
205  }
206
207  return false;
208}
209
210// Construct a bloom filter with the given prefixes and multiplier, and test the
211// false positive rate (misses) against a URL list.
212void CalculateBloomFilterFalsePositives(
213    int size_multiplier,
214    const FilePath& data_dir,
215    const std::vector<SBPrefix>& prefix_list) {
216  BloomFilter* bloom_filter = NULL;
217  BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter);
218  scoped_refptr<BloomFilter> scoped_filter(bloom_filter);
219
220  // Read in data file line at a time.
221  FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls"));
222  std::ifstream url_stream(url_file.value().c_str());
223
224  // Keep track of stats
225  int hits = 0;
226  int misses = 0;
227  int weighted_hits = 0;
228  int weighted_misses = 0;
229  int url_count = 0;
230  int prefix_count = 0;
231
232  // Print out volumes of data (per URL hit and miss information).
233  bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose);
234  bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv);
235
236  std::string url;
237  while (std::getline(url_stream, url)) {
238    ++url_count;
239
240    // Handle a format that contains URLs weighted by unique views.
241    int weight = 1;
242    if (use_weights) {
243      std::string::size_type pos = url.find_last_of(",");
244      if (pos != std::string::npos) {
245        base::StringToInt(url.begin() + pos + 1, url.end(), &weight);
246        url = url.substr(0, pos);
247      }
248    }
249
250    // See if the URL is in the bloom filter.
251    std::vector<SBPrefix> prefixes;
252    prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes);
253
254    // See if the prefix is actually in the database (in-memory prefix list).
255    for (size_t i = 0; i < prefixes.size(); ++i) {
256      if (IsPrefixInDatabase(prefixes[i], prefix_list)) {
257        ++hits;
258        weighted_hits += weight;
259        if (verbose) {
260          std::cout << "Hit for URL: " << url
261                    << " (prefix = "   << prefixes[i] << ")"
262                    << std::endl;
263        }
264      } else {
265        ++misses;
266        weighted_misses += weight;
267        if (verbose) {
268          std::cout << "Miss for URL: " << url
269                    << " (prefix = "    << prefixes[i] << ")"
270                    << std::endl;
271        }
272      }
273    }
274  }
275
276  // Print out the results for this test.
277  std::cout << "URLs checked: "      << url_count
278            << ", prefix compares: " << prefix_count
279            << ", hits: "            << hits
280            << ", misses: "          << misses;
281  if (use_weights) {
282    std::cout << ", weighted hits: "   << weighted_hits
283              << ", weighted misses: " << weighted_misses;
284  }
285  std::cout << std::endl;
286}
287
288}  // namespace
289
290// This test can take several minutes to perform its calculations, so it should
291// be disabled until you need to run it.
292TEST(SafeBrowsingBloomFilter, FalsePositives) {
293  std::vector<SBPrefix> prefix_list;
294  FilePath data_dir = GetFullDataPath();
295  ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
296
297  const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
298
299  int start = BloomFilter::kBloomFilterSizeRatio;
300  if (cmd_line.HasSwitch(kFilterStart)) {
301    ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart),
302                                  &start));
303  }
304
305  int steps = 1;
306  if (cmd_line.HasSwitch(kFilterSteps)) {
307    ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps),
308                                  &steps));
309  }
310
311  int stop = start + steps;
312
313  for (int multiplier = start; multiplier < stop; ++multiplier)
314    CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list);
315}
316
317// Computes the time required for performing a number of look ups in a bloom
318// filter. This is useful for measuring the performance of new hash functions.
319TEST(SafeBrowsingBloomFilter, HashTime) {
320  // Read the data from the database.
321  std::vector<SBPrefix> prefix_list;
322  FilePath data_dir = GetFullDataPath();
323  ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
324
325  const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
326
327  int num_checks = kNumHashChecks;
328  if (cmd_line.HasSwitch(kFilterNumChecks)) {
329    ASSERT_TRUE(
330        base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks),
331                          &num_checks));
332  }
333
334  // Populate the bloom filter and measure the time.
335  BloomFilter* bloom_filter = NULL;
336  Time populate_before = Time::Now();
337  BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio,
338                   prefix_list, &bloom_filter);
339  TimeDelta populate = Time::Now() - populate_before;
340
341  // Check a large number of random prefixes against the filter.
342  int hits = 0;
343  Time check_before = Time::Now();
344  for (int i = 0; i < num_checks; ++i) {
345    uint32 prefix = static_cast<uint32>(base::RandUint64());
346    if (bloom_filter->Exists(prefix))
347      ++hits;
348  }
349  TimeDelta check = Time::Now() - check_before;
350
351  int64 time_per_insert = populate.InMicroseconds() /
352                          static_cast<int>(prefix_list.size());
353  int64 time_per_check = check.InMicroseconds() / num_checks;
354
355  std::cout << "Time results for checks: " << num_checks
356            << ", prefixes: "              << prefix_list.size()
357            << ", populate time (ms): "    << populate.InMilliseconds()
358            << ", check time (ms): "       << check.InMilliseconds()
359            << ", hits: "                  << hits
360            << ", per-populate (us): "     << time_per_insert
361            << ", per-check (us): "        << time_per_check
362            << std::endl;
363}
364