filter_false_positive_perftest.cc revision 513209b27ff55e2841eac0e4120199c23acce758
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// This test performs a series false positive checks using a list of URLs 6// against a known set of SafeBrowsing data. 7// 8// It uses a normal SafeBrowsing database to create a bloom filter where it 9// looks up all the URLs in the url file. A URL that has a prefix found in the 10// bloom filter and found in the database is considered a hit: a valid lookup 11// that will result in a gethash request. A URL that has a prefix found in the 12// bloom filter but not in the database is a miss: a false positive lookup that 13// will result in an unnecessary gethash request. 14// 15// By varying the size of the bloom filter and using a constant set of 16// SafeBrowsing data, we can check a known set of URLs against the filter and 17// determine the false positive rate. 18// 19// False positive calculation usage: 20// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives 21// --filter-start=<integer> 22// --filter-steps=<integer> 23// --filter-verbose 24// 25// --filter-start: The filter multiplier to begin with. This represents the 26// number of bits per prefix of memory to use in the filter. 27// The default value is identical to the current SafeBrowsing 28// database value. 29// --filter-steps: The number of iterations to run, with each iteration 30// increasing the filter multiplier by 1. The default value 31// is 1. 32// --filter-verbose: Used to print out the hit / miss results per URL. 33// --filter-csv: The URL file contains information about the number of 34// unique views (the popularity) of each URL. See the format 35// description below. 36// 37// 38// Hash compute time usage: 39// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime 40// --filter-num-checks=<integer> 41// 42// --filter-num-checks: The number of hash look ups to perform on the bloom 43// filter. The default is 10 million. 44// 45// Data files: 46// chrome/test/data/safe_browsing/filter/database 47// chrome/test/data/safe_browsing/filter/urls 48// 49// database: A normal SafeBrowsing database. 50// urls: A text file containing a list of URLs, one per line. If the option 51// --filter-csv is specified, the format of each line in the file is 52// <url>,<weight> where weight is an integer indicating the number of 53// unique views for the URL. 54 55#include <fstream> 56#include <vector> 57 58#include "base/command_line.h" 59#include "base/file_path.h" 60#include "base/file_util.h" 61#include "base/logging.h" 62#include "base/path_service.h" 63#include "base/rand_util.h" 64#include "base/scoped_ptr.h" 65#include "base/sha2.h" 66#include "base/string_number_conversions.h" 67#include "base/string_util.h" 68#include "base/time.h" 69#include "chrome/browser/safe_browsing/bloom_filter.h" 70#include "chrome/browser/safe_browsing/safe_browsing_util.h" 71#include "chrome/common/chrome_paths.h" 72#include "chrome/common/sqlite_compiled_statement.h" 73#include "chrome/common/sqlite_utils.h" 74#include "googleurl/src/gurl.h" 75#include "testing/gtest/include/gtest/gtest.h" 76 77using base::Time; 78using base::TimeDelta; 79 80namespace { 81 82// Ensures the SafeBrowsing database is closed properly. 83class ScopedPerfDatabase { 84 public: 85 explicit ScopedPerfDatabase(sqlite3* db) : db_(db) {} 86 ~ScopedPerfDatabase() { 87 sqlite3_close(db_); 88 } 89 90 private: 91 sqlite3* db_; 92 93 DISALLOW_COPY_AND_ASSIGN(ScopedPerfDatabase); 94}; 95 96// Command line flags. 97const char kFilterVerbose[] = "filter-verbose"; 98const char kFilterStart[] = "filter-start"; 99const char kFilterSteps[] = "filter-steps"; 100const char kFilterCsv[] = "filter-csv"; 101const char kFilterNumChecks[] = "filter-num-checks"; 102 103// Number of hash checks to make during performance testing. 104static const int kNumHashChecks = 10000000; 105 106// Returns the path to the data used in this test, relative to the top of the 107// source directory. 108FilePath GetFullDataPath() { 109 FilePath full_path; 110 CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path)); 111 full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing")); 112 full_path = full_path.Append(FILE_PATH_LITERAL("filter")); 113 CHECK(file_util::PathExists(full_path)); 114 return full_path; 115} 116 117// Constructs a bloom filter of the appropriate size from the provided prefixes. 118void BuildBloomFilter(int size_multiplier, 119 const std::vector<SBPrefix>& prefixes, 120 BloomFilter** bloom_filter) { 121 // Create a BloomFilter with the specified size. 122 const int key_count = std::max(static_cast<int>(prefixes.size()), 123 BloomFilter::kBloomFilterMinSize); 124 const int filter_size = key_count * size_multiplier; 125 *bloom_filter = new BloomFilter(filter_size); 126 127 // Add the prefixes to it. 128 for (size_t i = 0; i < prefixes.size(); ++i) 129 (*bloom_filter)->Insert(prefixes[i]); 130 131 std::cout << "Bloom filter with prefixes: " << prefixes.size() 132 << ", multiplier: " << size_multiplier 133 << ", size (bytes): " << (*bloom_filter)->size() 134 << std::endl; 135} 136 137// Reads the set of add prefixes contained in a SafeBrowsing database into a 138// sorted array suitable for fast searching. This takes significantly less time 139// to look up a given prefix than performing SQL queries. 140bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) { 141 FilePath database_file = path.Append(FILE_PATH_LITERAL("database")); 142 sqlite3* db = NULL; 143 if (sqlite_utils::OpenSqliteDb(database_file, &db) != SQLITE_OK) { 144 sqlite3_close(db); 145 return false; 146 } 147 148 ScopedPerfDatabase database(db); 149 scoped_ptr<SqliteStatementCache> sql_cache(new SqliteStatementCache(db)); 150 151 // Get the number of items in the add_prefix table. 152 std::string sql = "SELECT COUNT(*) FROM add_prefix"; 153 SQLITE_UNIQUE_STATEMENT(count_statement, *sql_cache, sql.c_str()); 154 if (!count_statement.is_valid()) 155 return false; 156 157 if (count_statement->step() != SQLITE_ROW) 158 return false; 159 160 const int count = count_statement->column_int(0); 161 162 // Load them into a prefix vector and sort 163 prefixes->reserve(count); 164 sql = "SELECT prefix FROM add_prefix"; 165 SQLITE_UNIQUE_STATEMENT(prefix_statement, *sql_cache, sql.c_str()); 166 if (!prefix_statement.is_valid()) 167 return false; 168 169 while (prefix_statement->step() == SQLITE_ROW) 170 prefixes->push_back(prefix_statement->column_int(0)); 171 172 DCHECK(static_cast<int>(prefixes->size()) == count); 173 sort(prefixes->begin(), prefixes->end()); 174 175 return true; 176} 177 178// Generates all legal SafeBrowsing prefixes for the specified URL, and returns 179// the set of Prefixes that exist in the bloom filter. The function returns the 180// number of host + path combinations checked. 181int GeneratePrefixHits(const std::string url, 182 BloomFilter* bloom_filter, 183 std::vector<SBPrefix>* prefixes) { 184 GURL url_check(url); 185 std::vector<std::string> hosts; 186 if (url_check.HostIsIPAddress()) { 187 hosts.push_back(url_check.host()); 188 } else { 189 safe_browsing_util::GenerateHostsToCheck(url_check, &hosts); 190 } 191 192 std::vector<std::string> paths; 193 safe_browsing_util::GeneratePathsToCheck(url_check, &paths); 194 195 for (size_t i = 0; i < hosts.size(); ++i) { 196 for (size_t j = 0; j < paths.size(); ++j) { 197 SBPrefix prefix; 198 base::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix)); 199 if (bloom_filter->Exists(prefix)) 200 prefixes->push_back(prefix); 201 } 202 } 203 204 return hosts.size() * paths.size(); 205} 206 207// Binary search of sorted prefixes. 208bool IsPrefixInDatabase(SBPrefix prefix, 209 const std::vector<SBPrefix>& prefixes) { 210 if (prefixes.empty()) 211 return false; 212 213 int low = 0; 214 int high = prefixes.size() - 1; 215 while (low <= high) { 216 int mid = ((unsigned int)low + (unsigned int)high) >> 1; 217 SBPrefix prefix_mid = prefixes[mid]; 218 if (prefix_mid == prefix) 219 return true; 220 if (prefix_mid < prefix) 221 low = mid + 1; 222 else 223 high = mid - 1; 224 } 225 226 return false; 227} 228 229// Construct a bloom filter with the given prefixes and multiplier, and test the 230// false positive rate (misses) against a URL list. 231void CalculateBloomFilterFalsePositives( 232 int size_multiplier, 233 const FilePath& data_dir, 234 const std::vector<SBPrefix>& prefix_list) { 235 BloomFilter* bloom_filter = NULL; 236 BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter); 237 scoped_refptr<BloomFilter> scoped_filter(bloom_filter); 238 239 // Read in data file line at a time. 240 FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls")); 241 std::ifstream url_stream(WideToASCII(url_file.ToWStringHack()).c_str()); 242 243 // Keep track of stats 244 int hits = 0; 245 int misses = 0; 246 int weighted_hits = 0; 247 int weighted_misses = 0; 248 int url_count = 0; 249 int prefix_count = 0; 250 251 // Print out volumes of data (per URL hit and miss information). 252 bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose); 253 bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv); 254 255 std::string url; 256 while (std::getline(url_stream, url)) { 257 ++url_count; 258 259 // Handle a format that contains URLs weighted by unique views. 260 int weight = 1; 261 if (use_weights) { 262 std::string::size_type pos = url.find_last_of(","); 263 if (pos != std::string::npos) { 264 base::StringToInt(url.begin() + pos + 1, url.end(), &weight); 265 url = url.substr(0, pos); 266 } 267 } 268 269 // See if the URL is in the bloom filter. 270 std::vector<SBPrefix> prefixes; 271 prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes); 272 273 // See if the prefix is actually in the database (in-memory prefix list). 274 for (size_t i = 0; i < prefixes.size(); ++i) { 275 if (IsPrefixInDatabase(prefixes[i], prefix_list)) { 276 ++hits; 277 weighted_hits += weight; 278 if (verbose) { 279 std::cout << "Hit for URL: " << url 280 << " (prefix = " << prefixes[i] << ")" 281 << std::endl; 282 } 283 } else { 284 ++misses; 285 weighted_misses += weight; 286 if (verbose) { 287 std::cout << "Miss for URL: " << url 288 << " (prefix = " << prefixes[i] << ")" 289 << std::endl; 290 } 291 } 292 } 293 } 294 295 // Print out the results for this test. 296 std::cout << "URLs checked: " << url_count 297 << ", prefix compares: " << prefix_count 298 << ", hits: " << hits 299 << ", misses: " << misses; 300 if (use_weights) { 301 std::cout << ", weighted hits: " << weighted_hits 302 << ", weighted misses: " << weighted_misses; 303 } 304 std::cout << std::endl; 305} 306 307} // namespace 308 309// This test can take several minutes to perform its calculations, so it should 310// be disabled until you need to run it. 311TEST(SafeBrowsingBloomFilter, FalsePositives) { 312 std::vector<SBPrefix> prefix_list; 313 FilePath data_dir = GetFullDataPath(); 314 ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list)); 315 316 const CommandLine& cmd_line = *CommandLine::ForCurrentProcess(); 317 318 int start = BloomFilter::kBloomFilterSizeRatio; 319 if (cmd_line.HasSwitch(kFilterStart)) { 320 ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart), 321 &start)); 322 } 323 324 int steps = 1; 325 if (cmd_line.HasSwitch(kFilterSteps)) { 326 ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps), 327 &steps)); 328 } 329 330 int stop = start + steps; 331 332 for (int multiplier = start; multiplier < stop; ++multiplier) 333 CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list); 334} 335 336// Computes the time required for performing a number of look ups in a bloom 337// filter. This is useful for measuring the performance of new hash functions. 338TEST(SafeBrowsingBloomFilter, HashTime) { 339 // Read the data from the database. 340 std::vector<SBPrefix> prefix_list; 341 FilePath data_dir = GetFullDataPath(); 342 ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list)); 343 344 const CommandLine& cmd_line = *CommandLine::ForCurrentProcess(); 345 346 int num_checks = kNumHashChecks; 347 if (cmd_line.HasSwitch(kFilterNumChecks)) { 348 ASSERT_TRUE( 349 base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks), 350 &num_checks)); 351 } 352 353 // Populate the bloom filter and measure the time. 354 BloomFilter* bloom_filter = NULL; 355 Time populate_before = Time::Now(); 356 BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio, 357 prefix_list, &bloom_filter); 358 TimeDelta populate = Time::Now() - populate_before; 359 360 // Check a large number of random prefixes against the filter. 361 int hits = 0; 362 Time check_before = Time::Now(); 363 for (int i = 0; i < num_checks; ++i) { 364 uint32 prefix = static_cast<uint32>(base::RandUint64()); 365 if (bloom_filter->Exists(prefix)) 366 ++hits; 367 } 368 TimeDelta check = Time::Now() - check_before; 369 370 int64 time_per_insert = populate.InMicroseconds() / 371 static_cast<int>(prefix_list.size()); 372 int64 time_per_check = check.InMicroseconds() / num_checks; 373 374 std::cout << "Time results for checks: " << num_checks 375 << ", prefixes: " << prefix_list.size() 376 << ", populate time (ms): " << populate.InMilliseconds() 377 << ", check time (ms): " << check.InMilliseconds() 378 << ", hits: " << hits 379 << ", per-populate (us): " << time_per_insert 380 << ", per-check (us): " << time_per_check 381 << std::endl; 382} 383 384