1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file. 4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// This test performs a series false positive checks using a list of URLs 6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// against a known set of SafeBrowsing data. 7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// It uses a normal SafeBrowsing database to create a bloom filter where it 9c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// looks up all the URLs in the url file. A URL that has a prefix found in the 10c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// bloom filter and found in the database is considered a hit: a valid lookup 11c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// that will result in a gethash request. A URL that has a prefix found in the 12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// bloom filter but not in the database is a miss: a false positive lookup that 13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// will result in an unnecessary gethash request. 14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// By varying the size of the bloom filter and using a constant set of 16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// SafeBrowsing data, we can check a known set of URLs against the filter and 17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// determine the false positive rate. 18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// False positive calculation usage: 20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives 21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-start=<integer> 22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-steps=<integer> 23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-verbose 24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-start: The filter multiplier to begin with. This represents the 26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// number of bits per prefix of memory to use in the filter. 27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// The default value is identical to the current SafeBrowsing 28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// database value. 29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-steps: The number of iterations to run, with each iteration 30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// increasing the filter multiplier by 1. The default value 31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// is 1. 32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-verbose: Used to print out the hit / miss results per URL. 33c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-csv: The URL file contains information about the number of 34c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// unique views (the popularity) of each URL. See the format 35c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// description below. 36c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 37c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 38c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Hash compute time usage: 39c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime 40c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-num-checks=<integer> 41c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 42c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-num-checks: The number of hash look ups to perform on the bloom 43c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// filter. The default is 10 million. 44c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 45c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Data files: 46c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// chrome/test/data/safe_browsing/filter/database 47c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// chrome/test/data/safe_browsing/filter/urls 48c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 49c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// database: A normal SafeBrowsing database. 50c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// urls: A text file containing a list of URLs, one per line. If the option 51c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// --filter-csv is specified, the format of each line in the file is 52c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// <url>,<weight> where weight is an integer indicating the number of 53c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// unique views for the URL. 54c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 5572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include <algorithm> 56c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <fstream> 57c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <vector> 58c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 59ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "app/sql/statement.h" 60c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/command_line.h" 61c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/file_path.h" 62c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/file_util.h" 63c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h" 64ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_ptr.h" 65c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/path_service.h" 66c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/rand_util.h" 673345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/string_number_conversions.h" 68c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h" 69c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/time.h" 70ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "crypto/sha2.h" 71c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/safe_browsing/bloom_filter.h" 72c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/safe_browsing/safe_browsing_util.h" 73c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/common/chrome_paths.h" 74c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/gurl.h" 75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "testing/gtest/include/gtest/gtest.h" 76c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochusing base::Time; 78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochusing base::TimeDelta; 79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace { 81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Command line flags. 833345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterVerbose[] = "filter-verbose"; 843345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterStart[] = "filter-start"; 853345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterSteps[] = "filter-steps"; 863345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterCsv[] = "filter-csv"; 873345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickconst char kFilterNumChecks[] = "filter-num-checks"; 88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Number of hash checks to make during performance testing. 90ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenconst int kNumHashChecks = 10000000; 91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns the path to the data used in this test, relative to the top of the 93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// source directory. 94c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochFilePath GetFullDataPath() { 95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath full_path; 96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path)); 97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing")); 98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch full_path = full_path.Append(FILE_PATH_LITERAL("filter")); 99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(file_util::PathExists(full_path)); 100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return full_path; 101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Constructs a bloom filter of the appropriate size from the provided prefixes. 104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid BuildBloomFilter(int size_multiplier, 105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::vector<SBPrefix>& prefixes, 106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BloomFilter** bloom_filter) { 107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Create a BloomFilter with the specified size. 108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const int key_count = std::max(static_cast<int>(prefixes.size()), 109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BloomFilter::kBloomFilterMinSize); 110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const int filter_size = key_count * size_multiplier; 111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *bloom_filter = new BloomFilter(filter_size); 112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Add the prefixes to it. 114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < prefixes.size(); ++i) 115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch (*bloom_filter)->Insert(prefixes[i]); 116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << "Bloom filter with prefixes: " << prefixes.size() 118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", multiplier: " << size_multiplier 119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", size (bytes): " << (*bloom_filter)->size() 120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << std::endl; 121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Reads the set of add prefixes contained in a SafeBrowsing database into a 124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// sorted array suitable for fast searching. This takes significantly less time 125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// to look up a given prefix than performing SQL queries. 126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) { 127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath database_file = path.Append(FILE_PATH_LITERAL("database")); 128ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen sql::Connection db; 129ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen if (!db.Open(database_file)) 130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Get the number of items in the add_prefix table. 133ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen const char* query = "SELECT COUNT(*) FROM add_prefix"; 134ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen sql::Statement count_statement(db.GetCachedStatement(SQL_FROM_HERE, query)); 135ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen if (!count_statement) 136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 138ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen if (!count_statement.Step()) 139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 141ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen const int count = count_statement.ColumnInt(0); 142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 143ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen // Load them into a prefix vector and sort. 144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch prefixes->reserve(count); 145ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen query = "SELECT prefix FROM add_prefix"; 146ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen sql::Statement prefix_statement(db.GetCachedStatement(SQL_FROM_HERE, query)); 147ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen if (!prefix_statement) 148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 150ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen while (prefix_statement.Step()) 151ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen prefixes->push_back(prefix_statement.ColumnInt(0)); 152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(static_cast<int>(prefixes->size()) == count); 154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch sort(prefixes->begin(), prefixes->end()); 155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Generates all legal SafeBrowsing prefixes for the specified URL, and returns 160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// the set of Prefixes that exist in the bloom filter. The function returns the 161c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// number of host + path combinations checked. 162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint GeneratePrefixHits(const std::string url, 163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BloomFilter* bloom_filter, 164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<SBPrefix>* prefixes) { 165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch GURL url_check(url); 166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<std::string> hosts; 167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (url_check.HostIsIPAddress()) { 168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch hosts.push_back(url_check.host()); 169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 170c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch safe_browsing_util::GenerateHostsToCheck(url_check, &hosts); 171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<std::string> paths; 174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch safe_browsing_util::GeneratePathsToCheck(url_check, &paths); 175c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < hosts.size(); ++i) { 177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t j = 0; j < paths.size(); ++j) { 178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch SBPrefix prefix; 179ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen crypto::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix)); 180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (bloom_filter->Exists(prefix)) 181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch prefixes->push_back(prefix); 182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return hosts.size() * paths.size(); 186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Binary search of sorted prefixes. 189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool IsPrefixInDatabase(SBPrefix prefix, 190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::vector<SBPrefix>& prefixes) { 191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (prefixes.empty()) 192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int low = 0; 195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int high = prefixes.size() - 1; 196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while (low <= high) { 197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int mid = ((unsigned int)low + (unsigned int)high) >> 1; 198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch SBPrefix prefix_mid = prefixes[mid]; 199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (prefix_mid == prefix) 200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (prefix_mid < prefix) 202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch low = mid + 1; 203c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch else 204c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch high = mid - 1; 205c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 206c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 207c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 208c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Construct a bloom filter with the given prefixes and multiplier, and test the 211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// false positive rate (misses) against a URL list. 212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CalculateBloomFilterFalsePositives( 213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int size_multiplier, 214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const FilePath& data_dir, 215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::vector<SBPrefix>& prefix_list) { 216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BloomFilter* bloom_filter = NULL; 217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter); 218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch scoped_refptr<BloomFilter> scoped_filter(bloom_filter); 219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Read in data file line at a time. 221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls")); 222dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen std::ifstream url_stream(url_file.value().c_str()); 223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Keep track of stats 225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int hits = 0; 226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int misses = 0; 227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int weighted_hits = 0; 228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int weighted_misses = 0; 229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int url_count = 0; 230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int prefix_count = 0; 231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Print out volumes of data (per URL hit and miss information). 233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose); 234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv); 235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string url; 237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while (std::getline(url_stream, url)) { 238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++url_count; 239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Handle a format that contains URLs weighted by unique views. 241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int weight = 1; 242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (use_weights) { 243c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string::size_type pos = url.find_last_of(","); 244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (pos != std::string::npos) { 245513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch base::StringToInt(url.begin() + pos + 1, url.end(), &weight); 246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url = url.substr(0, pos); 247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 248c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // See if the URL is in the bloom filter. 251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<SBPrefix> prefixes; 252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes); 253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // See if the prefix is actually in the database (in-memory prefix list). 255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < prefixes.size(); ++i) { 256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (IsPrefixInDatabase(prefixes[i], prefix_list)) { 257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++hits; 258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch weighted_hits += weight; 259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (verbose) { 260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << "Hit for URL: " << url 261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << " (prefix = " << prefixes[i] << ")" 262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << std::endl; 263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++misses; 266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch weighted_misses += weight; 267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (verbose) { 268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << "Miss for URL: " << url 269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << " (prefix = " << prefixes[i] << ")" 270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << std::endl; 271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Print out the results for this test. 277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << "URLs checked: " << url_count 278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", prefix compares: " << prefix_count 279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", hits: " << hits 280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", misses: " << misses; 281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (use_weights) { 282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << ", weighted hits: " << weighted_hits 283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", weighted misses: " << weighted_misses; 284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << std::endl; 286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} // namespace 289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// This test can take several minutes to perform its calculations, so it should 291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// be disabled until you need to run it. 292c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochTEST(SafeBrowsingBloomFilter, FalsePositives) { 293c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<SBPrefix> prefix_list; 294c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath data_dir = GetFullDataPath(); 295c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list)); 296c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 2973345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick const CommandLine& cmd_line = *CommandLine::ForCurrentProcess(); 2983345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick 299c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int start = BloomFilter::kBloomFilterSizeRatio; 3003345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick if (cmd_line.HasSwitch(kFilterStart)) { 3013345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart), 3023345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick &start)); 303c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 304c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 305c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int steps = 1; 3063345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick if (cmd_line.HasSwitch(kFilterSteps)) { 3073345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps), 3083345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick &steps)); 309c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 310c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 311c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int stop = start + steps; 312c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 313c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (int multiplier = start; multiplier < stop; ++multiplier) 314c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list); 315c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 316c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 317c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Computes the time required for performing a number of look ups in a bloom 318c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// filter. This is useful for measuring the performance of new hash functions. 319c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochTEST(SafeBrowsingBloomFilter, HashTime) { 320c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Read the data from the database. 321c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<SBPrefix> prefix_list; 322c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath data_dir = GetFullDataPath(); 323c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list)); 324c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 3253345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick const CommandLine& cmd_line = *CommandLine::ForCurrentProcess(); 3263345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick 327c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int num_checks = kNumHashChecks; 3283345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick if (cmd_line.HasSwitch(kFilterNumChecks)) { 3293345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick ASSERT_TRUE( 3303345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks), 3313345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick &num_checks)); 332c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 333c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 334c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Populate the bloom filter and measure the time. 335c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BloomFilter* bloom_filter = NULL; 336c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Time populate_before = Time::Now(); 337c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio, 338c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch prefix_list, &bloom_filter); 339c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TimeDelta populate = Time::Now() - populate_before; 340c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 341c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Check a large number of random prefixes against the filter. 342c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int hits = 0; 343c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Time check_before = Time::Now(); 344c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (int i = 0; i < num_checks; ++i) { 345c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch uint32 prefix = static_cast<uint32>(base::RandUint64()); 346c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (bloom_filter->Exists(prefix)) 347c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++hits; 348c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 349c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TimeDelta check = Time::Now() - check_before; 350c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 351c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int64 time_per_insert = populate.InMicroseconds() / 352c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch static_cast<int>(prefix_list.size()); 353c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int64 time_per_check = check.InMicroseconds() / num_checks; 354c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 355c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::cout << "Time results for checks: " << num_checks 356c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", prefixes: " << prefix_list.size() 357c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", populate time (ms): " << populate.InMilliseconds() 358c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", check time (ms): " << check.InMilliseconds() 359c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", hits: " << hits 360c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", per-populate (us): " << time_per_insert 361c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << ", per-check (us): " << time_per_check 362c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch << std::endl; 363c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 364