1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// A simple bloom filter. It uses a large number (20) of hashes to reduce the
6// possibility of false positives. The bloom filter's hashing uses random keys
7// in order to minimize the chance that a false positive for one user is a false
8// positive for all.
9//
10// The bloom filter manages it serialization to disk with the following file
11// format:
12//         4 byte version number
13//         4 byte number of hash keys (n)
14//     n * 8 bytes of hash keys
15// Remaining bytes are the filter data.
16
17#ifndef CHROME_BROWSER_SAFE_BROWSING_BLOOM_FILTER_H_
18#define CHROME_BROWSER_SAFE_BROWSING_BLOOM_FILTER_H_
19#pragma once
20
21#include <vector>
22
23#include "base/gtest_prod_util.h"
24#include "base/memory/ref_counted.h"
25#include "base/memory/scoped_ptr.h"
26#include "chrome/browser/safe_browsing/safe_browsing_util.h"
27
28class FilePath;
29
30class BloomFilter : public base::RefCountedThreadSafe<BloomFilter> {
31 public:
32  typedef uint64 HashKey;
33  typedef std::vector<HashKey> HashKeys;
34
35  // Constructs an empty filter with the given size.
36  explicit BloomFilter(int bit_size);
37
38  // Constructs a filter from serialized data. This object owns the memory and
39  // will delete it on destruction.
40  BloomFilter(char* data, int size, const HashKeys& keys);
41
42  void Insert(SBPrefix hash);
43  bool Exists(SBPrefix hash) const;
44
45  const char* data() const { return data_.get(); }
46  int size() const { return byte_size_; }
47
48  // Loading and storing the filter from / to disk.
49  static BloomFilter* LoadFile(const FilePath& filter_name);
50  bool WriteFile(const FilePath& filter_name) const;
51
52  // How many bits to use per item. See the design doc for more information.
53  static const int kBloomFilterSizeRatio = 25;
54
55  // Force a minimum size on the bloom filter to prevent a high false positive
56  // hash request rate (in bytes).
57  static const int kBloomFilterMinSize = 250000;
58
59  // Force a maximum size on the bloom filter to avoid using too much memory
60  // (in bytes).
61  static const int kBloomFilterMaxSize = 3 * 1024 * 1024;
62
63  // Use the above constants to calculate an appropriate size to pass
64  // to the BloomFilter constructor based on the intended |key_count|.
65  // TODO(shess): This is very clunky.  It would be cleaner to have
66  // the constructor manage this, but at this time the unit and perf
67  // tests wish to make their own calculations.
68  static int FilterSizeForKeyCount(int key_count);
69
70 private:
71  friend class base::RefCountedThreadSafe<BloomFilter>;
72  FRIEND_TEST_ALL_PREFIXES(SafeBrowsingBloomFilter, BloomFilterUse);
73  FRIEND_TEST_ALL_PREFIXES(SafeBrowsingBloomFilter, BloomFilterFile);
74
75  static const int kNumHashKeys = 20;
76  static const int kFileVersion = 1;
77
78  // Enumerate failures for histogramming purposes.  DO NOT CHANGE THE
79  // ORDERING OF THESE VALUES.
80  enum FailureType {
81    FAILURE_FILTER_READ_OPEN,
82    FAILURE_FILTER_READ_VERSION,
83    FAILURE_FILTER_READ_NUM_KEYS,
84    FAILURE_FILTER_READ_KEY,
85    FAILURE_FILTER_READ_DATA_MINSIZE,
86    FAILURE_FILTER_READ_DATA_MAXSIZE,
87    FAILURE_FILTER_READ_DATA_SHORT,
88    FAILURE_FILTER_READ_DATA,
89
90    // Memory space for histograms is determined by the max.  ALWAYS
91    // ADD NEW VALUES BEFORE THIS ONE.
92    FAILURE_FILTER_MAX
93  };
94
95  static void RecordFailure(FailureType failure_type);
96
97  ~BloomFilter();
98
99  int byte_size_;  // size in bytes
100  int bit_size_;   // size in bits
101  scoped_array<char> data_;
102
103  // Random keys used for hashing.
104  HashKeys hash_keys_;
105
106  DISALLOW_COPY_AND_ASSIGN(BloomFilter);
107};
108
109#endif  // CHROME_BROWSER_SAFE_BROWSING_BLOOM_FILTER_H_
110