1/*
2 * Copyright (C) 2011 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 * in compliance with the License. You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software distributed under the License
10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing permissions and limitations under
12 * the License.
13 */
14
15package com.google.common.hash;
16
17import static com.google.common.base.Preconditions.checkArgument;
18import static com.google.common.base.Preconditions.checkNotNull;
19
20import com.google.common.annotations.Beta;
21import com.google.common.annotations.VisibleForTesting;
22import com.google.common.base.Preconditions;
23import com.google.common.hash.BloomFilterStrategies.BitArray;
24
25import java.io.Serializable;
26
27/**
28 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test
29 * with one-sided error: if it claims that an element is contained in it, this might be in error,
30 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true.
31 *
32 * <p>If you are unfamiliar with Bloom filters, this nice
33 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand
34 * how they work.
35 *
36 * @param <T> the type of instances that the {@code BloomFilter} accepts
37 * @author Kevin Bourrillion
38 * @author Dimitris Andreou
39 * @since 11.0
40 */
41@Beta
42public final class BloomFilter<T> implements Serializable {
43  /**
44   * A strategy to translate T instances, to {@code numHashFunctions} bit indexes.
45   */
46  interface Strategy extends java.io.Serializable {
47    /**
48     * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element.
49     */
50    <T> void put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
51
52    /**
53     * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element;
54     * returns {@code true} if and only if all selected bits are set.
55     */
56    <T> boolean mightContain(
57        T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
58  }
59
60  /** The bit set of the BloomFilter (not necessarily power of 2!)*/
61  private final BitArray bits;
62
63  /** Number of hashes per element */
64  private final int numHashFunctions;
65
66  /** The funnel to translate Ts to bytes */
67  private final Funnel<T> funnel;
68
69  /**
70   * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes.
71   */
72  private final Strategy strategy;
73
74  /**
75   * Creates a BloomFilter.
76   */
77  private BloomFilter(BitArray bits, int numHashFunctions, Funnel<T> funnel,
78      Strategy strategy) {
79    Preconditions.checkArgument(numHashFunctions > 0, "numHashFunctions zero or negative");
80    this.bits = checkNotNull(bits);
81    this.numHashFunctions = numHashFunctions;
82    this.funnel = checkNotNull(funnel);
83    this.strategy = strategy;
84  }
85
86  /**
87   * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
88   * {@code false} if this is <i>definitely</i> not the case.
89   */
90  public boolean mightContain(T object) {
91    return strategy.mightContain(object, funnel, numHashFunctions, bits);
92  }
93
94  /**
95   * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
96   * {@link #mightContain(Object)} with the same element will always return {@code true}.
97   */
98  public void put(T object) {
99    strategy.put(object, funnel, numHashFunctions, bits);
100  }
101
102  @VisibleForTesting int getHashCount() {
103    return numHashFunctions;
104  }
105
106  @VisibleForTesting double computeExpectedFalsePositiveRate(int insertions) {
107    return Math.pow(
108        1 - Math.exp(-numHashFunctions * ((double) insertions / (bits.size()))),
109        numHashFunctions);
110  }
111
112  /**
113   * Creates a {@code Builder} of a {@link BloomFilter BloomFilter<T>}, with the expected number
114   * of insertions and expected false positive probability.
115   *
116   * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
117   * than specified, will result in its saturation, and a sharp deterioration of its
118   * false positive probability.
119   *
120   * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
121   * {@code Funnel<T>} is.
122   *
123   * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
124   * @param expectedInsertions the number of expected insertions to the constructed
125   *        {@code BloomFilter<T>}; must be positive
126   * @param falsePositiveProbability the desired false positive probability (must be positive and
127   *        less than 1.0)
128   * @return a {@code Builder}
129   */
130  public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */,
131      double falsePositiveProbability) {
132    checkNotNull(funnel);
133    checkArgument(expectedInsertions > 0, "Expected insertions must be positive");
134    checkArgument(falsePositiveProbability > 0.0 & falsePositiveProbability < 1.0,
135        "False positive probability in (0.0, 1.0)");
136    /*
137     * andreou: I wanted to put a warning in the javadoc about tiny fpp values,
138     * since the resulting size is proportional to -log(p), but there is not
139     * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
140     * which is less that 10kb. Who cares!
141     */
142    int numBits = optimalNumOfBits(expectedInsertions, falsePositiveProbability);
143    int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
144    return new BloomFilter<T>(new BitArray(numBits), numHashFunctions, funnel,
145        BloomFilterStrategies.MURMUR128_MITZ_32);
146  }
147
148  /**
149   * Creates a {@code Builder} of a {@link BloomFilter BloomFilter<T>}, with the expected number
150   * of insertions, and a default expected false positive probability of 3%.
151   *
152   * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
153   * than specified, will result in its saturation, and a sharp deterioration of its
154   * false positive probability.
155   *
156   * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
157   * {@code Funnel<T>} is.
158   *
159   * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
160   * @param expectedInsertions the number of expected insertions to the constructed
161   *        {@code BloomFilter<T>}; must be positive
162   * @return a {@code Builder}
163   */
164  public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */) {
165    return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions
166  }
167
168  /*
169   * Cheat sheet:
170   *
171   * m: total bits
172   * n: expected insertions
173   * b: m/n, bits per insertion
174
175   * p: expected false positive probability
176   *
177   * 1) Optimal k = b * ln2
178   * 2) p = (1 - e ^ (-kn/m))^k
179   * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
180   * 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
181   */
182
183  private static final double LN2 = Math.log(2);
184  private static final double LN2_SQUARED = LN2 * LN2;
185
186  /**
187   * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
188   * expected insertions and total number of bits in the Bloom filter.
189   *
190   * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
191   *
192   * @param n expected insertions (must be positive)
193   * @param m total number of bits in Bloom filter (must be positive)
194   */
195  @VisibleForTesting static int optimalNumOfHashFunctions(int n, int m) {
196    return Math.max(1, (int) Math.round(m / n * LN2));
197  }
198
199  /**
200   * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
201   * expected insertions, the required false positive probability.
202   *
203   * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
204   *
205   * @param n expected insertions (must be positive)
206   * @param p false positive rate (must be 0 < p < 1)
207   */
208  @VisibleForTesting static int optimalNumOfBits(int n, double p) {
209    return (int) (-n * Math.log(p) / LN2_SQUARED);
210  }
211
212  private Object writeReplace() {
213    return new SerialForm<T>(this);
214  }
215
216  private static class SerialForm<T> implements Serializable {
217    final long[] data;
218    final int numHashFunctions;
219    final Funnel<T> funnel;
220    final Strategy strategy;
221
222    SerialForm(BloomFilter<T> bf) {
223      this.data = bf.bits.data;
224      this.numHashFunctions = bf.numHashFunctions;
225      this.funnel = bf.funnel;
226      this.strategy = bf.strategy;
227    }
228    Object readResolve() {
229      return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy);
230    }
231    private static final long serialVersionUID = 1;
232  }
233}
234