1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_BLOOM_FILTER_H
18#define LATINIME_BLOOM_FILTER_H
19
20#include <bitset>
21
22#include "defines.h"
23
24namespace latinime {
25
26// This bloom filter is used for optimizing bigram retrieval.
27// Execution times with previous word "this" are as follows:
28//  without bloom filter (use only hash_map):
29//   Total 147792.34 (sum of others 147771.57)
30//  with bloom filter:
31//   Total 145900.64 (sum of others 145874.30)
32//  always read binary dictionary:
33//   Total 148603.14 (sum of others 148579.90)
34class BloomFilter {
35 public:
36    BloomFilter() : mFilter() {}
37
38    AK_FORCE_INLINE void setInFilter(const int position) {
39        mFilter.set(getIndex(position));
40    }
41
42    AK_FORCE_INLINE bool isInFilter(const int position) const {
43        return mFilter.test(getIndex(position));
44    }
45
46 private:
47    DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
48
49    AK_FORCE_INLINE size_t getIndex(const int position) const {
50        return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
51    }
52
53    // Size, in bits, of the bloom filter index for bigrams
54    // The probability of false positive is (1 - e ** (-kn/m))**k,
55    // where k is the number of hash functions, n the number of bigrams, and m the number of
56    // bits we can test.
57    // At the moment 100 is the maximum number of bigrams for a word with the current main
58    // dictionaries, so n = 100. 1024 buckets give us m = 1024.
59    // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
60    // our uses since we are only using this to increase average performance. For the record,
61    // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
62    // and m = 4096 gives 2.4%.
63    // This is assigned here because it is used for bitset size.
64    // 1021 is the largest prime under 1024.
65    static const size_t BIGRAM_FILTER_MODULO = 1021;
66    std::bitset<BIGRAM_FILTER_MODULO> mFilter;
67};
68} // namespace latinime
69#endif // LATINIME_BLOOM_FILTER_H
70