bitmap-index.h revision dfd8b8327b93660601d016cdc6f29f433b45a8d8
1
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6//     http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
14// Copyright 2005-2010 Google, Inc.
15// Author: sorenj@google.com (Jeffrey Sorensen)
16
17#ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
18#define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
19
20#include <vector>
21using std::vector;
22
23#include <fst/compat.h>
24
25// This class is a bitstring storage class with an index that allows
26// seeking to the Nth set or clear bit in time O(Log(N)) where N is
27// the length of the bit vector.  In addition, it allows counting set or
28// clear bits over ranges in constant time.
29//
30// This is accomplished by maintaining an "secondary" index of limited
31// size in bits that maintains a running count of the number of bits set
32// in each block of bitmap data.  A block is defined as the number of
33// uint64 values that can fit in the secondary index before an overflow
34// occurs.
35//
36// To handle overflows, a "primary" index containing a running count of
37// bits set in each block is created using the type uint64.
38
39namespace fst {
40
41class BitmapIndex {
42 public:
43  static size_t StorageSize(size_t size) {
44    return ((size + kStorageBlockMask) >> kStorageLogBitSize);
45  }
46
47  BitmapIndex() : bits_(NULL), size_(0) { }
48
49  bool Get(size_t index) const {
50    return (bits_[index >> kStorageLogBitSize] &
51            (kOne << (index & kStorageBlockMask))) != 0;
52  }
53
54  static void Set(uint64* bits, size_t index) {
55    bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
56  }
57
58  static void Clear(uint64* bits, size_t index) {
59    bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
60  }
61
62  size_t Bits() const {
63    return size_;
64  }
65
66  size_t ArraySize() const {
67    return StorageSize(size_);
68  }
69
70  // Returns the number of one bits in the bitmap
71  size_t GetOnesCount() const {
72    return primary_index_[primary_index_size() - 1];
73  }
74
75  // Returns the number of one bits in positions 0 to limit - 1.
76  // REQUIRES: limit <= Bits()
77  size_t Rank1(size_t end) const;
78
79  // Returns the number of one bits in the range start to end - 1.
80  // REQUIRES: limit <= Bits()
81  size_t GetOnesCountInRange(size_t start, size_t end) const {
82    return Rank1(end) - Rank1(start);
83  }
84
85  // Returns the number of zero bits in positions 0 to limit - 1.
86  // REQUIRES: limit <= Bits()
87  size_t Rank0(size_t end) const {
88    return end - Rank1(end);
89  }
90
91  // Returns the number of zero bits in the range start to end - 1.
92  // REQUIRES: limit <= Bits()
93  size_t GetZeroesCountInRange(size_t start, size_t end) const {
94    return end - start - GetOnesCountInRange(start, end);
95  }
96
97  // Return true if any bit between begin inclusive and end exclusive
98  // is set.  0 <= begin <= end <= Bits() is required.
99  //
100  bool TestRange(size_t start, size_t end) const {
101    return Rank1(end) > Rank1(start);
102  }
103
104  // Returns the offset to the nth set bit (zero based)
105  // or Bits() if index >= number of ones
106  size_t Select1(size_t bit_index) const;
107
108  // Returns the offset to the nth clear bit (zero based)
109  // or Bits() if index > number of
110  size_t Select0(size_t bit_index) const;
111
112  // Rebuilds from index for the associated Bitmap, should be called
113  // whenever changes have been made to the Bitmap or else behavior
114  // of the indexed bitmap methods will be undefined.
115  void BuildIndex(const uint64 *bits, size_t size);
116
117  // the secondary index accumulates counts until it can possibly overflow
118  // this constant computes the number of uint64 units that can fit into
119  // units the size of uint16.
120  static const uint64 kOne = 1;
121  static const uint32 kStorageBitSize = 64;
122  static const uint32 kStorageLogBitSize = 6;
123  static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
124      >> kStorageLogBitSize;
125
126 private:
127  static const uint32 kStorageBlockMask = kStorageBitSize - 1;
128
129  // returns, from the index, the count of ones up to array_index
130  size_t get_index_ones_count(size_t array_index) const;
131
132  // because the indexes, both primary and secondary, contain a running
133  // count of the population of one bits contained in [0,i), there is
134  // no reason to have an element in the zeroth position as this value would
135  // necessarily be zero.  (The bits are indexed in a zero based way.)  Thus
136  // we don't store the 0th element in either index.  Both of the following
137  // functions, if greater than 0, must be decremented by one before retreiving
138  // the value from the corresponding array.
139  // returns the 1 + the block that contains the bitindex in question
140  // the inverted version works the same but looks for zeros using an inverted
141  // view of the index
142  size_t find_primary_block(size_t bit_index) const;
143
144  size_t find_inverted_primary_block(size_t bit_index) const;
145
146  // similarly, the secondary index (which resets its count to zero at
147  // the end of every kSecondaryBlockSize entries) does not store the element
148  // at 0.  Note that the rem_bit_index parameter is the number of bits
149  // within the secondary block, after the bits accounted for by the primary
150  // block have been removed (i.e. the remaining bits)  And, because we
151  // reset to zero with each new block, there is no need to store those
152  // actual zeros.
153  // returns 1 + the secondary block that contains the bitindex in question
154  size_t find_secondary_block(size_t block, size_t rem_bit_index) const;
155
156  size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
157      const;
158
159  // We create a primary index based upon the number of secondary index
160  // blocks.  The primary index uses fields wide enough to accomodate any
161  // index of the bitarray so cannot overflow
162  // The primary index is the actual running
163  // count of one bits set for all blocks (and, thus, all uint64s).
164  size_t primary_index_size() const {
165    return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
166  }
167
168  const uint64* bits_;
169  size_t size_;
170
171  // The primary index contains the running popcount of all blocks
172  // which means the nth value contains the popcounts of
173  // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
174  vector<uint32> primary_index_;
175  // The secondary index contains the running popcount of the associated
176  // bitmap.  It is the same length (in units of uint16) as the
177  // bitmap's map is in units of uint64s.
178  vector<uint16> secondary_index_;
179};
180
181}  // end namespace fst
182
183#endif  // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
184