1dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
2dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Licensed under the Apache License, Version 2.0 (the "License");
3dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// you may not use this file except in compliance with the License.
4dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// You may obtain a copy of the License at
5dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin//
6dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin//     http://www.apache.org/licenses/LICENSE-2.0
7dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin//
8dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Unless required by applicable law or agreed to in writing, software
9dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// distributed under the License is distributed on an "AS IS" BASIS,
10dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// See the License for the specific language governing permissions and
12dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// limitations under the License.
13dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin//
14dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Copyright 2005-2010 Google, Inc.
15dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Author: Jeffrey Soresnen (sorenj@google.com)
16dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
17dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/extensions/ngram/bitmap-index.h>
18dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
19dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <algorithm>
20dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <iterator>
21dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
22dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/extensions/ngram/nthbit.h>
23dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
24dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinnamespace fst {
25dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
26dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// These two internal classes implemented inverted views of the
27dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// primary and secondary indexes.  That is, they provide iterators
28dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// that have operator*'s that return the number zeros rather than
29dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// the number of ones.
30dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
31dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass primary_index_inverted : public vector<uint32>::const_iterator {
32dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public:
33dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  primary_index_inverted() {}
34dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  primary_index_inverted(vector<uint32>::const_iterator loc,
35dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                         vector<uint32>::const_iterator begin) :
36dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    vector<uint32>::const_iterator(loc), begin_(begin) {}
37dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 operator*() {
38dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize *
39dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin           (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) -
40dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin           vector<uint32>::const_iterator::operator*();
41dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
42dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private:
43dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  vector<uint32>::const_iterator begin_;
44dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin};
45dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
46dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass secondary_index_inverted : public vector<uint16>::const_iterator {
47dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public:
48dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  secondary_index_inverted() : vector<uint16>::const_iterator() {}
49dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  secondary_index_inverted(vector<uint16>::const_iterator loc,
50dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                           vector<uint16>::const_iterator block_begin) :
51dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    vector<uint16>::const_iterator(loc), block_begin_(block_begin) {}
52dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint16 operator*() {
53dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    return ((1 + std::distance<vector<uint16>::const_iterator>(
54dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) -
55dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        vector<uint16>::const_iterator::operator*();
56dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
57dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private:
58dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  vector<uint16>::const_iterator block_begin_;
59dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin};
60dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::Rank1(size_t end) const {
62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (end == 0) return 0;
63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  CHECK_LE(end, Bits());
64dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize;
65dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint32 sum = get_index_ones_count(end_word);
66dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint64 zero = 0;
67dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint64 ones = ~zero;
68dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return sum + __builtin_popcountll(bits_[end_word] &
69dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      (ones >> (kStorageBitSize - (end & kStorageBlockMask))));
70dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
71dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
72dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::Select1(size_t bit_index) const {
73dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (bit_index >= GetOnesCount()) return Bits();
74dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // search primary index for the relevant block
75dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 rembits = bit_index + 1;
76dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint32 block = find_primary_block(bit_index + 1);
77dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 offset = 0;
78dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (block > 0) {
79dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    rembits -= primary_index_[block - 1];
80dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    offset += block * kSecondaryBlockSize;
81dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
82dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // search the secondary index
83dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 word = find_secondary_block(offset, rembits);
84dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (word > 0) {
85dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    rembits -= secondary_index_[offset + word - 1];
86dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    offset += word;
87dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
88dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  int nth = nth_bit(bits_[offset], rembits);
89dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return (offset << BitmapIndex::kStorageLogBitSize) + nth;
90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
91dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
92dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::Select0(size_t bit_index) const {
93dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (bit_index >= Bits() - GetOnesCount()) return Bits();
94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // search inverted primary index for relevant block
95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 remzeros = bit_index + 1;
96dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 offset = 0;
97dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint32 block = find_inverted_primary_block(bit_index + 1);
98dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (block > 0) {
99dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1,
100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                                        primary_index_.begin());
101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    offset += block * kSecondaryBlockSize;
102dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
103dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // search the inverted secondary index
104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 word = find_inverted_secondary_block(offset, remzeros);
105dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (word > 0) {
106dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    vector<uint16>::const_iterator block_begin =
107dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        secondary_index_.begin() + offset;
108dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin);
109dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    offset += word;
110dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
111dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  int nth = nth_bit(~bits_[offset], remzeros);
112dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return (offset << BitmapIndex::kStorageLogBitSize) + nth;
113dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
114dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
115dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::get_index_ones_count(size_t array_index) const {
116dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 sum = 0;
117dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (array_index > 0) {
118dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    sum += secondary_index_[array_index-1];
119dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    uint32 end_block = (array_index - 1) / kSecondaryBlockSize;
120dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (end_block > 0) sum += primary_index_[end_block-1];
121dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
122dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return sum;
123dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
124dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
125dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinvoid BitmapIndex::BuildIndex(const uint64 *bits, size_t size) {
126dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  bits_ = bits;
127dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  size_ = size;
128dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  secondary_index_.clear();
129dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  secondary_index_.reserve(ArraySize());
130dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  primary_index_.clear();
131dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  primary_index_.reserve(primary_index_size());
132dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint64 zero = 0;
133dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const uint64 ones = ~zero;
134dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  uint32 popcount = 0;
135dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  for (uint32 block_begin = 0; block_begin < ArraySize();
136dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin       block_begin += kSecondaryBlockSize) {
137dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    uint32 block_popcount = 0;
138dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    uint32 block_end = block_begin + kSecondaryBlockSize;
139dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (block_end > ArraySize()) block_end = ArraySize();
140dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    for (uint32 j = block_begin; j < block_end; ++j) {
141dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      uint64 mask = ones;
142dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      if (j == ArraySize() - 1) {
143dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask);
144dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      }
145dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      block_popcount += __builtin_popcountll(bits_[j] & mask);
146dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      secondary_index_.push_back(block_popcount);
147dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    }
148dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    popcount += block_popcount;
149dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    primary_index_.push_back(popcount);
150dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
151dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
152dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
153dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::find_secondary_block(
154dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    size_t block_begin, size_t rem_bit_index) const {
155dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  size_t block_end = block_begin + kSecondaryBlockSize;
156dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
157dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return std::distance(secondary_index_.begin() + block_begin,
158dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                       std::lower_bound(secondary_index_.begin() + block_begin,
159dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                                        secondary_index_.begin() + block_end,
160dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                                        rem_bit_index));
161dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
162dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
163dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::find_inverted_secondary_block(
164dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    size_t block_begin, size_t rem_bit_index) const {
165dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  size_t block_end = block_begin + kSecondaryBlockSize;
166dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
167dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  secondary_index_inverted start(secondary_index_.begin() + block_begin,
168dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                                 secondary_index_.begin() + block_begin);
169dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  secondary_index_inverted end(secondary_index_.begin() + block_end,
170dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                               secondary_index_.begin() + block_begin);
171dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return std::distance(start,
172dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                       std::lower_bound(start, end, rem_bit_index));
173dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
174dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
175dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline size_t BitmapIndex::find_primary_block(size_t bit_index) const {
176dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return std::distance(primary_index_.begin(),
177dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                       std::lower_bound(primary_index_.begin(),
178dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                                        primary_index_.end(), bit_index));
179dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
180dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
181dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const {
182dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  primary_index_inverted start(primary_index_.begin(), primary_index_.begin());
183dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  primary_index_inverted end(primary_index_.end(), primary_index_.begin());
184dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return std::distance(start, std::lower_bound(start, end, bit_index));
185dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
186dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
187dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}  // end namespace fst
188