1dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 2dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Licensed under the Apache License, Version 2.0 (the "License"); 3dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// you may not use this file except in compliance with the License. 4dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// You may obtain a copy of the License at 5dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 6dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// http://www.apache.org/licenses/LICENSE-2.0 7dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 8dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Unless required by applicable law or agreed to in writing, software 9dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// distributed under the License is distributed on an "AS IS" BASIS, 10dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// See the License for the specific language governing permissions and 12dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// limitations under the License. 13dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 14dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Copyright 2005-2010 Google, Inc. 15dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Author: Jeffrey Soresnen (sorenj@google.com) 16dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 17dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/extensions/ngram/bitmap-index.h> 18dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 19dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <algorithm> 20dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <iterator> 21dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 22dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/extensions/ngram/nthbit.h> 23dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 24dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinnamespace fst { 25dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 26dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// These two internal classes implemented inverted views of the 27dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// primary and secondary indexes. That is, they provide iterators 28dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// that have operator*'s that return the number zeros rather than 29dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// the number of ones. 30dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 31dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass primary_index_inverted : public vector<uint32>::const_iterator { 32dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 33dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_inverted() {} 34dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_inverted(vector<uint32>::const_iterator loc, 35dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint32>::const_iterator begin) : 36dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint32>::const_iterator(loc), begin_(begin) {} 37dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 operator*() { 38dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize * 39dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) - 40dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint32>::const_iterator::operator*(); 41dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 42dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 43dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint32>::const_iterator begin_; 44dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 45dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 46dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass secondary_index_inverted : public vector<uint16>::const_iterator { 47dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 48dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_inverted() : vector<uint16>::const_iterator() {} 49dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_inverted(vector<uint16>::const_iterator loc, 50dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint16>::const_iterator block_begin) : 51dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint16>::const_iterator(loc), block_begin_(block_begin) {} 52dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint16 operator*() { 53dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return ((1 + std::distance<vector<uint16>::const_iterator>( 54dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) - 55dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint16>::const_iterator::operator*(); 56dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 57dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 58dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint16>::const_iterator block_begin_; 59dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 60dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::Rank1(size_t end) const { 62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (end == 0) return 0; 63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin CHECK_LE(end, Bits()); 64dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize; 65dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint32 sum = get_index_ones_count(end_word); 66dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint64 zero = 0; 67dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint64 ones = ~zero; 68dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return sum + __builtin_popcountll(bits_[end_word] & 69dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (ones >> (kStorageBitSize - (end & kStorageBlockMask)))); 70dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 71dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 72dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::Select1(size_t bit_index) const { 73dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (bit_index >= GetOnesCount()) return Bits(); 74dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // search primary index for the relevant block 75dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 rembits = bit_index + 1; 76dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint32 block = find_primary_block(bit_index + 1); 77dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 offset = 0; 78dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (block > 0) { 79dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin rembits -= primary_index_[block - 1]; 80dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += block * kSecondaryBlockSize; 81dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 82dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // search the secondary index 83dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 word = find_secondary_block(offset, rembits); 84dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (word > 0) { 85dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin rembits -= secondary_index_[offset + word - 1]; 86dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += word; 87dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 88dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int nth = nth_bit(bits_[offset], rembits); 89dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return (offset << BitmapIndex::kStorageLogBitSize) + nth; 90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 91dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 92dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::Select0(size_t bit_index) const { 93dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (bit_index >= Bits() - GetOnesCount()) return Bits(); 94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // search inverted primary index for relevant block 95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 remzeros = bit_index + 1; 96dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 offset = 0; 97dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint32 block = find_inverted_primary_block(bit_index + 1); 98dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (block > 0) { 99dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1, 100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_.begin()); 101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += block * kSecondaryBlockSize; 102dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 103dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // search the inverted secondary index 104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 word = find_inverted_secondary_block(offset, remzeros); 105dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (word > 0) { 106dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<uint16>::const_iterator block_begin = 107dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.begin() + offset; 108dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin); 109dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += word; 110dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 111dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int nth = nth_bit(~bits_[offset], remzeros); 112dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return (offset << BitmapIndex::kStorageLogBitSize) + nth; 113dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 114dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 115dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::get_index_ones_count(size_t array_index) const { 116dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 sum = 0; 117dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (array_index > 0) { 118dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sum += secondary_index_[array_index-1]; 119dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 end_block = (array_index - 1) / kSecondaryBlockSize; 120dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (end_block > 0) sum += primary_index_[end_block-1]; 121dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 122dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return sum; 123dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 124dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 125dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinvoid BitmapIndex::BuildIndex(const uint64 *bits, size_t size) { 126dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bits_ = bits; 127dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_ = size; 128dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.clear(); 129dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.reserve(ArraySize()); 130dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_.clear(); 131dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_.reserve(primary_index_size()); 132dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint64 zero = 0; 133dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint64 ones = ~zero; 134dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 popcount = 0; 135dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (uint32 block_begin = 0; block_begin < ArraySize(); 136dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin block_begin += kSecondaryBlockSize) { 137dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 block_popcount = 0; 138dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 block_end = block_begin + kSecondaryBlockSize; 139dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (block_end > ArraySize()) block_end = ArraySize(); 140dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (uint32 j = block_begin; j < block_end; ++j) { 141dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 mask = ones; 142dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (j == ArraySize() - 1) { 143dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask); 144dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 145dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin block_popcount += __builtin_popcountll(bits_[j] & mask); 146dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.push_back(block_popcount); 147dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 148dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin popcount += block_popcount; 149dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_.push_back(popcount); 150dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 151dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 152dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 153dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::find_secondary_block( 154dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t block_begin, size_t rem_bit_index) const { 155dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t block_end = block_begin + kSecondaryBlockSize; 156dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); 157dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return std::distance(secondary_index_.begin() + block_begin, 158dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin std::lower_bound(secondary_index_.begin() + block_begin, 159dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.begin() + block_end, 160dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin rem_bit_index)); 161dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 162dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 163dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::find_inverted_secondary_block( 164dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t block_begin, size_t rem_bit_index) const { 165dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t block_end = block_begin + kSecondaryBlockSize; 166dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); 167dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_inverted start(secondary_index_.begin() + block_begin, 168dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.begin() + block_begin); 169dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_inverted end(secondary_index_.begin() + block_end, 170dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin secondary_index_.begin() + block_begin); 171dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return std::distance(start, 172dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin std::lower_bound(start, end, rem_bit_index)); 173dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 174dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 175dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline size_t BitmapIndex::find_primary_block(size_t bit_index) const { 176dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return std::distance(primary_index_.begin(), 177dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin std::lower_bound(primary_index_.begin(), 178dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_.end(), bit_index)); 179dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 180dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 181dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinsize_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const { 182dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_inverted start(primary_index_.begin(), primary_index_.begin()); 183dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin primary_index_inverted end(primary_index_.end(), primary_index_.begin()); 184dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return std::distance(start, std::lower_bound(start, end, bit_index)); 185dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 186dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 187dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} // end namespace fst 188