1dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 2dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Licensed under the Apache License, Version 2.0 (the "License"); 3dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// you may not use this file except in compliance with the License. 4dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// You may obtain a copy of the License at 5dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 6dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// http://www.apache.org/licenses/LICENSE-2.0 7dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 8dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Unless required by applicable law or agreed to in writing, software 9dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// distributed under the License is distributed on an "AS IS" BASIS, 10dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// See the License for the specific language governing permissions and 12dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// limitations under the License. 13dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 14dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Copyright 2005-2010 Google, Inc. 15dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Author: sorenj@google.com (Jeffrey Sorensen) 16dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// 17dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#ifndef FST_EXTENSIONS_NGRAM_NGRAM_FST_H_ 18dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#define FST_EXTENSIONS_NGRAM_NGRAM_FST_H_ 19dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 20dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <stddef.h> 21dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <string.h> 22dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <algorithm> 23dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <string> 24dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <vector> 25dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinusing std::vector; 26dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 27dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/compat.h> 28dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/fstlib.h> 295b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin#include <fst/mapped-file.h> 30dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/extensions/ngram/bitmap-index.h> 31dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 32dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// NgramFst implements a n-gram language model based upon the LOUDS data 33dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// structure. Please refer to "Unary Data Strucutres for Language Models" 34dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// http://research.google.com/pubs/archive/37218.pdf 35dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 36dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinnamespace fst { 37dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> class NGramFst; 38dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> class NGramFstMatcher; 39dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 40dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Instance data containing mutable state for bookkeeping repeated access to 41dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// the same state. 42dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> 43dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinstruct NGramFstInst { 44dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Label Label; 45dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::StateId StateId; 46dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Weight Weight; 47dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId state_; 48dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t num_futures_; 49dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t offset_; 50dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t node_; 51dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId node_state_; 52dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<Label> context_; 53dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId context_state_; 54dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFstInst() 55dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : state_(kNoStateId), node_state_(kNoStateId), 56dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_state_(kNoStateId) { } 57dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 58dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 59dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Implementation class for LOUDS based NgramFst interface 60dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> 61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass NGramFstImpl : public FstImpl<A> { 62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::SetInputSymbols; 63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::SetOutputSymbols; 64dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::SetType; 65dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::WriteHeader; 66dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 67dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin friend class ArcIterator<NGramFst<A> >; 68dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin friend class NGramFstMatcher<A>; 69dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 70dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 71dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::InputSymbols; 72dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::SetProperties; 73dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin using FstImpl<A>::Properties; 74dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 75dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef A Arc; 76dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Label Label; 77dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::StateId StateId; 78dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Weight Weight; 79dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 805b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin NGramFstImpl() : data_region_(0), data_(0), owned_(false) { 81dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetType("ngram"); 82dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetInputSymbols(NULL); 83dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetOutputSymbols(NULL); 84dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kStaticProperties); 85dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 86dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 87dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFstImpl(const Fst<A> &fst, vector<StateId>* order_out); 88dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 89dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ~NGramFstImpl() { 90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (owned_) { 91dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete [] data_; 92dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 935b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin delete data_region_; 94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 96dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static NGramFstImpl<A>* Read(istream &strm, // NOLINT 97dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const FstReadOptions &opts) { 98dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFstImpl<A>* impl = new NGramFstImpl(); 99dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FstHeader hdr; 100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!impl->ReadHeader(strm, opts, kMinFileVersion, &hdr)) return 0; 101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 num_states, num_futures, num_final; 102dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t offset = sizeof(num_states) + sizeof(num_futures) + 103dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sizeof(num_final); 104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Peek at num_states and num_futures to see how much more needs to be read. 105dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin strm.read(reinterpret_cast<char *>(&num_states), sizeof(num_states)); 106dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin strm.read(reinterpret_cast<char *>(&num_futures), sizeof(num_futures)); 107dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin strm.read(reinterpret_cast<char *>(&num_final), sizeof(num_final)); 108dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t size = Storage(num_states, num_futures, num_final); 1095b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin MappedFile *data_region = MappedFile::Allocate(size); 1105b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin char *data = reinterpret_cast<char *>(data_region->mutable_data()); 111dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Copy num_states, num_futures and num_final back into data. 112dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memcpy(data, reinterpret_cast<char *>(&num_states), sizeof(num_states)); 113dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memcpy(data + sizeof(num_states), reinterpret_cast<char *>(&num_futures), 114dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sizeof(num_futures)); 115dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memcpy(data + sizeof(num_states) + sizeof(num_futures), 116dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin reinterpret_cast<char *>(&num_final), sizeof(num_final)); 117dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin strm.read(data + offset, size - offset); 118dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!strm) { 119dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete impl; 120dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return NULL; 121dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 1225b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin impl->Init(data, false, data_region); 123dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return impl; 124dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 125dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 126dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool Write(ostream &strm, // NOLINT 127dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const FstWriteOptions &opts) const { 128dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FstHeader hdr; 129dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin hdr.SetStart(Start()); 130dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin hdr.SetNumStates(num_states_); 131dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin WriteHeader(strm, opts, kFileVersion, &hdr); 1325b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin strm.write(data_, StorageSize()); 133dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return strm; 134dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 135dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 136dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId Start() const { 137dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return 1; 138dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 139dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 140dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Weight Final(StateId state) const { 141dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (final_index_.Get(state)) { 142dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return final_probs_[final_index_.Rank1(state)]; 143dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } else { 144dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return Weight::Zero(); 145dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 146dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 147dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 148dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t NumArcs(StateId state, NGramFstInst<A> *inst = NULL) const { 149dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (inst == NULL) { 150dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t next_zero = future_index_.Select0(state + 1); 151dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t this_zero = future_index_.Select0(state); 152dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return next_zero - this_zero - 1; 153dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 154dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetInstFuture(state, inst); 155dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return inst->num_futures_ + ((state == 0) ? 0 : 1); 156dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 157dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 158dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t NumInputEpsilons(StateId state) const { 159dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // State 0 has no parent, thus no backoff. 160dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (state == 0) return 0; 161dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return 1; 162dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 163dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 164dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t NumOutputEpsilons(StateId state) const { 165dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return NumInputEpsilons(state); 166dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 167dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 168dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId NumStates() const { 169dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return num_states_; 170dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 171dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 172dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void InitStateIterator(StateIteratorData<A>* data) const { 173dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin data->base = 0; 174dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin data->nstates = num_states_; 175dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 176dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 177dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static size_t Storage(uint64 num_states, uint64 num_futures, 178dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 num_final) { 179dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 b64; 180dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Weight weight; 181dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Label label; 182dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t offset = sizeof(num_states) + sizeof(num_futures) + 183dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sizeof(num_final); 184dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(b64) * ( 185dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::StorageSize(num_states * 2 + 1) + 186dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::StorageSize(num_futures + num_states + 1) + 187dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::StorageSize(num_states)); 188dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += (num_states + 1) * sizeof(label) + num_futures * sizeof(label); 189dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Pad for alignemnt, see 190dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // http://en.wikipedia.org/wiki/Data_structure_alignment#Computing_padding 191dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset = (offset + sizeof(weight) - 1) & ~(sizeof(weight) - 1); 192dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += (num_states + 1) * sizeof(weight) + num_final * sizeof(weight) + 193dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (num_futures + 1) * sizeof(weight); 194dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return offset; 195dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 196dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 197dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void SetInstFuture(StateId state, NGramFstInst<A> *inst) const { 198dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (inst->state_ != state) { 199dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->state_ = state; 200dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t next_zero = future_index_.Select0(state + 1); 201dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t this_zero = future_index_.Select0(state); 202dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->num_futures_ = next_zero - this_zero - 1; 203dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->offset_ = future_index_.Rank1(future_index_.Select0(state) + 1); 204dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 205dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 206dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 207dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void SetInstNode(NGramFstInst<A> *inst) const { 208dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (inst->node_state_ != inst->state_) { 209dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->node_state_ = inst->state_; 210dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->node_ = context_index_.Select1(inst->state_); 211dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 212dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 213dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 214dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void SetInstContext(NGramFstInst<A> *inst) const { 215dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetInstNode(inst); 216dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (inst->context_state_ != inst->state_) { 217dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->context_state_ = inst->state_; 218dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->context_.clear(); 219dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t node = inst->node_; 220dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin while (node != 0) { 221dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst->context_.push_back(context_words_[context_index_.Rank1(node)]); 222dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin node = context_index_.Select1(context_index_.Rank0(node) - 1); 223dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 224dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 225dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 226dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 227dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Access to the underlying representation 228dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const char* GetData(size_t* data_size) const { 2295b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin *data_size = StorageSize(); 230dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return data_; 231dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 232dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 2335b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin void Init(const char* data, bool owned, MappedFile *file = 0); 2345b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 2355b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const vector<Label> &GetContext(StateId s, NGramFstInst<A> *inst) const { 2365b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin SetInstFuture(s, inst); 2375b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin SetInstContext(inst); 2385b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return inst->context_; 2395b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 2405b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 2415b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin size_t StorageSize() const { 2425b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return Storage(num_states_, num_futures_, num_final_); 2435b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 2445b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 2455b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin void GetStates(const vector<Label>& context, vector<StateId> *states) const; 246dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 247dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 248dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId Transition(const vector<Label> &context, Label future) const; 249dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 250dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Properties always true for this Fst class. 251dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static const uint64 kStaticProperties = kAcceptor | kIDeterministic | 252dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin kODeterministic | kEpsilons | kIEpsilons | kOEpsilons | kILabelSorted | 253dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin kOLabelSorted | kWeighted | kCyclic | kInitialAcyclic | kNotTopSorted | 254dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin kAccessible | kCoAccessible | kNotString | kExpanded; 255dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Current file format version. 256dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static const int kFileVersion = 4; 257dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Minimum file format version supported. 258dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static const int kMinFileVersion = 4; 259dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 2605b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin MappedFile *data_region_; 261dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const char* data_; 262dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool owned_; // True if we own data_ 263dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 num_states_, num_futures_, num_final_; 264dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t root_num_children_; 265dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label *root_children_; 266dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t root_first_child_; 267dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // borrowed references 268dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const uint64 *context_, *future_, *final_; 269dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label *context_words_, *future_words_; 270dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Weight *backoff_, *final_probs_, *future_probs_; 271dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex context_index_; 272dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex future_index_; 273dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex final_index_; 274dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 275dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void operator=(const NGramFstImpl<A> &); // Disallow 276dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 277dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 278dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate<typename A> 279dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander GutkinNGramFstImpl<A>::NGramFstImpl(const Fst<A> &fst, vector<StateId>* order_out) 2805b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin : data_region_(0), data_(0), owned_(false) { 281dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef A Arc; 282dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename Arc::Label Label; 283dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename Arc::Weight Weight; 284dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename Arc::StateId StateId; 285dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetType("ngram"); 286dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetInputSymbols(fst.InputSymbols()); 287dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetOutputSymbols(fst.OutputSymbols()); 288dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kStaticProperties); 289dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 290dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Check basic requirements for an OpenGRM language model Fst. 291dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int64 props = kAcceptor | kIDeterministic | kIEpsilons | kILabelSorted; 292dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (fst.Properties(props, true) != props) { 293dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "NGramFst only accepts OpenGRM langauge models as input"; 294dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 295dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 296dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 297dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 298dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int64 num_states = CountStates(fst); 299dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Label* context = new Label[num_states]; 300dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 301dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Find the unigram state by starting from the start state, following 302dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // epsilons. 303dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId unigram = fst.Start(); 304dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin while (1) { 3055b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (unigram == kNoStateId) { 3065b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin FSTERROR() << "Could not identify unigram state."; 307dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 308dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 309dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 3105b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ArcIterator<Fst<A> > aiter(fst, unigram); 3115b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (aiter.Done()) { 3125b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin LOG(WARNING) << "Unigram state " << unigram << " has no arcs."; 3135b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin break; 3145b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 315dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (aiter.Value().ilabel != 0) break; 316dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin unigram = aiter.Value().nextstate; 317dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 318dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 319dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Each state's context is determined by the subtree it is under from the 320dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // unigram state. 321dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin queue<pair<StateId, Label> > label_queue; 322dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin vector<bool> visited(num_states); 323dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Force an epsilon link to the start state. 324dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin label_queue.push(make_pair(fst.Start(), 0)); 325dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (ArcIterator<Fst<A> > aiter(fst, unigram); 326dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin !aiter.Done(); aiter.Next()) { 327dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin label_queue.push(make_pair(aiter.Value().nextstate, aiter.Value().ilabel)); 328dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 329dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // investigate states in breadth first fashion to assign context words. 330dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin while (!label_queue.empty()) { 331dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin pair<StateId, Label> &now = label_queue.front(); 332dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!visited[now.first]) { 333dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context[now.first] = now.second; 334dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin visited[now.first] = true; 335dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (ArcIterator<Fst<A> > aiter(fst, now.first); 336dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin !aiter.Done(); aiter.Next()) { 337dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Arc &arc = aiter.Value(); 338dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (arc.ilabel != 0) { 339dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin label_queue.push(make_pair(arc.nextstate, now.second)); 340dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 341dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 342dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 343dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin label_queue.pop(); 344dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 345dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin visited.clear(); 346dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 347dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // The arc from the start state should be assigned an epsilon to put it 348dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // in front of the all other labels (which makes Start state 1 after 349dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // unigram which is state 0). 350dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context[fst.Start()] = 0; 351dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 352dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Build the tree of contexts fst by reversing the epsilon arcs from fst. 353dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin VectorFst<Arc> context_fst; 354dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 num_final = 0; 355dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (int i = 0; i < num_states; ++i) { 356dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (fst.Final(i) != Weight::Zero()) { 357dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++num_final; 358dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 359dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_fst.SetFinal(context_fst.AddState(), fst.Final(i)); 360dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 361dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_fst.SetStart(unigram); 362dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_fst.SetInputSymbols(fst.InputSymbols()); 363dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_fst.SetOutputSymbols(fst.OutputSymbols()); 364dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int64 num_context_arcs = 0; 365dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int64 num_futures = 0; 366dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (StateIterator<Fst<A> > siter(fst); !siter.Done(); siter.Next()) { 367dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const StateId &state = siter.Value(); 368dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin num_futures += fst.NumArcs(state) - fst.NumInputEpsilons(state); 369dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ArcIterator<Fst<A> > aiter(fst, state); 370dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!aiter.Done()) { 371dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Arc &arc = aiter.Value(); 372dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // this arc goes from state to arc.nextstate, so create an arc from 373dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // arc.nextstate to state to reverse it. 374dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (arc.ilabel == 0) { 375dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_fst.AddArc(arc.nextstate, Arc(context[state], context[state], 376dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc.weight, state)); 377dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin num_context_arcs++; 378dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 379dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 380dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 381dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (num_context_arcs != context_fst.NumStates() - 1) { 382dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "Number of contexts arcs != number of states - 1"; 383dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 384dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 385dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 386dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (context_fst.NumStates() != num_states) { 387dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "Number of contexts != number of states"; 388dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 389dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 390dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 391dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int64 context_props = context_fst.Properties(kIDeterministic | 392dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin kILabelSorted, true); 393dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!(context_props & kIDeterministic)) { 394dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "Input fst is not structured properly"; 395dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 396dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 397dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 398dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!(context_props & kILabelSorted)) { 399dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ArcSort(&context_fst, ILabelCompare<Arc>()); 400dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 401dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 402dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete [] context; 403dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 404dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 b64; 405dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Weight weight; 406dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Label label = kNoLabel; 407dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t storage = Storage(num_states, num_futures, num_final); 4085b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin MappedFile *data_region = MappedFile::Allocate(storage); 4095b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin char *data = reinterpret_cast<char *>(data_region->mutable_data()); 410dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memset(data, 0, storage); 411dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t offset = 0; 412dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memcpy(data + offset, reinterpret_cast<char *>(&num_states), 413dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sizeof(num_states)); 414dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(num_states); 415dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memcpy(data + offset, reinterpret_cast<char *>(&num_futures), 416dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sizeof(num_futures)); 417dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(num_futures); 418dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin memcpy(data + offset, reinterpret_cast<char *>(&num_final), 419dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin sizeof(num_final)); 420dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(num_final); 421dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64* context_bits = reinterpret_cast<uint64*>(data + offset); 422dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += BitmapIndex::StorageSize(num_states * 2 + 1) * sizeof(b64); 423dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64* future_bits = reinterpret_cast<uint64*>(data + offset); 424dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += 425dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::StorageSize(num_futures + num_states + 1) * sizeof(b64); 426dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64* final_bits = reinterpret_cast<uint64*>(data + offset); 427dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += BitmapIndex::StorageSize(num_states) * sizeof(b64); 428dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Label* context_words = reinterpret_cast<Label*>(data + offset); 429dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += (num_states + 1) * sizeof(label); 430dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Label* future_words = reinterpret_cast<Label*>(data + offset); 431dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += num_futures * sizeof(label); 432dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset = (offset + sizeof(weight) - 1) & ~(sizeof(weight) - 1); 433dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Weight* backoff = reinterpret_cast<Weight*>(data + offset); 434dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += (num_states + 1) * sizeof(weight); 435dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Weight* final_probs = reinterpret_cast<Weight*>(data + offset); 436dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += num_final * sizeof(weight); 437dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Weight* future_probs = reinterpret_cast<Weight*>(data + offset); 438dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin int64 context_arc = 0, future_arc = 0, context_bit = 0, future_bit = 0, 439dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin final_bit = 0; 440dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 441dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // pseudo-root bits 442dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::Set(context_bits, context_bit++); 443dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++context_bit; 444dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_words[context_arc] = label; 445dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin backoff[context_arc] = Weight::Zero(); 446dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_arc++; 447dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 448dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++future_bit; 449dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (order_out) { 450dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin order_out->clear(); 451dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin order_out->resize(num_states); 452dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 453dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 454dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin queue<StateId> context_q; 455dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_q.push(context_fst.Start()); 456dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId state_number = 0; 457dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin while (!context_q.empty()) { 458dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const StateId &state = context_q.front(); 459dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (order_out) { 460dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (*order_out)[state] = state_number; 461dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 462dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 463dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Weight &final = context_fst.Final(state); 464dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (final != Weight::Zero()) { 465dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::Set(final_bits, state_number); 466dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin final_probs[final_bit] = final; 467dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++final_bit; 468dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 469dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 470dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (ArcIterator<VectorFst<A> > aiter(context_fst, state); 471dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin !aiter.Done(); aiter.Next()) { 472dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Arc &arc = aiter.Value(); 473dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_words[context_arc] = arc.ilabel; 474dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin backoff[context_arc] = arc.weight; 475dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++context_arc; 476dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::Set(context_bits, context_bit++); 477dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_q.push(arc.nextstate); 478dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 479dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++context_bit; 480dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 481dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (ArcIterator<Fst<A> > aiter(fst, state); !aiter.Done(); aiter.Next()) { 482dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Arc &arc = aiter.Value(); 483dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (arc.ilabel != 0) { 484dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin future_words[future_arc] = arc.ilabel; 485dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin future_probs[future_arc] = arc.weight; 486dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++future_arc; 487dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin BitmapIndex::Set(future_bits, future_bit++); 488dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 489dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 490dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++future_bit; 491dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++state_number; 492dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_q.pop(); 493dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 494dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 495dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if ((state_number != num_states) || 496dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (context_bit != num_states * 2 + 1) || 497dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (context_arc != num_states) || 498dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (future_arc != num_futures) || 499dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (future_bit != num_futures + num_states + 1) || 500dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin (final_bit != num_final)) { 501dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "Structure problems detected during construction"; 502dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 503dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 504dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 505dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 5065b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin Init(data, false, data_region); 507dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 508dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 509dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate<typename A> 5105b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkininline void NGramFstImpl<A>::Init(const char* data, bool owned, 5115b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin MappedFile *data_region) { 512dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (owned_) { 513dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete [] data_; 514dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 5155b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin delete data_region_; 5165b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin data_region_ = data_region; 517dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin owned_ = owned; 518dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin data_ = data; 519dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t offset = 0; 520dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin num_states_ = *(reinterpret_cast<const uint64*>(data_ + offset)); 521dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(num_states_); 522dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin num_futures_ = *(reinterpret_cast<const uint64*>(data_ + offset)); 523dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(num_futures_); 524dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin num_final_ = *(reinterpret_cast<const uint64*>(data_ + offset)); 525dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += sizeof(num_final_); 526dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint64 bits; 527dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t context_bits = num_states_ * 2 + 1; 528dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t future_bits = num_futures_ + num_states_ + 1; 529dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_ = reinterpret_cast<const uint64*>(data_ + offset); 530dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += BitmapIndex::StorageSize(context_bits) * sizeof(bits); 531dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin future_ = reinterpret_cast<const uint64*>(data_ + offset); 532dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += BitmapIndex::StorageSize(future_bits) * sizeof(bits); 533dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin final_ = reinterpret_cast<const uint64*>(data_ + offset); 5345b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin offset += BitmapIndex::StorageSize(num_states_) * sizeof(bits); 535dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_words_ = reinterpret_cast<const Label*>(data_ + offset); 536dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += (num_states_ + 1) * sizeof(*context_words_); 537dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin future_words_ = reinterpret_cast<const Label*>(data_ + offset); 538dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += num_futures_ * sizeof(*future_words_); 539dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset = (offset + sizeof(*backoff_) - 1) & ~(sizeof(*backoff_) - 1); 540dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin backoff_ = reinterpret_cast<const Weight*>(data_ + offset); 541dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += (num_states_ + 1) * sizeof(*backoff_); 542dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin final_probs_ = reinterpret_cast<const Weight*>(data_ + offset); 543dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin offset += num_final_ * sizeof(*final_probs_); 544dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin future_probs_ = reinterpret_cast<const Weight*>(data_ + offset); 545dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 546dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context_index_.BuildIndex(context_, context_bits); 547dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin future_index_.BuildIndex(future_, future_bits); 548dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin final_index_.BuildIndex(final_, num_states_); 549dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 550dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t node_rank = context_index_.Rank1(0); 551dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin root_first_child_ = context_index_.Select0(node_rank) + 1; 552dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (context_index_.Get(root_first_child_) == false) { 553dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "Missing unigrams"; 554dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SetProperties(kError, kError); 555dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 556dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 557dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const size_t last_child = context_index_.Select0(node_rank + 1) - 1; 558dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin root_num_children_ = last_child - root_first_child_ + 1; 559dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin root_children_ = context_words_ + context_index_.Rank1(root_first_child_); 560dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 561dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 562dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate<typename A> 563dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline typename A::StateId NGramFstImpl<A>::Transition( 564dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const vector<Label> &context, Label future) const { 565dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label *children = root_children_; 5665b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Label *loc = lower_bound(children, children + root_num_children_, 5675b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin future); 5685b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (loc == children + root_num_children_ || *loc != future) { 569dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return context_index_.Rank1(0); 570dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 571dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t node = root_first_child_ + loc - children; 572dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t node_rank = context_index_.Rank1(node); 573dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t first_child = context_index_.Select0(node_rank) + 1; 574dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (context_index_.Get(first_child) == false) { 575dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return context_index_.Rank1(node); 576dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 577dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t last_child = context_index_.Select0(node_rank + 1) - 1; 578dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin for (int word = context.size() - 1; word >= 0; --word) { 579dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin children = context_words_ + context_index_.Rank1(first_child); 580dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin loc = lower_bound(children, children + last_child - first_child + 1, 581dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin context[word]); 582dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (loc == children + last_child - first_child + 1 || 583dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin *loc != context[word]) { 584dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin break; 585dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 586dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin node = first_child + loc - children; 587dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin node_rank = context_index_.Rank1(node); 588dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin first_child = context_index_.Select0(node_rank) + 1; 589dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (context_index_.Get(first_child) == false) break; 590dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin last_child = context_index_.Select0(node_rank + 1) - 1; 591dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 592dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return context_index_.Rank1(node); 593dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 594dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 5955b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkintemplate<typename A> 5965b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkininline void NGramFstImpl<A>::GetStates( 5975b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const vector<Label> &context, 5985b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin vector<typename A::StateId>* states) const { 5995b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin states->clear(); 6005b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin states->push_back(0); 6015b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin typename vector<Label>::const_reverse_iterator cit = context.rbegin(); 6025b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Label *children = root_children_; 6035b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Label *loc = lower_bound(children, children + root_num_children_, *cit); 6045b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (loc == children + root_num_children_ || *loc != *cit) return; 6055b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin size_t node = root_first_child_ + loc - children; 6065b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin states->push_back(context_index_.Rank1(node)); 6075b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (context.size() == 1) return; 6085b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin size_t node_rank = context_index_.Rank1(node); 6095b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin size_t first_child = context_index_.Select0(node_rank) + 1; 6105b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ++cit; 6115b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (context_index_.Get(first_child) != false) { 6125b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin size_t last_child = context_index_.Select0(node_rank + 1) - 1; 6135b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin while (cit != context.rend()) { 6145b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin children = context_words_ + context_index_.Rank1(first_child); 6155b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin loc = lower_bound(children, children + last_child - first_child + 1, 6165b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin *cit); 6175b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (loc == children + last_child - first_child + 1 || *loc != *cit) { 6185b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin break; 6195b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 6205b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ++cit; 6215b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin node = first_child + loc - children; 6225b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin states->push_back(context_index_.Rank1(node)); 6235b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin node_rank = context_index_.Rank1(node); 6245b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin first_child = context_index_.Select0(node_rank) + 1; 6255b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (context_index_.Get(first_child) == false) break; 6265b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin last_child = context_index_.Select0(node_rank + 1) - 1; 6275b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 6285b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 6295b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin} 6305b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 631dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin/*****************************************************************************/ 632dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate<class A> 633dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass NGramFst : public ImplToExpandedFst<NGramFstImpl<A> > { 634dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin friend class ArcIterator<NGramFst<A> >; 635dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin friend class NGramFstMatcher<A>; 636dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 637dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 638dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef A Arc; 639dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::StateId StateId; 640dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Label Label; 641dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Weight Weight; 642dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef NGramFstImpl<A> Impl; 643dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 644dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin explicit NGramFst(const Fst<A> &dst) 645dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : ImplToExpandedFst<Impl>(new Impl(dst, NULL)) {} 646dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 647dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFst(const Fst<A> &fst, vector<StateId>* order_out) 648dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : ImplToExpandedFst<Impl>(new Impl(fst, order_out)) {} 649dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 650dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Because the NGramFstImpl is a const stateless data structure, there 651dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // is never a need to do anything beside copy the reference. 652dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFst(const NGramFst<A> &fst, bool safe = false) 653dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : ImplToExpandedFst<Impl>(fst, false) {} 654dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 655dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFst() : ImplToExpandedFst<Impl>(new Impl()) {} 656dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 657dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Non-standard constructor to initialize NGramFst directly from data. 658dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFst(const char* data, bool owned) : ImplToExpandedFst<Impl>(new Impl()) { 6595b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin GetImpl()->Init(data, owned, NULL); 660dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 661dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 662dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Get method that gets the data associated with Init(). 663dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const char* GetData(size_t* data_size) const { 664dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return GetImpl()->GetData(data_size); 665dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 666dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 6675b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const vector<Label> GetContext(StateId s) const { 6685b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return GetImpl()->GetContext(s, &inst_); 6695b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 6705b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 6715b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin // Consumes as much as possible of context from right to left, returns the 6725b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin // the states corresponding to the increasingly conditioned input sequence. 6735b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin void GetStates(const vector<Label>& context, vector<StateId> *state) const { 6745b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return GetImpl()->GetStates(context, state); 6755b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 6765b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 677dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual size_t NumArcs(StateId s) const { 678dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return GetImpl()->NumArcs(s, &inst_); 679dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 680dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 681dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual NGramFst<A>* Copy(bool safe = false) const { 682dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return new NGramFst(*this, safe); 683dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 684dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 685dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static NGramFst<A>* Read(istream &strm, const FstReadOptions &opts) { 686dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Impl* impl = Impl::Read(strm, opts); 687dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return impl ? new NGramFst<A>(impl) : 0; 688dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 689dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 690dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static NGramFst<A>* Read(const string &filename) { 691dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!filename.empty()) { 692dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ifstream strm(filename.c_str(), ifstream::in | ifstream::binary); 693dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!strm) { 694dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin LOG(ERROR) << "NGramFst::Read: Can't open file: " << filename; 695dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return 0; 696dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 697dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return Read(strm, FstReadOptions(filename)); 698dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } else { 699dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return Read(cin, FstReadOptions("standard input")); 700dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 701dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 702dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 703dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool Write(ostream &strm, const FstWriteOptions &opts) const { 704dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return GetImpl()->Write(strm, opts); 705dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 706dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 707dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool Write(const string &filename) const { 708dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return Fst<A>::WriteFile(filename); 709dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 710dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 711dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual inline void InitStateIterator(StateIteratorData<A>* data) const { 712dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin GetImpl()->InitStateIterator(data); 713dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 714dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 715dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual inline void InitArcIterator( 716dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId s, ArcIteratorData<A>* data) const; 717dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 718dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual MatcherBase<A>* InitMatcher(MatchType match_type) const { 719dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return new NGramFstMatcher<A>(*this, match_type); 720dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 721dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 7225b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin size_t StorageSize() const { 7235b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return GetImpl()->StorageSize(); 7245b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 7255b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 726dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 727dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin explicit NGramFst(Impl* impl) : ImplToExpandedFst<Impl>(impl) {} 728dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 729dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Impl* GetImpl() const { 730dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return 731dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ImplToExpandedFst<Impl, ExpandedFst<A> >::GetImpl(); 732dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 733dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 734dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void SetImpl(Impl* impl, bool own_impl = true) { 735dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ImplToExpandedFst<Impl, Fst<A> >::SetImpl(impl, own_impl); 736dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 737dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 738dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin mutable NGramFstInst<A> inst_; 739dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 740dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 741dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> inline void 742dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander GutkinNGramFst<A>::InitArcIterator(StateId s, ArcIteratorData<A>* data) const { 743dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin GetImpl()->SetInstFuture(s, &inst_); 744dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin GetImpl()->SetInstNode(&inst_); 745dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin data->base = new ArcIterator<NGramFst<A> >(*this, s); 746dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 747dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 748dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin/*****************************************************************************/ 749dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> 750dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass NGramFstMatcher : public MatcherBase<A> { 751dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 752dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef A Arc; 753dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Label Label; 754dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::StateId StateId; 755dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Weight Weight; 756dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 757dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFstMatcher(const NGramFst<A> &fst, MatchType match_type) 758dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : fst_(fst), inst_(fst.inst_), match_type_(match_type), 759dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin current_loop_(false), 760dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin loop_(kNoLabel, 0, A::Weight::One(), kNoStateId) { 761dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (match_type_ == MATCH_OUTPUT) { 762dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin swap(loop_.ilabel, loop_.olabel); 763dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 764dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 765dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 766dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFstMatcher(const NGramFstMatcher<A> &matcher, bool safe = false) 767dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : fst_(matcher.fst_), inst_(matcher.inst_), 768dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin match_type_(matcher.match_type_), current_loop_(false), 769dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin loop_(kNoLabel, 0, A::Weight::One(), kNoStateId) { 770dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (match_type_ == MATCH_OUTPUT) { 771dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin swap(loop_.ilabel, loop_.olabel); 772dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 773dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 774dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 775dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual NGramFstMatcher<A>* Copy(bool safe = false) const { 776dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return new NGramFstMatcher<A>(*this, safe); 777dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 778dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 779dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual MatchType Type(bool test) const { 780dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return match_type_; 781dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 782dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 783dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual const Fst<A> &GetFst() const { 784dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return fst_; 785dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 786dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 787dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual uint64 Properties(uint64 props) const { 788dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return props; 789dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 790dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 791dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 792dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void SetState_(StateId s) { 793dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst_.GetImpl()->SetInstFuture(s, &inst_); 794dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin current_loop_ = false; 795dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 796dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 797dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool Find_(Label label) { 798dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label nolabel = kNoLabel; 799dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin done_ = true; 800dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (label == 0 || label == nolabel) { 801dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (label == 0) { 802dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin current_loop_ = true; 803dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin loop_.nextstate = inst_.state_; 804dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 805dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // The unigram state has no epsilon arc. 806dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (inst_.state_ != 0) { 807dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.ilabel = arc_.olabel = 0; 808dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst_.GetImpl()->SetInstNode(&inst_); 809dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.nextstate = fst_.GetImpl()->context_index_.Rank1( 810dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst_.GetImpl()->context_index_.Select1( 811dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst_.GetImpl()->context_index_.Rank0(inst_.node_) - 1)); 812dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.weight = fst_.GetImpl()->backoff_[inst_.state_]; 813dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin done_ = false; 814dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 815dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } else { 816dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label *start = fst_.GetImpl()->future_words_ + inst_.offset_; 817dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label *end = start + inst_.num_futures_; 818dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Label* search = lower_bound(start, end, label); 819dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (search != end && *search == label) { 820dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t state = search - start; 821dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.ilabel = arc_.olabel = label; 822dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.weight = fst_.GetImpl()->future_probs_[inst_.offset_ + state]; 823dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst_.GetImpl()->SetInstContext(&inst_); 824dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.nextstate = fst_.GetImpl()->Transition(inst_.context_, label); 825dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin done_ = false; 826dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 827dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 828dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return !Done_(); 829dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 830dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 831dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool Done_() const { 832dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return !current_loop_ && done_; 833dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 834dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 835dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual const Arc& Value_() const { 836dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return (current_loop_) ? loop_ : arc_; 837dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 838dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 839dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void Next_() { 840dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (current_loop_) { 841dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin current_loop_ = false; 842dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } else { 843dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin done_ = true; 844dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 845dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 846dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 847dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const NGramFst<A>& fst_; 848dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin NGramFstInst<A> inst_; 849dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin MatchType match_type_; // Supplied by caller 850dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool done_; 851dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Arc arc_; 852dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool current_loop_; // Current arc is the implicit loop 853dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin Arc loop_; 854dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 855dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 856dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin/*****************************************************************************/ 857dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate<class A> 858dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass ArcIterator<NGramFst<A> > : public ArcIteratorBase<A> { 859dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 860dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef A Arc; 861dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Label Label; 862dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::StateId StateId; 863dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::Weight Weight; 864dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 865dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ArcIterator(const NGramFst<A> &fst, StateId state) 866dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : lazy_(~0), impl_(fst.GetImpl()), i_(0), flags_(kArcValueFlags) { 867dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst_ = fst.inst_; 868dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->SetInstFuture(state, &inst_); 869dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->SetInstNode(&inst_); 870dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 871dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 872dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool Done() const { 873dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return i_ >= ((inst_.node_ == 0) ? inst_.num_futures_ : 874dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin inst_.num_futures_ + 1); 875dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 876dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 877dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const Arc &Value() const { 878dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool eps = (inst_.node_ != 0 && i_ == 0); 879dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId state = (inst_.node_ == 0) ? i_ : i_ - 1; 880dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (flags_ & lazy_ & (kArcILabelValue | kArcOLabelValue)) { 881dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.ilabel = 882dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.olabel = eps ? 0 : impl_->future_words_[inst_.offset_ + state]; 883dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin lazy_ &= ~(kArcILabelValue | kArcOLabelValue); 884dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 885dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (flags_ & lazy_ & kArcNextStateValue) { 886dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (eps) { 887dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.nextstate = impl_->context_index_.Rank1( 888dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->context_index_.Select1( 889dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->context_index_.Rank0(inst_.node_) - 1)); 890dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } else { 891dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (lazy_ & kArcNextStateValue) { 892dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->SetInstContext(&inst_); // first time only. 893dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 894dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.nextstate = 895dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->Transition(inst_.context_, 896dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->future_words_[inst_.offset_ + state]); 897dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 898dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin lazy_ &= ~kArcNextStateValue; 899dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 900dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (flags_ & lazy_ & kArcWeightValue) { 901dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin arc_.weight = eps ? impl_->backoff_[inst_.state_] : 902dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_->future_probs_[inst_.offset_ + state]; 903dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin lazy_ &= ~kArcWeightValue; 904dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 905dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return arc_; 906dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 907dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 908dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void Next() { 909dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ++i_; 910dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin lazy_ = ~0; 911dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 912dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 913dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t Position() const { return i_; } 914dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 915dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void Reset() { 916dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin i_ = 0; 917dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin lazy_ = ~0; 918dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 919dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 920dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void Seek(size_t a) { 921dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (i_ != a) { 922dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin i_ = a; 923dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin lazy_ = ~0; 924dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 925dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 926dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 927dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 Flags() const { 928dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return flags_; 929dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 930dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 931dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void SetFlags(uint32 f, uint32 m) { 932dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin flags_ &= ~m; 933dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin flags_ |= (f & kArcValueFlags); 934dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 935dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 936dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 937dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool Done_() const { return Done(); } 938dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual const Arc& Value_() const { return Value(); } 939dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void Next_() { Next(); } 940dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual size_t Position_() const { return Position(); } 941dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void Reset_() { Reset(); } 942dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void Seek_(size_t a) { Seek(a); } 943dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 Flags_() const { return Flags(); } 944dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void SetFlags_(uint32 f, uint32 m) { SetFlags(f, m); } 945dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 946dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin mutable Arc arc_; 947dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin mutable uint32 lazy_; 948dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const NGramFstImpl<A> *impl_; 949dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin mutable NGramFstInst<A> inst_; 950dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 951dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin size_t i_; 952dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin uint32 flags_; 953dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 954dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin DISALLOW_COPY_AND_ASSIGN(ArcIterator); 955dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 956dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 957dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin/*****************************************************************************/ 958dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Specialization for NGramFst; see generic version in fst.h 959dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// for sample usage (but use the ProdLmFst type!). This version 960dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// should inline. 961dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkintemplate <class A> 962dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinclass StateIterator<NGramFst<A> > : public StateIteratorBase<A> { 963dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin public: 964dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin typedef typename A::StateId StateId; 965dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 966dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin explicit StateIterator(const NGramFst<A> &fst) 967dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin : s_(0), num_states_(fst.NumStates()) { } 968dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 969dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool Done() const { return s_ >= num_states_; } 970dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId Value() const { return s_; } 971dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void Next() { ++s_; } 972dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void Reset() { s_ = 0; } 973dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 974dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin private: 975dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool Done_() const { return Done(); } 976dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual StateId Value_() const { return Value(); } 977dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void Next_() { Next(); } 978dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual void Reset_() { Reset(); } 979dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 980dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StateId s_, num_states_; 981dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 982dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin DISALLOW_COPY_AND_ASSIGN(StateIterator); 983dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 984dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} // namespace fst 985dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#endif // FST_EXTENSIONS_NGRAM_NGRAM_FST_H_ 986