// replace.h

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: riley@google.com (Michael Riley)
//
// \file
// Recursively replace Fst arcs with other Fst(s) returning a PDT.
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_PDT_REPLACE_H__ 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_PDT_REPLACE_H__ 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/replace.h> 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Hash to paren IDs 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <typename S> 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonstruct ReplaceParenHash { 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t operator()(const pair<size_t, S> &p) const { 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return p.first + p.second * kPrime; 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static const size_t kPrime = 7853; 36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <typename S> const size_t ReplaceParenHash<S>::kPrime; 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Builds a pushdown transducer (PDT) from an RTN specification 41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// identical to that in fst/lib/replace.h. The result is a PDT 42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// encoded as the FST 'ofst' where some transitions are labeled with 43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// open or close parentheses. 
To be interpreted as a PDT, the parens 44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// must balance on a path (see PdtExpand()). The open/close 45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// parenthesis label pairs are returned in 'parens'. 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Arc> 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid Replace(const vector<pair<typename Arc::Label, 48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const Fst<Arc>* > >& ifst_array, 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutableFst<Arc> *ofst, 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<pair<typename Arc::Label, 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename Arc::Label> > *parens, 52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename Arc::Label root) { 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename Arc::Label Label; 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename Arc::StateId StateId; 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename Arc::Weight Weight; 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofst->DeleteStates(); 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson parens->clear(); 59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson unordered_map<Label, size_t> label2id; 61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < ifst_array.size(); ++i) 62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson label2id[ifst_array[i].first] = i; 63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label max_label = kNoLabel; 65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson deque<size_t> non_term_queue; // Queue of non-terminals to replace 
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson unordered_set<Label> non_term_set; // Set of non-terminals to replace 68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson non_term_queue.push_back(root); 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson non_term_set.insert(root); 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // PDT state corr. to ith replace FST start state. 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<StateId> fst_start(ifst_array.size(), kNoLabel); 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // PDT state, weight pairs corr. to ith replace FST final state & weights. 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector< vector<pair<StateId, Weight> > > fst_final(ifst_array.size()); 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Builds single Fst combining all referenced input Fsts. Leaves in the 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // non-termnals for now. Tabulate the PDT states that correspond to 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // the start and final states of the input Fsts. 
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (StateId soff = 0; !non_term_queue.empty(); soff = ofst->NumStates()) { 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label label = non_term_queue.front(); 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson non_term_queue.pop_front(); 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t fst_id = label2id[label]; 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const Fst<Arc> *ifst = ifst_array[fst_id].second; 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (StateIterator< Fst<Arc> > siter(*ifst); 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson !siter.Done(); siter.Next()) { 87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StateId is = siter.Value(); 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StateId os = ofst->AddState(); 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (is == ifst->Start()) { 90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst_start[fst_id] = os; 91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (label == root) 92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofst->SetStart(os); 93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (ifst->Final(is) != Weight::Zero()) { 95f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (label == root) 96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofst->SetFinal(os, ifst->Final(is)); 97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst_final[fst_id].push_back(make_pair(os, ifst->Final(is))); 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (ArcIterator< Fst<Arc> > aiter(*ifst, is); 100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson !aiter.Done(); aiter.Next()) { 101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Arc arc = aiter.Value(); 102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if 
(max_label == kNoLabel || arc.olabel > max_label) 103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson max_label = arc.olabel; 104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename unordered_map<Label, size_t>::const_iterator it = 105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson label2id.find(arc.olabel); 106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (it != label2id.end()) { 107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t nfst_id = it->second; 108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (ifst_array[nfst_id].second->Start() == -1) 109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson continue; 110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (non_term_set.count(arc.olabel) == 0) { 111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson non_term_queue.push_back(arc.olabel); 112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson non_term_set.insert(arc.olabel); 113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson arc.nextstate += soff; 116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofst->AddArc(os, arc); 117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Changes each non-terminal transition to an open parenthesis 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // transition redirected to the PDT state that corresponds to the 123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // start state of the input FST for the non-terminal. Adds close parenthesis 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // transitions from the PDT states corr. 
to the final states of the 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // input FST for the non-terminal to the former destination state of the 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // non-terminal transition. 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef MutableArcIterator< MutableFst<Arc> > MIter; 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef unordered_map<pair<size_t, StateId >, size_t, 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ReplaceParenHash<StateId> > ParenMap; 131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Parenthesis pair ID per fst, state pair. 133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ParenMap paren_map; 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // # of parenthesis pairs per fst. 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<size_t> nparens(ifst_array.size(), 0); 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Initial open parenthesis label 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label first_paren = max_label + 1; 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (StateIterator< Fst<Arc> > siter(*ofst); 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson !siter.Done(); siter.Next()) { 141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StateId os = siter.Value(); 142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MIter *aiter = new MIter(ofst, os); 143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t n = 0; !aiter->Done(); aiter->Next(), ++n) { 144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Arc arc = aiter->Value(); 145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename unordered_map<Label, size_t>::const_iterator lit = 146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson label2id.find(arc.olabel); 
147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (lit != label2id.end()) { 148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t nfst_id = lit->second; 149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Get parentheses. Ensures distinct parenthesis pair per 151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // non-terminal and destination state but otherwise reuses them. 152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label open_paren = kNoLabel, close_paren = kNoLabel; 153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson pair<size_t, StateId> paren_key(nfst_id, arc.nextstate); 154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename ParenMap::const_iterator pit = paren_map.find(paren_key); 155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (pit != paren_map.end()) { 156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t paren_id = pit->second; 157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson open_paren = (*parens)[paren_id].first; 158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson close_paren = (*parens)[paren_id].second; 159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t paren_id = nparens[nfst_id]++; 161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson open_paren = first_paren + 2 * paren_id; 162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson close_paren = open_paren + 1; 163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson paren_map[paren_key] = paren_id; 164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (paren_id >= parens->size()) 165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson parens->push_back(make_pair(open_paren, close_paren)); 166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Sets open parenthesis. 
169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Arc sarc(open_paren, open_paren, arc.weight, fst_start[nfst_id]); 170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson aiter->SetValue(sarc); 171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Adds close parentheses. 173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < fst_final[nfst_id].size(); ++i) { 174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson pair<StateId, Weight> &p = fst_final[nfst_id][i]; 175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Arc farc(close_paren, close_paren, p.second, arc.nextstate); 176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofst->AddArc(p.first, farc); 178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (os == p.first) { // Invalidated iterator 179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete aiter; 180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson aiter = new MIter(ofst, os); 181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson aiter->Seek(n); 182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete aiter; 187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 192f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_EXTENSIONS_PDT_REPLACE_H__ 193