1// synchronize.h 2 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14// 15// Copyright 2005-2010 Google, Inc. 16// Author: allauzen@google.com (Cyril Allauzen) 17// 18// \file 19// Synchronize an FST with bounded delay. 20 21#ifndef FST_LIB_SYNCHRONIZE_H__ 22#define FST_LIB_SYNCHRONIZE_H__ 23 24#include <algorithm> 25#include <unordered_map> 26using std::tr1::unordered_map; 27using std::tr1::unordered_multimap; 28#include <unordered_set> 29using std::tr1::unordered_set; 30using std::tr1::unordered_multiset; 31#include <string> 32#include <utility> 33using std::pair; using std::make_pair; 34#include <vector> 35using std::vector; 36 37#include <fst/cache.h> 38#include <fst/test-properties.h> 39 40 41namespace fst { 42 43typedef CacheOptions SynchronizeFstOptions; 44 45 46// Implementation class for SynchronizeFst 47template <class A> 48class SynchronizeFstImpl 49 : public CacheImpl<A> { 50 public: 51 using FstImpl<A>::SetType; 52 using FstImpl<A>::SetProperties; 53 using FstImpl<A>::SetInputSymbols; 54 using FstImpl<A>::SetOutputSymbols; 55 56 using CacheBaseImpl< CacheState<A> >::PushArc; 57 using CacheBaseImpl< CacheState<A> >::HasArcs; 58 using CacheBaseImpl< CacheState<A> >::HasFinal; 59 using CacheBaseImpl< CacheState<A> >::HasStart; 60 using CacheBaseImpl< CacheState<A> >::SetArcs; 61 using CacheBaseImpl< CacheState<A> >::SetFinal; 62 using CacheBaseImpl< CacheState<A> >::SetStart; 63 64 typedef A Arc; 65 typedef typename A::Label Label; 66 typedef typename A::Weight Weight; 67 typedef typename A::StateId StateId; 68 69 typedef basic_string<Label> String; 70 71 struct Element { 72 Element() {} 73 74 Element(StateId s, const String *i, const String *o) 75 : state(s), istring(i), ostring(o) {} 76 77 StateId state; // Input state Id 78 const String *istring; // Residual input labels 79 const String *ostring; // Residual output labels 80 // Residual strings are represented by const pointers to 81 // basic_string<Label> and are stored in a hash_set. The pointed 82 // memory is owned by the hash_set string_set_. 83 }; 84 85 SynchronizeFstImpl(const Fst<A> &fst, const SynchronizeFstOptions &opts) 86 : CacheImpl<A>(opts), fst_(fst.Copy()) { 87 SetType("synchronize"); 88 uint64 props = fst.Properties(kFstProperties, false); 89 SetProperties(SynchronizeProperties(props), kCopyProperties); 90 91 SetInputSymbols(fst.InputSymbols()); 92 SetOutputSymbols(fst.OutputSymbols()); 93 } 94 95 SynchronizeFstImpl(const SynchronizeFstImpl &impl) 96 : CacheImpl<A>(impl), 97 fst_(impl.fst_->Copy(true)) { 98 SetType("synchronize"); 99 SetProperties(impl.Properties(), kCopyProperties); 100 SetInputSymbols(impl.InputSymbols()); 101 SetOutputSymbols(impl.OutputSymbols()); 102 } 103 104 ~SynchronizeFstImpl() { 105 delete fst_; 106 // Extract pointers from the hash set 107 vector<const String*> strings; 108 typename StringSet::iterator it = string_set_.begin(); 109 for (; it != string_set_.end(); ++it) 110 strings.push_back(*it); 111 // Free the extracted pointers 112 for (size_t i = 0; i < strings.size(); ++i) 113 delete strings[i]; 114 } 115 116 StateId Start() { 117 if (!HasStart()) { 118 StateId s = fst_->Start(); 119 if (s == kNoStateId) 120 return kNoStateId; 121 const String *empty = FindString(new String()); 122 StateId start = FindState(Element(fst_->Start(), empty, empty)); 123 SetStart(start); 124 } 125 return CacheImpl<A>::Start(); 126 } 127 128 Weight Final(StateId s) { 129 if (!HasFinal(s)) { 130 const Element &e = elements_[s]; 131 Weight w = e.state == kNoStateId ? Weight::One() : fst_->Final(e.state); 132 if ((w != Weight::Zero()) && (e.istring)->empty() && (e.ostring)->empty()) 133 SetFinal(s, w); 134 else 135 SetFinal(s, Weight::Zero()); 136 } 137 return CacheImpl<A>::Final(s); 138 } 139 140 size_t NumArcs(StateId s) { 141 if (!HasArcs(s)) 142 Expand(s); 143 return CacheImpl<A>::NumArcs(s); 144 } 145 146 size_t NumInputEpsilons(StateId s) { 147 if (!HasArcs(s)) 148 Expand(s); 149 return CacheImpl<A>::NumInputEpsilons(s); 150 } 151 152 size_t NumOutputEpsilons(StateId s) { 153 if (!HasArcs(s)) 154 Expand(s); 155 return CacheImpl<A>::NumOutputEpsilons(s); 156 } 157 158 uint64 Properties() const { return Properties(kFstProperties); } 159 160 // Set error if found; return FST impl properties. 161 uint64 Properties(uint64 mask) const { 162 if ((mask & kError) && fst_->Properties(kError, false)) 163 SetProperties(kError, kError); 164 return FstImpl<Arc>::Properties(mask); 165 } 166 167 void InitArcIterator(StateId s, ArcIteratorData<A> *data) { 168 if (!HasArcs(s)) 169 Expand(s); 170 CacheImpl<A>::InitArcIterator(s, data); 171 } 172 173 // Returns the first character of the string obtained by 174 // concatenating s and l. 175 Label Car(const String *s, Label l = 0) const { 176 if (!s->empty()) 177 return (*s)[0]; 178 else 179 return l; 180 } 181 182 // Computes the residual string obtained by removing the first 183 // character in the concatenation of s and l. 184 const String *Cdr(const String *s, Label l = 0) { 185 String *r = new String(); 186 for (int i = 1; i < s->size(); ++i) 187 r->push_back((*s)[i]); 188 if (l && !(s->empty())) r->push_back(l); 189 return FindString(r); 190 } 191 192 // Computes the concatenation of s and l. 193 const String *Concat(const String *s, Label l = 0) { 194 String *r = new String(); 195 for (int i = 0; i < s->size(); ++i) 196 r->push_back((*s)[i]); 197 if (l) r->push_back(l); 198 return FindString(r); 199 } 200 201 // Tests if the concatenation of s and l is empty 202 bool Empty(const String *s, Label l = 0) const { 203 if (s->empty()) 204 return l == 0; 205 else 206 return false; 207 } 208 209 // Finds the string pointed by s in the hash set. Transfers the 210 // pointer ownership to the hash set. 211 const String *FindString(const String *s) { 212 typename StringSet::iterator it = string_set_.find(s); 213 if (it != string_set_.end()) { 214 delete s; 215 return (*it); 216 } else { 217 string_set_.insert(s); 218 return s; 219 } 220 } 221 222 // Finds state corresponding to an element. Creates new state 223 // if element not found. 224 StateId FindState(const Element &e) { 225 typename ElementMap::iterator eit = element_map_.find(e); 226 if (eit != element_map_.end()) { 227 return (*eit).second; 228 } else { 229 StateId s = elements_.size(); 230 elements_.push_back(e); 231 element_map_.insert(pair<const Element, StateId>(e, s)); 232 return s; 233 } 234 } 235 236 237 // Computes the outgoing transitions from a state, creating new destination 238 // states as needed. 239 void Expand(StateId s) { 240 Element e = elements_[s]; 241 242 if (e.state != kNoStateId) 243 for (ArcIterator< Fst<A> > ait(*fst_, e.state); 244 !ait.Done(); 245 ait.Next()) { 246 const A &arc = ait.Value(); 247 if (!Empty(e.istring, arc.ilabel) && !Empty(e.ostring, arc.olabel)) { 248 const String *istring = Cdr(e.istring, arc.ilabel); 249 const String *ostring = Cdr(e.ostring, arc.olabel); 250 StateId d = FindState(Element(arc.nextstate, istring, ostring)); 251 PushArc(s, Arc(Car(e.istring, arc.ilabel), 252 Car(e.ostring, arc.olabel), arc.weight, d)); 253 } else { 254 const String *istring = Concat(e.istring, arc.ilabel); 255 const String *ostring = Concat(e.ostring, arc.olabel); 256 StateId d = FindState(Element(arc.nextstate, istring, ostring)); 257 PushArc(s, Arc(0 , 0, arc.weight, d)); 258 } 259 } 260 261 Weight w = e.state == kNoStateId ? Weight::One() : fst_->Final(e.state); 262 if ((w != Weight::Zero()) && 263 ((e.istring)->size() + (e.ostring)->size() > 0)) { 264 const String *istring = Cdr(e.istring); 265 const String *ostring = Cdr(e.ostring); 266 StateId d = FindState(Element(kNoStateId, istring, ostring)); 267 PushArc(s, Arc(Car(e.istring), Car(e.ostring), w, d)); 268 } 269 SetArcs(s); 270 } 271 272 private: 273 // Equality function for Elements, assume strings have been hashed. 274 class ElementEqual { 275 public: 276 bool operator()(const Element &x, const Element &y) const { 277 return x.state == y.state && 278 x.istring == y.istring && 279 x.ostring == y.ostring; 280 } 281 }; 282 283 // Hash function for Elements to Fst states. 284 class ElementKey { 285 public: 286 size_t operator()(const Element &x) const { 287 size_t key = x.state; 288 key = (key << 1) ^ (x.istring)->size(); 289 for (size_t i = 0; i < (x.istring)->size(); ++i) 290 key = (key << 1) ^ (*x.istring)[i]; 291 key = (key << 1) ^ (x.ostring)->size(); 292 for (size_t i = 0; i < (x.ostring)->size(); ++i) 293 key = (key << 1) ^ (*x.ostring)[i]; 294 return key; 295 } 296 }; 297 298 // Equality function for strings 299 class StringEqual { 300 public: 301 bool operator()(const String * const &x, const String * const &y) const { 302 if (x->size() != y->size()) return false; 303 for (size_t i = 0; i < x->size(); ++i) 304 if ((*x)[i] != (*y)[i]) return false; 305 return true; 306 } 307 }; 308 309 // Hash function for set of strings 310 class StringKey{ 311 public: 312 size_t operator()(const String * const & x) const { 313 size_t key = x->size(); 314 for (size_t i = 0; i < x->size(); ++i) 315 key = (key << 1) ^ (*x)[i]; 316 return key; 317 } 318 }; 319 320 321 typedef unordered_map<Element, StateId, ElementKey, ElementEqual> ElementMap; 322 typedef unordered_set<const String*, StringKey, StringEqual> StringSet; 323 324 const Fst<A> *fst_; 325 vector<Element> elements_; // mapping Fst state to Elements 326 ElementMap element_map_; // mapping Elements to Fst state 327 StringSet string_set_; 328 329 void operator=(const SynchronizeFstImpl<A> &); // disallow 330}; 331 332 333// Synchronizes a transducer. This version is a delayed Fst. The 334// result will be an equivalent FST that has the property that during 335// the traversal of a path, the delay is either zero or strictly 336// increasing, where the delay is the difference between the number of 337// non-epsilon output labels and input labels along the path. 338// 339// For the algorithm to terminate, the input transducer must have 340// bounded delay, i.e., the delay of every cycle must be zero. 341// 342// Complexity: 343// - A has bounded delay: exponential 344// - A does not have bounded delay: does not terminate 345// 346// References: 347// - Mehryar Mohri. Edit-Distance of Weighted Automata: General 348// Definitions and Algorithms, International Journal of Computer 349// Science, 14(6): 957-982 (2003). 350// 351// This class attaches interface to implementation and handles 352// reference counting, delegating most methods to ImplToFst. 353template <class A> 354class SynchronizeFst : public ImplToFst< SynchronizeFstImpl<A> > { 355 public: 356 friend class ArcIterator< SynchronizeFst<A> >; 357 friend class StateIterator< SynchronizeFst<A> >; 358 359 typedef A Arc; 360 typedef typename A::Weight Weight; 361 typedef typename A::StateId StateId; 362 typedef CacheState<A> State; 363 typedef SynchronizeFstImpl<A> Impl; 364 365 SynchronizeFst(const Fst<A> &fst) 366 : ImplToFst<Impl>(new Impl(fst, SynchronizeFstOptions())) {} 367 368 SynchronizeFst(const Fst<A> &fst, const SynchronizeFstOptions &opts) 369 : ImplToFst<Impl>(new Impl(fst, opts)) {} 370 371 // See Fst<>::Copy() for doc. 372 SynchronizeFst(const SynchronizeFst<A> &fst, bool safe = false) 373 : ImplToFst<Impl>(fst, safe) {} 374 375 // Get a copy of this SynchronizeFst. See Fst<>::Copy() for further doc. 376 virtual SynchronizeFst<A> *Copy(bool safe = false) const { 377 return new SynchronizeFst<A>(*this, safe); 378 } 379 380 virtual inline void InitStateIterator(StateIteratorData<A> *data) const; 381 382 virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { 383 GetImpl()->InitArcIterator(s, data); 384 } 385 386 private: 387 // Makes visible to friends. 388 Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); } 389 390 void operator=(const SynchronizeFst<A> &fst); // Disallow 391}; 392 393 394// Specialization for SynchronizeFst. 395template<class A> 396class StateIterator< SynchronizeFst<A> > 397 : public CacheStateIterator< SynchronizeFst<A> > { 398 public: 399 explicit StateIterator(const SynchronizeFst<A> &fst) 400 : CacheStateIterator< SynchronizeFst<A> >(fst, fst.GetImpl()) {} 401}; 402 403 404// Specialization for SynchronizeFst. 405template <class A> 406class ArcIterator< SynchronizeFst<A> > 407 : public CacheArcIterator< SynchronizeFst<A> > { 408 public: 409 typedef typename A::StateId StateId; 410 411 ArcIterator(const SynchronizeFst<A> &fst, StateId s) 412 : CacheArcIterator< SynchronizeFst<A> >(fst.GetImpl(), s) { 413 if (!fst.GetImpl()->HasArcs(s)) 414 fst.GetImpl()->Expand(s); 415 } 416 417 private: 418 DISALLOW_COPY_AND_ASSIGN(ArcIterator); 419}; 420 421 422template <class A> inline 423void SynchronizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const 424{ 425 data->base = new StateIterator< SynchronizeFst<A> >(*this); 426} 427 428 429 430// Synchronizes a transducer. This version writes the synchronized 431// result to a MutableFst. The result will be an equivalent FST that 432// has the property that during the traversal of a path, the delay is 433// either zero or strictly increasing, where the delay is the 434// difference between the number of non-epsilon output labels and 435// input labels along the path. 436// 437// For the algorithm to terminate, the input transducer must have 438// bounded delay, i.e., the delay of every cycle must be zero. 439// 440// Complexity: 441// - A has bounded delay: exponential 442// - A does not have bounded delay: does not terminate 443// 444// References: 445// - Mehryar Mohri. Edit-Distance of Weighted Automata: General 446// Definitions and Algorithms, International Journal of Computer 447// Science, 14(6): 957-982 (2003). 448template<class Arc> 449void Synchronize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst) { 450 SynchronizeFstOptions opts; 451 opts.gc_limit = 0; // Cache only the last state for fastest copy. 452 *ofst = SynchronizeFst<Arc>(ifst, opts); 453} 454 455} // namespace fst 456 457#endif // FST_LIB_SYNCHRONIZE_H__ 458