determinize.h revision f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2
1// determinize.h 2 3 4// Licensed under the Apache License, Version 2.0 (the "License"); 5// you may not use this file except in compliance with the License. 6// You may obtain a copy of the License at 7// 8// http://www.apache.org/licenses/LICENSE-2.0 9// 10// Unless required by applicable law or agreed to in writing, software 11// distributed under the License is distributed on an "AS IS" BASIS, 12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13// See the License for the specific language governing permissions and 14// limitations under the License. 15// 16// Copyright 2005-2010 Google, Inc. 17// Author: riley@google.com (Michael Riley) 18// 19// \file 20// Functions and classes to determinize an FST. 21 22#ifndef FST_LIB_DETERMINIZE_H__ 23#define FST_LIB_DETERMINIZE_H__ 24 25#include <algorithm> 26#include <climits> 27#include <unordered_map> 28using std::tr1::unordered_map; 29using std::tr1::unordered_multimap; 30#include <map> 31#include <fst/slist.h> 32#include <string> 33#include <vector> 34using std::vector; 35 36#include <fst/cache.h> 37#include <fst/factor-weight.h> 38#include <fst/arc-map.h> 39#include <fst/prune.h> 40#include <fst/test-properties.h> 41 42 43namespace fst { 44 45// 46// COMMON DIVISORS - these are used in determinization to compute 47// the transition weights. In the simplest case, it is just the same 48// as the semiring Plus(). However, other choices permit more efficient 49// determinization when the output contains strings. 50// 51 52// The default common divisor uses the semiring Plus. 53template <class W> 54class DefaultCommonDivisor { 55 public: 56 typedef W Weight; 57 58 W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); } 59}; 60 61 62// The label common divisor for a (left) string semiring selects a 63// single letter common prefix or the empty string. This is used in 64// the determinization of output strings so that at most a single 65// letter will appear in the output of a transtion. 66template <typename L, StringType S> 67class LabelCommonDivisor { 68 public: 69 typedef StringWeight<L, S> Weight; 70 71 Weight operator()(const Weight &w1, const Weight &w2) const { 72 StringWeightIterator<L, S> iter1(w1); 73 StringWeightIterator<L, S> iter2(w2); 74 75 if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) { 76 FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring"; 77 return Weight::NoWeight(); 78 } else if (w1.Size() == 0 || w2.Size() == 0) { 79 return Weight::One(); 80 } else if (w1 == Weight::Zero()) { 81 return Weight(iter2.Value()); 82 } else if (w2 == Weight::Zero()) { 83 return Weight(iter1.Value()); 84 } else if (iter1.Value() == iter2.Value()) { 85 return Weight(iter1.Value()); 86 } else { 87 return Weight::One(); 88 } 89 } 90}; 91 92 93// The gallic common divisor uses the label common divisor on the 94// string component and the template argument D common divisor on the 95// weight component, which defaults to the default common divisor. 96template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> > 97class GallicCommonDivisor { 98 public: 99 typedef GallicWeight<L, W, S> Weight; 100 101 Weight operator()(const Weight &w1, const Weight &w2) const { 102 return Weight(label_common_divisor_(w1.Value1(), w2.Value1()), 103 weight_common_divisor_(w1.Value2(), w2.Value2())); 104 } 105 106 private: 107 LabelCommonDivisor<L, S> label_common_divisor_; 108 D weight_common_divisor_; 109}; 110 111// Options for finite-state transducer determinization. 112template <class Arc> 113struct DeterminizeFstOptions : CacheOptions { 114 typedef typename Arc::Label Label; 115 float delta; // Quantization delta for subset weights 116 Label subsequential_label; // Label used for residual final output 117 // when producing subsequential transducers. 118 119 explicit DeterminizeFstOptions(const CacheOptions &opts, 120 float del = kDelta, 121 Label lab = 0) 122 : CacheOptions(opts), delta(del), subsequential_label(lab) {} 123 124 explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0) 125 : delta(del), subsequential_label(lab) {} 126}; 127 128 129// Implementation of delayed DeterminizeFst. This base class is 130// common to the variants that implement acceptor and transducer 131// determinization. 132template <class A> 133class DeterminizeFstImplBase : public CacheImpl<A> { 134 public: 135 using FstImpl<A>::SetType; 136 using FstImpl<A>::SetProperties; 137 using FstImpl<A>::Properties; 138 using FstImpl<A>::SetInputSymbols; 139 using FstImpl<A>::SetOutputSymbols; 140 141 using CacheBaseImpl< CacheState<A> >::HasStart; 142 using CacheBaseImpl< CacheState<A> >::HasFinal; 143 using CacheBaseImpl< CacheState<A> >::HasArcs; 144 using CacheBaseImpl< CacheState<A> >::SetFinal; 145 using CacheBaseImpl< CacheState<A> >::SetStart; 146 147 typedef typename A::Label Label; 148 typedef typename A::Weight Weight; 149 typedef typename A::StateId StateId; 150 typedef CacheState<A> State; 151 152 DeterminizeFstImplBase(const Fst<A> &fst, 153 const DeterminizeFstOptions<A> &opts) 154 : CacheImpl<A>(opts), fst_(fst.Copy()) { 155 SetType("determinize"); 156 uint64 props = fst.Properties(kFstProperties, false); 157 SetProperties(DeterminizeProperties(props, 158 opts.subsequential_label != 0), 159 kCopyProperties); 160 161 SetInputSymbols(fst.InputSymbols()); 162 SetOutputSymbols(fst.OutputSymbols()); 163 } 164 165 DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl) 166 : CacheImpl<A>(impl), 167 fst_(impl.fst_->Copy(true)) { 168 SetType("determinize"); 169 SetProperties(impl.Properties(), kCopyProperties); 170 SetInputSymbols(impl.InputSymbols()); 171 SetOutputSymbols(impl.OutputSymbols()); 172 } 173 174 virtual ~DeterminizeFstImplBase() { delete fst_; } 175 176 virtual DeterminizeFstImplBase<A> *Copy() = 0; 177 178 StateId Start() { 179 if (!HasStart()) { 180 StateId start = ComputeStart(); 181 if (start != kNoStateId) { 182 SetStart(start); 183 } 184 } 185 return CacheImpl<A>::Start(); 186 } 187 188 Weight Final(StateId s) { 189 if (!HasFinal(s)) { 190 Weight final = ComputeFinal(s); 191 SetFinal(s, final); 192 } 193 return CacheImpl<A>::Final(s); 194 } 195 196 virtual void Expand(StateId s) = 0; 197 198 size_t NumArcs(StateId s) { 199 if (!HasArcs(s)) 200 Expand(s); 201 return CacheImpl<A>::NumArcs(s); 202 } 203 204 size_t NumInputEpsilons(StateId s) { 205 if (!HasArcs(s)) 206 Expand(s); 207 return CacheImpl<A>::NumInputEpsilons(s); 208 } 209 210 size_t NumOutputEpsilons(StateId s) { 211 if (!HasArcs(s)) 212 Expand(s); 213 return CacheImpl<A>::NumOutputEpsilons(s); 214 } 215 216 void InitArcIterator(StateId s, ArcIteratorData<A> *data) { 217 if (!HasArcs(s)) 218 Expand(s); 219 CacheImpl<A>::InitArcIterator(s, data); 220 } 221 222 virtual StateId ComputeStart() = 0; 223 224 virtual Weight ComputeFinal(StateId s) = 0; 225 226 const Fst<A> &GetFst() const { return *fst_; } 227 228 private: 229 const Fst<A> *fst_; // Input Fst 230 231 void operator=(const DeterminizeFstImplBase<A> &); // disallow 232}; 233 234 235// Implementation of delayed determinization for weighted acceptors. 236// It is templated on the arc type A and the common divisor D. 237template <class A, class D> 238class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> { 239 public: 240 using FstImpl<A>::SetProperties; 241 using DeterminizeFstImplBase<A>::GetFst; 242 using DeterminizeFstImplBase<A>::SetArcs; 243 244 typedef typename A::Label Label; 245 typedef typename A::Weight Weight; 246 typedef typename A::StateId StateId; 247 248 struct Element { 249 Element() {} 250 251 Element(StateId s, Weight w) : state_id(s), weight(w) {} 252 253 StateId state_id; // Input state Id 254 Weight weight; // Residual weight 255 }; 256 typedef slist<Element> Subset; 257 typedef map<Label, Subset*> LabelMap; 258 259 DeterminizeFsaImpl(const Fst<A> &fst, D common_divisor, 260 const vector<Weight> *in_dist, vector<Weight> *out_dist, 261 const DeterminizeFstOptions<A> &opts) 262 : DeterminizeFstImplBase<A>(fst, opts), 263 delta_(opts.delta), 264 in_dist_(in_dist), 265 out_dist_(out_dist), 266 common_divisor_(common_divisor), 267 subset_hash_(0, SubsetKey(), SubsetEqual(&elements_)) { 268 if (!fst.Properties(kAcceptor, true)) { 269 FSTERROR() << "DeterminizeFst: argument not an acceptor"; 270 SetProperties(kError, kError); 271 } 272 if (!(Weight::Properties() & kLeftSemiring)) { 273 FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: " 274 << Weight::Type(); 275 SetProperties(kError, kError); 276 } 277 if (out_dist_) 278 out_dist_->clear(); 279 } 280 281 DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D> &impl) 282 : DeterminizeFstImplBase<A>(impl), 283 delta_(impl.delta_), 284 in_dist_(0), 285 out_dist_(0), 286 common_divisor_(impl.common_divisor_), 287 subset_hash_(0, SubsetKey(), SubsetEqual(&elements_)) { 288 if (impl.out_dist_) { 289 FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector"; 290 SetProperties(kError, kError); 291 } 292 } 293 294 virtual ~DeterminizeFsaImpl() { 295 for (int i = 0; i < subsets_.size(); ++i) 296 delete subsets_[i]; 297 } 298 299 virtual DeterminizeFsaImpl<A, D> *Copy() { 300 return new DeterminizeFsaImpl<A, D>(*this); 301 } 302 303 uint64 Properties() const { return Properties(kFstProperties); } 304 305 // Set error if found; return FST impl properties. 306 uint64 Properties(uint64 mask) const { 307 if ((mask & kError) && (GetFst().Properties(kError, false))) 308 SetProperties(kError, kError); 309 return FstImpl<A>::Properties(mask); 310 } 311 312 virtual StateId ComputeStart() { 313 StateId s = GetFst().Start(); 314 if (s == kNoStateId) 315 return kNoStateId; 316 Element element(s, Weight::One()); 317 Subset *subset = new Subset; 318 subset->push_front(element); 319 return FindState(subset); 320 } 321 322 virtual Weight ComputeFinal(StateId s) { 323 Subset *subset = subsets_[s]; 324 Weight final = Weight::Zero(); 325 for (typename Subset::iterator siter = subset->begin(); 326 siter != subset->end(); 327 ++siter) { 328 Element &element = *siter; 329 final = Plus(final, Times(element.weight, 330 GetFst().Final(element.state_id))); 331 if (!final.Member()) 332 SetProperties(kError, kError); 333 } 334 return final; 335 } 336 337 // Finds the state corresponding to a subset. Only creates a new state 338 // if the subset is not found in the subset hash. FindState takes 339 // ownership of the subset argument (so that it doesn't have to copy it 340 // if it creates a new state). 341 // 342 // The method exploits the following device: all pairs stored in the 343 // associative container subset_hash_ are of the form (subset, 344 // id(subset) + 1), i.e. subset_hash_[subset] > 0 if subset has been 345 // stored previously. For unassigned subsets, the call to 346 // subset_hash_[subset] creates a new pair (subset, 0). As a result, 347 // subset_hash_[subset] == 0 iff subset is new. 348 StateId FindState(Subset *subset) { 349 StateId &assoc_value = subset_hash_[subset]; 350 if (assoc_value == 0) { // subset wasn't present; create new state 351 StateId s = CreateState(subset); 352 assoc_value = s + 1; 353 return s; 354 } else { 355 delete subset; 356 return assoc_value - 1; // NB: assoc_value = ID + 1 357 } 358 } 359 360 StateId CreateState(Subset *subset) { 361 StateId s = subsets_.size(); 362 subsets_.push_back(subset); 363 if (in_dist_) 364 out_dist_->push_back(ComputeDistance(subset)); 365 return s; 366 } 367 368 // Compute distance from a state to the final states in the DFA 369 // given the distances in the NFA. 370 Weight ComputeDistance(const Subset *subset) { 371 Weight outd = Weight::Zero(); 372 for (typename Subset::const_iterator siter = subset->begin(); 373 siter != subset->end(); ++siter) { 374 const Element &element = *siter; 375 Weight ind = element.state_id < in_dist_->size() ? 376 (*in_dist_)[element.state_id] : Weight::Zero(); 377 outd = Plus(outd, Times(element.weight, ind)); 378 } 379 return outd; 380 } 381 382 // Computes the outgoing transitions from a state, creating new destination 383 // states as needed. 384 virtual void Expand(StateId s) { 385 386 LabelMap label_map; 387 LabelSubsets(s, &label_map); 388 389 for (typename LabelMap::iterator liter = label_map.begin(); 390 liter != label_map.end(); 391 ++liter) 392 AddArc(s, liter->first, liter->second); 393 SetArcs(s); 394 } 395 396 private: 397 // Constructs destination subsets per label. At return, subset 398 // element weights include the input automaton label weights and the 399 // subsets may contain duplicate states. 400 void LabelSubsets(StateId s, LabelMap *label_map) { 401 Subset *src_subset = subsets_[s]; 402 403 for (typename Subset::iterator siter = src_subset->begin(); 404 siter != src_subset->end(); 405 ++siter) { 406 Element &src_element = *siter; 407 for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id); 408 !aiter.Done(); 409 aiter.Next()) { 410 const A &arc = aiter.Value(); 411 Element dest_element(arc.nextstate, 412 Times(src_element.weight, arc.weight)); 413 Subset* &dest_subset = (*label_map)[arc.ilabel]; 414 if (dest_subset == 0) 415 dest_subset = new Subset; 416 dest_subset->push_front(dest_element); 417 } 418 } 419 } 420 421 // Adds an arc from state S to the destination state associated 422 // with subset DEST_SUBSET (as created by LabelSubsets). 423 void AddArc(StateId s, Label label, Subset *dest_subset) { 424 A arc; 425 arc.ilabel = label; 426 arc.olabel = label; 427 arc.weight = Weight::Zero(); 428 429 typename Subset::iterator oiter; 430 for (typename Subset::iterator diter = dest_subset->begin(); 431 diter != dest_subset->end();) { 432 Element &dest_element = *diter; 433 // Computes label weight. 434 arc.weight = common_divisor_(arc.weight, dest_element.weight); 435 436 while (elements_.size() <= dest_element.state_id) 437 elements_.push_back(0); 438 Element *matching_element = elements_[dest_element.state_id]; 439 if (matching_element) { 440 // Found duplicate state: sums state weight and deletes dup. 441 matching_element->weight = Plus(matching_element->weight, 442 dest_element.weight); 443 if (!matching_element->weight.Member()) 444 SetProperties(kError, kError); 445 ++diter; 446 dest_subset->erase_after(oiter); 447 } else { 448 // Saves element so we can check for duplicate for this state. 449 elements_[dest_element.state_id] = &dest_element; 450 oiter = diter; 451 ++diter; 452 } 453 } 454 455 // Divides out label weight from destination subset elements. 456 // Quantizes to ensure comparisons are effective. 457 // Clears element vector. 458 for (typename Subset::iterator diter = dest_subset->begin(); 459 diter != dest_subset->end(); 460 ++diter) { 461 Element &dest_element = *diter; 462 dest_element.weight = Divide(dest_element.weight, arc.weight, 463 DIVIDE_LEFT); 464 dest_element.weight = dest_element.weight.Quantize(delta_); 465 elements_[dest_element.state_id] = 0; 466 } 467 468 arc.nextstate = FindState(dest_subset); 469 CacheImpl<A>::PushArc(s, arc); 470 } 471 472 // Comparison object for hashing Subset(s). Subsets are not sorted in this 473 // implementation, so ordering must not be assumed in the equivalence 474 // test. 475 class SubsetEqual { 476 public: 477 // Constructor takes vector needed to check equality. See immediately 478 // below for constraints on it. 479 explicit SubsetEqual(vector<Element *> *elements) 480 : elements_(elements) {} 481 482 // At each call to operator(), the elements_ vector should contain 483 // only NULLs. When this operator returns, elements_ will still 484 // have this property. 485 bool operator()(Subset* subset1, Subset* subset2) const { 486 if (subset1->size() != subset2->size()) 487 return false; 488 489 // Loads first subset elements in element vector. 490 for (typename Subset::iterator iter1 = subset1->begin(); 491 iter1 != subset1->end(); 492 ++iter1) { 493 Element &element1 = *iter1; 494 while (elements_->size() <= element1.state_id) 495 elements_->push_back(0); 496 (*elements_)[element1.state_id] = &element1; 497 } 498 499 // Checks second subset matches first via element vector. 500 for (typename Subset::iterator iter2 = subset2->begin(); 501 iter2 != subset2->end(); 502 ++iter2) { 503 Element &element2 = *iter2; 504 while (elements_->size() <= element2.state_id) 505 elements_->push_back(0); 506 Element *element1 = (*elements_)[element2.state_id]; 507 if (!element1 || element1->weight != element2.weight) { 508 // Mismatch found. Resets element vector before returning false. 509 for (typename Subset::iterator iter1 = subset1->begin(); 510 iter1 != subset1->end(); 511 ++iter1) 512 (*elements_)[iter1->state_id] = 0; 513 return false; 514 } else { 515 (*elements_)[element2.state_id] = 0; // Clears entry 516 } 517 } 518 return true; 519 } 520 private: 521 vector<Element *> *elements_; 522 }; 523 524 // Hash function for Subset to Fst states. Subset elements are not 525 // sorted in this implementation, so the hash must be invariant 526 // under subset reordering. 527 class SubsetKey { 528 public: 529 size_t operator()(const Subset* subset) const { 530 size_t hash = 0; 531 for (typename Subset::const_iterator iter = subset->begin(); 532 iter != subset->end(); 533 ++iter) { 534 const Element &element = *iter; 535 int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1; 536 int rshift = CHAR_BIT * sizeof(size_t) - lshift; 537 size_t n = element.state_id; 538 hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash(); 539 } 540 return hash; 541 } 542 }; 543 544 float delta_; // Quantization delta for subset weights 545 const vector<Weight> *in_dist_; // Distance to final NFA states 546 vector<Weight> *out_dist_; // Distance to final DFA states 547 548 D common_divisor_; 549 550 // Used to test equivalence of subsets. 551 vector<Element *> elements_; 552 553 // Maps from StateId to Subset. 554 vector<Subset *> subsets_; 555 556 // Hashes from Subset to its StateId in the output automaton. 557 typedef unordered_map<Subset *, StateId, SubsetKey, SubsetEqual> 558 SubsetHash; 559 560 // Hashes from Label to Subsets corr. to destination states of current state. 561 SubsetHash subset_hash_; 562 563 void operator=(const DeterminizeFsaImpl<A, D> &); // disallow 564}; 565 566 567// Implementation of delayed determinization for transducers. 568// Transducer determinization is implemented by mapping the input to 569// the Gallic semiring as an acceptor whose weights contain the output 570// strings and using acceptor determinization above to determinize 571// that acceptor. 572template <class A, StringType S> 573class DeterminizeFstImpl : public DeterminizeFstImplBase<A> { 574 public: 575 using FstImpl<A>::SetProperties; 576 using DeterminizeFstImplBase<A>::GetFst; 577 using CacheBaseImpl< CacheState<A> >::GetCacheGc; 578 using CacheBaseImpl< CacheState<A> >::GetCacheLimit; 579 580 typedef typename A::Label Label; 581 typedef typename A::Weight Weight; 582 typedef typename A::StateId StateId; 583 584 typedef ToGallicMapper<A, S> ToMapper; 585 typedef FromGallicMapper<A, S> FromMapper; 586 587 typedef typename ToMapper::ToArc ToArc; 588 typedef ArcMapFst<A, ToArc, ToMapper> ToFst; 589 typedef ArcMapFst<ToArc, A, FromMapper> FromFst; 590 591 typedef GallicCommonDivisor<Label, Weight, S> CommonDivisor; 592 typedef GallicFactor<Label, Weight, S> FactorIterator; 593 594 DeterminizeFstImpl(const Fst<A> &fst, const DeterminizeFstOptions<A> &opts) 595 : DeterminizeFstImplBase<A>(fst, opts), 596 delta_(opts.delta), 597 subsequential_label_(opts.subsequential_label) { 598 Init(GetFst()); 599 } 600 601 DeterminizeFstImpl(const DeterminizeFstImpl<A, S> &impl) 602 : DeterminizeFstImplBase<A>(impl), 603 delta_(impl.delta_), 604 subsequential_label_(impl.subsequential_label_) { 605 Init(GetFst()); 606 } 607 608 ~DeterminizeFstImpl() { delete from_fst_; } 609 610 virtual DeterminizeFstImpl<A, S> *Copy() { 611 return new DeterminizeFstImpl<A, S>(*this); 612 } 613 614 uint64 Properties() const { return Properties(kFstProperties); } 615 616 // Set error if found; return FST impl properties. 617 uint64 Properties(uint64 mask) const { 618 if ((mask & kError) && (GetFst().Properties(kError, false) || 619 from_fst_->Properties(kError, false))) 620 SetProperties(kError, kError); 621 return FstImpl<A>::Properties(mask); 622 } 623 624 virtual StateId ComputeStart() { return from_fst_->Start(); } 625 626 virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); } 627 628 virtual void Expand(StateId s) { 629 for (ArcIterator<FromFst> aiter(*from_fst_, s); 630 !aiter.Done(); 631 aiter.Next()) 632 CacheImpl<A>::PushArc(s, aiter.Value()); 633 CacheImpl<A>::SetArcs(s); 634 } 635 636 private: 637 // Initialization of transducer determinization implementation, which 638 // is defined after DeterminizeFst since it calls it. 639 void Init(const Fst<A> &fst); 640 641 float delta_; 642 Label subsequential_label_; 643 FromFst *from_fst_; 644 645 void operator=(const DeterminizeFstImpl<A, S> &); // disallow 646}; 647 648 649// Determinizes a weighted transducer. This version is a delayed 650// Fst. The result will be an equivalent FST that has the property 651// that no state has two transitions with the same input label. 652// For this algorithm, epsilon transitions are treated as regular 653// symbols (cf. RmEpsilon). 654// 655// The transducer must be functional. The weights must be (weakly) 656// left divisible (valid for TropicalWeight and LogWeight for instance) 657// and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0. 658// 659// Complexity: 660// - Determinizable: exponential (polynomial in the size of the output) 661// - Non-determinizable) does not terminate 662// 663// The determinizable automata include all unweighted and all acyclic input. 664// 665// References: 666// - Mehryar Mohri, "Finite-State Transducers in Language and Speech 667// Processing". Computational Linguistics, 23:2, 1997. 668// 669// This class attaches interface to implementation and handles 670// reference counting, delegating most methods to ImplToFst. 671template <class A> 672class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> > { 673 public: 674 friend class ArcIterator< DeterminizeFst<A> >; 675 friend class StateIterator< DeterminizeFst<A> >; 676 template <class B, StringType S> friend class DeterminizeFstImpl; 677 678 typedef A Arc; 679 typedef typename A::Weight Weight; 680 typedef typename A::StateId StateId; 681 typedef typename A::Label Label; 682 typedef CacheState<A> State; 683 typedef DeterminizeFstImplBase<A> Impl; 684 685 using ImplToFst<Impl>::SetImpl; 686 687 explicit DeterminizeFst( 688 const Fst<A> &fst, 689 const DeterminizeFstOptions<A> &opts = DeterminizeFstOptions<A>()) { 690 if (fst.Properties(kAcceptor, true)) { 691 // Calls implementation for acceptors. 692 typedef DefaultCommonDivisor<Weight> D; 693 SetImpl(new DeterminizeFsaImpl<A, D>(fst, D(), 0, 0, opts)); 694 } else { 695 // Calls implementation for transducers. 696 SetImpl(new DeterminizeFstImpl<A, STRING_LEFT_RESTRICT>(fst, opts)); 697 } 698 } 699 700 // This acceptor-only version additionally computes the distance to 701 // final states in the output if provided with those distances for the 702 // input. Useful for e.g. unique N-shortest paths. 703 DeterminizeFst( 704 const Fst<A> &fst, 705 const vector<Weight> &in_dist, vector<Weight> *out_dist, 706 const DeterminizeFstOptions<A> &opts = DeterminizeFstOptions<A>()) { 707 if (!fst.Properties(kAcceptor, true)) { 708 FSTERROR() << "DeterminizeFst:" 709 << " distance to final states computed for acceptors only"; 710 GetImpl()->SetProperties(kError, kError); 711 } 712 typedef DefaultCommonDivisor<Weight> D; 713 SetImpl(new DeterminizeFsaImpl<A, D>(fst, D(), &in_dist, out_dist, opts)); 714 } 715 716 // See Fst<>::Copy() for doc. 717 DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) { 718 if (safe) 719 SetImpl(fst.GetImpl()->Copy()); 720 else 721 SetImpl(fst.GetImpl(), false); 722 } 723 724 // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc. 725 virtual DeterminizeFst<A> *Copy(bool safe = false) const { 726 return new DeterminizeFst<A>(*this, safe); 727 } 728 729 virtual inline void InitStateIterator(StateIteratorData<A> *data) const; 730 731 virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { 732 GetImpl()->InitArcIterator(s, data); 733 } 734 735 private: 736 // This private version is for passing the common divisor to 737 // FSA determinization. 738 template <class D> 739 DeterminizeFst(const Fst<A> &fst, const D &common_div, 740 const DeterminizeFstOptions<A> &opts) 741 : ImplToFst<Impl>( 742 new DeterminizeFsaImpl<A, D>(fst, common_div, 0, 0, opts)) {} 743 744 // Makes visible to friends. 745 Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); } 746 747 void operator=(const DeterminizeFst<A> &fst); // Disallow 748}; 749 750 751// Initialization of transducer determinization implementation. which 752// is defined after DeterminizeFst since it calls it. 753template <class A, StringType S> 754void DeterminizeFstImpl<A, S>::Init(const Fst<A> &fst) { 755 // Mapper to an acceptor. 756 ToFst to_fst(fst, ToMapper()); 757 758 // Determinize acceptor. 759 // This recursive call terminates since it passes the common divisor 760 // to a private constructor. 761 CacheOptions copts(GetCacheGc(), GetCacheLimit()); 762 DeterminizeFstOptions<ToArc> dopts(copts, delta_); 763 DeterminizeFst<ToArc> det_fsa(to_fst, CommonDivisor(), dopts); 764 765 // Mapper back to transducer. 766 FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_, 767 kFactorFinalWeights, 768 subsequential_label_, 769 subsequential_label_); 770 FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts); 771 from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_)); 772} 773 774 775// Specialization for DeterminizeFst. 776template <class A> 777class StateIterator< DeterminizeFst<A> > 778 : public CacheStateIterator< DeterminizeFst<A> > { 779 public: 780 explicit StateIterator(const DeterminizeFst<A> &fst) 781 : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {} 782}; 783 784 785// Specialization for DeterminizeFst. 786template <class A> 787class ArcIterator< DeterminizeFst<A> > 788 : public CacheArcIterator< DeterminizeFst<A> > { 789 public: 790 typedef typename A::StateId StateId; 791 792 ArcIterator(const DeterminizeFst<A> &fst, StateId s) 793 : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) { 794 if (!fst.GetImpl()->HasArcs(s)) 795 fst.GetImpl()->Expand(s); 796 } 797 798 private: 799 DISALLOW_COPY_AND_ASSIGN(ArcIterator); 800}; 801 802 803template <class A> inline 804void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const 805{ 806 data->base = new StateIterator< DeterminizeFst<A> >(*this); 807} 808 809 810// Useful aliases when using StdArc. 811typedef DeterminizeFst<StdArc> StdDeterminizeFst; 812 813 814template <class Arc> 815struct DeterminizeOptions { 816 typedef typename Arc::StateId StateId; 817 typedef typename Arc::Weight Weight; 818 typedef typename Arc::Label Label; 819 820 float delta; // Quantization delta for subset weights. 821 Weight weight_threshold; // Pruning weight threshold. 822 StateId state_threshold; // Pruning state threshold. 823 Label subsequential_label; // Label used for residual final output 824 // when producing subsequential transducers. 825 826 explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(), 827 StateId n = kNoStateId, Label l = 0) 828 : delta(d), weight_threshold(w), state_threshold(n), 829 subsequential_label(l) {} 830}; 831 832 833// Determinizes a weighted transducer. This version writes the 834// determinized Fst to an output MutableFst. The result will be an 835// equivalent FSt that has the property that no state has two 836// transitions with the same input label. For this algorithm, epsilon 837// transitions are treated as regular symbols (cf. RmEpsilon). 838// 839// The transducer must be functional. The weights must be (weakly) 840// left divisible (valid for TropicalWeight and LogWeight). 841// 842// Complexity: 843// - Determinizable: exponential (polynomial in the size of the output) 844// - Non-determinizable: does not terminate 845// 846// The determinizable automata include all unweighted and all acyclic input. 847// 848// References: 849// - Mehryar Mohri, "Finite-State Transducers in Language and Speech 850// Processing". Computational Linguistics, 23:2, 1997. 851template <class Arc> 852void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst, 853 const DeterminizeOptions<Arc> &opts 854 = DeterminizeOptions<Arc>()) { 855 typedef typename Arc::StateId StateId; 856 typedef typename Arc::Weight Weight; 857 858 DeterminizeFstOptions<Arc> nopts; 859 nopts.delta = opts.delta; 860 nopts.subsequential_label = opts.subsequential_label; 861 862 nopts.gc_limit = 0; // Cache only the last state for fastest copy. 863 864 if (opts.weight_threshold != Weight::Zero() || 865 opts.state_threshold != kNoStateId) { 866 if (ifst.Properties(kAcceptor, false)) { 867 vector<Weight> idistance, odistance; 868 ShortestDistance(ifst, &idistance, true); 869 DeterminizeFst<Arc> dfst(ifst, idistance, &odistance, nopts); 870 PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold, 871 opts.state_threshold, 872 AnyArcFilter<Arc>(), 873 &odistance); 874 Prune(dfst, ofst, popts); 875 } else { 876 *ofst = DeterminizeFst<Arc>(ifst, nopts); 877 Prune(ofst, opts.weight_threshold, opts.state_threshold); 878 } 879 } else { 880 *ofst = DeterminizeFst<Arc>(ifst, nopts); 881 } 882} 883 884 885} // namespace fst 886 887#endif // FST_LIB_DETERMINIZE_H__ 888