determinize.h revision f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2
1// determinize.h
2
3
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//     http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// Copyright 2005-2010 Google, Inc.
17// Author: riley@google.com (Michael Riley)
18//
19// \file
20// Functions and classes to determinize an FST.
21
22#ifndef FST_LIB_DETERMINIZE_H__
23#define FST_LIB_DETERMINIZE_H__
24
25#include <algorithm>
26#include <climits>
27#include <unordered_map>
28using std::tr1::unordered_map;
29using std::tr1::unordered_multimap;
30#include <map>
31#include <fst/slist.h>
32#include <string>
33#include <vector>
34using std::vector;
35
36#include <fst/cache.h>
37#include <fst/factor-weight.h>
38#include <fst/arc-map.h>
39#include <fst/prune.h>
40#include <fst/test-properties.h>
41
42
43namespace fst {
44
45//
46// COMMON DIVISORS - these are used in determinization to compute
47// the transition weights. In the simplest case, it is just the same
48// as the semiring Plus(). However, other choices permit more efficient
49// determinization when the output contains strings.
50//
51
52// The default common divisor uses the semiring Plus.
53template <class W>
54class DefaultCommonDivisor {
55 public:
56  typedef W Weight;
57
58  W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); }
59};
60
61
62// The label common divisor for a (left) string semiring selects a
63// single letter common prefix or the empty string. This is used in
64// the determinization of output strings so that at most a single
65// letter will appear in the output of a transtion.
66template <typename L, StringType S>
67class LabelCommonDivisor {
68 public:
69  typedef StringWeight<L, S> Weight;
70
71  Weight operator()(const Weight &w1, const Weight &w2) const {
72    StringWeightIterator<L, S> iter1(w1);
73    StringWeightIterator<L, S> iter2(w2);
74
75    if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) {
76      FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring";
77      return Weight::NoWeight();
78    } else if (w1.Size() == 0 || w2.Size() == 0) {
79      return Weight::One();
80    } else if (w1 == Weight::Zero()) {
81      return Weight(iter2.Value());
82    } else if (w2 == Weight::Zero()) {
83      return Weight(iter1.Value());
84    } else if (iter1.Value() == iter2.Value()) {
85      return Weight(iter1.Value());
86    } else {
87      return Weight::One();
88    }
89  }
90};
91
92
93// The gallic common divisor uses the label common divisor on the
94// string component and the template argument D common divisor on the
95// weight component, which defaults to the default common divisor.
96template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> >
97class GallicCommonDivisor {
98 public:
99  typedef GallicWeight<L, W, S> Weight;
100
101  Weight operator()(const Weight &w1, const Weight &w2) const {
102    return Weight(label_common_divisor_(w1.Value1(), w2.Value1()),
103                  weight_common_divisor_(w1.Value2(), w2.Value2()));
104  }
105
106 private:
107  LabelCommonDivisor<L, S> label_common_divisor_;
108  D weight_common_divisor_;
109};
110
111// Options for finite-state transducer determinization.
112template <class Arc>
113struct DeterminizeFstOptions : CacheOptions {
114  typedef typename Arc::Label Label;
115  float delta;                // Quantization delta for subset weights
116  Label subsequential_label;  // Label used for residual final output
117                              // when producing subsequential transducers.
118
119  explicit DeterminizeFstOptions(const CacheOptions &opts,
120                                 float del = kDelta,
121                                 Label lab = 0)
122      : CacheOptions(opts), delta(del), subsequential_label(lab) {}
123
124  explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0)
125      : delta(del), subsequential_label(lab) {}
126};
127
128
129// Implementation of delayed DeterminizeFst. This base class is
130// common to the variants that implement acceptor and transducer
131// determinization.
132template <class A>
133class DeterminizeFstImplBase : public CacheImpl<A> {
134 public:
135  using FstImpl<A>::SetType;
136  using FstImpl<A>::SetProperties;
137  using FstImpl<A>::Properties;
138  using FstImpl<A>::SetInputSymbols;
139  using FstImpl<A>::SetOutputSymbols;
140
141  using CacheBaseImpl< CacheState<A> >::HasStart;
142  using CacheBaseImpl< CacheState<A> >::HasFinal;
143  using CacheBaseImpl< CacheState<A> >::HasArcs;
144  using CacheBaseImpl< CacheState<A> >::SetFinal;
145  using CacheBaseImpl< CacheState<A> >::SetStart;
146
147  typedef typename A::Label Label;
148  typedef typename A::Weight Weight;
149  typedef typename A::StateId StateId;
150  typedef CacheState<A> State;
151
152  DeterminizeFstImplBase(const Fst<A> &fst,
153                         const DeterminizeFstOptions<A> &opts)
154      : CacheImpl<A>(opts), fst_(fst.Copy()) {
155    SetType("determinize");
156    uint64 props = fst.Properties(kFstProperties, false);
157    SetProperties(DeterminizeProperties(props,
158                                        opts.subsequential_label != 0),
159                  kCopyProperties);
160
161    SetInputSymbols(fst.InputSymbols());
162    SetOutputSymbols(fst.OutputSymbols());
163  }
164
165  DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl)
166      : CacheImpl<A>(impl),
167        fst_(impl.fst_->Copy(true)) {
168    SetType("determinize");
169    SetProperties(impl.Properties(), kCopyProperties);
170    SetInputSymbols(impl.InputSymbols());
171    SetOutputSymbols(impl.OutputSymbols());
172  }
173
174  virtual ~DeterminizeFstImplBase() { delete fst_; }
175
176  virtual DeterminizeFstImplBase<A> *Copy() = 0;
177
178  StateId Start() {
179    if (!HasStart()) {
180      StateId start = ComputeStart();
181      if (start != kNoStateId) {
182        SetStart(start);
183      }
184    }
185    return CacheImpl<A>::Start();
186  }
187
188  Weight Final(StateId s) {
189    if (!HasFinal(s)) {
190      Weight final = ComputeFinal(s);
191      SetFinal(s, final);
192    }
193    return CacheImpl<A>::Final(s);
194  }
195
196  virtual void Expand(StateId s) = 0;
197
198  size_t NumArcs(StateId s) {
199    if (!HasArcs(s))
200      Expand(s);
201    return CacheImpl<A>::NumArcs(s);
202  }
203
204  size_t NumInputEpsilons(StateId s) {
205    if (!HasArcs(s))
206      Expand(s);
207    return CacheImpl<A>::NumInputEpsilons(s);
208  }
209
210  size_t NumOutputEpsilons(StateId s) {
211    if (!HasArcs(s))
212      Expand(s);
213    return CacheImpl<A>::NumOutputEpsilons(s);
214  }
215
216  void InitArcIterator(StateId s, ArcIteratorData<A> *data) {
217    if (!HasArcs(s))
218      Expand(s);
219    CacheImpl<A>::InitArcIterator(s, data);
220  }
221
222  virtual StateId ComputeStart() = 0;
223
224  virtual Weight ComputeFinal(StateId s) = 0;
225
226  const Fst<A> &GetFst() const { return *fst_; }
227
228 private:
229  const Fst<A> *fst_;            // Input Fst
230
231  void operator=(const DeterminizeFstImplBase<A> &);  // disallow
232};
233
234
235// Implementation of delayed determinization for weighted acceptors.
236// It is templated on the arc type A and the common divisor D.
237template <class A, class D>
238class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> {
239 public:
240  using FstImpl<A>::SetProperties;
241  using DeterminizeFstImplBase<A>::GetFst;
242  using DeterminizeFstImplBase<A>::SetArcs;
243
244  typedef typename A::Label Label;
245  typedef typename A::Weight Weight;
246  typedef typename A::StateId StateId;
247
248  struct Element {
249    Element() {}
250
251    Element(StateId s, Weight w) : state_id(s), weight(w) {}
252
253    StateId state_id;  // Input state Id
254    Weight weight;     // Residual weight
255  };
256  typedef slist<Element> Subset;
257  typedef map<Label, Subset*> LabelMap;
258
259  DeterminizeFsaImpl(const Fst<A> &fst, D common_divisor,
260                     const vector<Weight> *in_dist, vector<Weight> *out_dist,
261                     const DeterminizeFstOptions<A> &opts)
262      : DeterminizeFstImplBase<A>(fst, opts),
263        delta_(opts.delta),
264        in_dist_(in_dist),
265        out_dist_(out_dist),
266        common_divisor_(common_divisor),
267        subset_hash_(0, SubsetKey(), SubsetEqual(&elements_)) {
268    if (!fst.Properties(kAcceptor, true)) {
269      FSTERROR()  << "DeterminizeFst: argument not an acceptor";
270      SetProperties(kError, kError);
271    }
272    if (!(Weight::Properties() & kLeftSemiring)) {
273      FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: "
274                 << Weight::Type();
275      SetProperties(kError, kError);
276    }
277    if (out_dist_)
278      out_dist_->clear();
279  }
280
281  DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D> &impl)
282      : DeterminizeFstImplBase<A>(impl),
283        delta_(impl.delta_),
284        in_dist_(0),
285        out_dist_(0),
286        common_divisor_(impl.common_divisor_),
287        subset_hash_(0, SubsetKey(), SubsetEqual(&elements_)) {
288    if (impl.out_dist_) {
289      FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector";
290      SetProperties(kError, kError);
291    }
292  }
293
294  virtual ~DeterminizeFsaImpl() {
295    for (int i = 0; i < subsets_.size(); ++i)
296      delete subsets_[i];
297  }
298
299  virtual DeterminizeFsaImpl<A, D> *Copy() {
300    return new DeterminizeFsaImpl<A, D>(*this);
301  }
302
303  uint64 Properties() const { return Properties(kFstProperties); }
304
305  // Set error if found; return FST impl properties.
306  uint64 Properties(uint64 mask) const {
307    if ((mask & kError) && (GetFst().Properties(kError, false)))
308      SetProperties(kError, kError);
309    return FstImpl<A>::Properties(mask);
310  }
311
312  virtual StateId ComputeStart() {
313    StateId s = GetFst().Start();
314    if (s == kNoStateId)
315      return kNoStateId;
316    Element element(s, Weight::One());
317    Subset *subset = new Subset;
318    subset->push_front(element);
319    return FindState(subset);
320  }
321
322  virtual Weight ComputeFinal(StateId s) {
323    Subset *subset = subsets_[s];
324    Weight final = Weight::Zero();
325    for (typename Subset::iterator siter = subset->begin();
326         siter != subset->end();
327         ++siter) {
328      Element &element = *siter;
329      final = Plus(final, Times(element.weight,
330                                GetFst().Final(element.state_id)));
331      if (!final.Member())
332        SetProperties(kError, kError);
333    }
334    return final;
335  }
336
337  // Finds the state corresponding to a subset. Only creates a new state
338  // if the subset is not found in the subset hash. FindState takes
339  // ownership of the subset argument (so that it doesn't have to copy it
340  // if it creates a new state).
341  //
342  // The method exploits the following device: all pairs stored in the
343  // associative container subset_hash_ are of the form (subset,
344  // id(subset) + 1), i.e. subset_hash_[subset] > 0 if subset has been
345  // stored previously. For unassigned subsets, the call to
346  // subset_hash_[subset] creates a new pair (subset, 0). As a result,
347  // subset_hash_[subset] == 0 iff subset is new.
348  StateId FindState(Subset *subset) {
349    StateId &assoc_value = subset_hash_[subset];
350    if (assoc_value == 0) {    // subset wasn't present; create new state
351      StateId s = CreateState(subset);
352      assoc_value = s + 1;
353      return  s;
354    } else {
355      delete subset;
356      return assoc_value - 1;  // NB: assoc_value = ID + 1
357    }
358  }
359
360  StateId CreateState(Subset *subset) {
361    StateId s = subsets_.size();
362    subsets_.push_back(subset);
363    if (in_dist_)
364      out_dist_->push_back(ComputeDistance(subset));
365    return s;
366  }
367
368  // Compute distance from a state to the final states in the DFA
369  // given the distances in the NFA.
370  Weight ComputeDistance(const Subset *subset) {
371    Weight outd = Weight::Zero();
372    for (typename Subset::const_iterator siter = subset->begin();
373         siter != subset->end(); ++siter) {
374      const Element &element = *siter;
375      Weight ind = element.state_id < in_dist_->size() ?
376          (*in_dist_)[element.state_id] : Weight::Zero();
377      outd = Plus(outd, Times(element.weight, ind));
378    }
379    return outd;
380  }
381
382  // Computes the outgoing transitions from a state, creating new destination
383  // states as needed.
384  virtual void Expand(StateId s) {
385
386    LabelMap label_map;
387    LabelSubsets(s, &label_map);
388
389    for (typename LabelMap::iterator liter = label_map.begin();
390         liter != label_map.end();
391         ++liter)
392      AddArc(s, liter->first, liter->second);
393    SetArcs(s);
394  }
395
396 private:
397  // Constructs destination subsets per label. At return, subset
398  // element weights include the input automaton label weights and the
399  // subsets may contain duplicate states.
400  void LabelSubsets(StateId s, LabelMap *label_map) {
401    Subset *src_subset = subsets_[s];
402
403    for (typename Subset::iterator siter = src_subset->begin();
404         siter != src_subset->end();
405         ++siter) {
406      Element &src_element = *siter;
407      for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id);
408           !aiter.Done();
409           aiter.Next()) {
410        const A &arc = aiter.Value();
411        Element dest_element(arc.nextstate,
412                             Times(src_element.weight, arc.weight));
413        Subset* &dest_subset = (*label_map)[arc.ilabel];
414        if (dest_subset == 0)
415          dest_subset = new Subset;
416        dest_subset->push_front(dest_element);
417      }
418    }
419  }
420
421  // Adds an arc from state S to the destination state associated
422  // with subset DEST_SUBSET (as created by LabelSubsets).
423  void AddArc(StateId s, Label label, Subset *dest_subset) {
424    A arc;
425    arc.ilabel = label;
426    arc.olabel = label;
427    arc.weight = Weight::Zero();
428
429    typename Subset::iterator oiter;
430    for (typename Subset::iterator diter = dest_subset->begin();
431         diter != dest_subset->end();) {
432      Element &dest_element = *diter;
433      // Computes label weight.
434      arc.weight = common_divisor_(arc.weight, dest_element.weight);
435
436      while (elements_.size() <= dest_element.state_id)
437        elements_.push_back(0);
438      Element *matching_element = elements_[dest_element.state_id];
439      if (matching_element) {
440        // Found duplicate state: sums state weight and deletes dup.
441        matching_element->weight = Plus(matching_element->weight,
442                                        dest_element.weight);
443        if (!matching_element->weight.Member())
444          SetProperties(kError, kError);
445        ++diter;
446        dest_subset->erase_after(oiter);
447      } else {
448        // Saves element so we can check for duplicate for this state.
449        elements_[dest_element.state_id] = &dest_element;
450        oiter = diter;
451        ++diter;
452      }
453    }
454
455    // Divides out label weight from destination subset elements.
456    // Quantizes to ensure comparisons are effective.
457    // Clears element vector.
458    for (typename Subset::iterator diter = dest_subset->begin();
459         diter != dest_subset->end();
460         ++diter) {
461      Element &dest_element = *diter;
462      dest_element.weight = Divide(dest_element.weight, arc.weight,
463                                   DIVIDE_LEFT);
464      dest_element.weight = dest_element.weight.Quantize(delta_);
465      elements_[dest_element.state_id] = 0;
466    }
467
468    arc.nextstate = FindState(dest_subset);
469    CacheImpl<A>::PushArc(s, arc);
470  }
471
472  // Comparison object for hashing Subset(s). Subsets are not sorted in this
473  // implementation, so ordering must not be assumed in the equivalence
474  // test.
475  class SubsetEqual {
476   public:
477    // Constructor takes vector needed to check equality. See immediately
478    // below for constraints on it.
479    explicit SubsetEqual(vector<Element *> *elements)
480        : elements_(elements) {}
481
482    // At each call to operator(), the elements_ vector should contain
483    // only NULLs. When this operator returns, elements_ will still
484    // have this property.
485    bool operator()(Subset* subset1, Subset* subset2) const {
486        if (subset1->size() != subset2->size())
487          return false;
488
489      // Loads first subset elements in element vector.
490      for (typename Subset::iterator iter1 = subset1->begin();
491           iter1 != subset1->end();
492           ++iter1) {
493        Element &element1 = *iter1;
494        while (elements_->size() <= element1.state_id)
495          elements_->push_back(0);
496        (*elements_)[element1.state_id] = &element1;
497      }
498
499      // Checks second subset matches first via element vector.
500      for (typename Subset::iterator iter2 = subset2->begin();
501           iter2 != subset2->end();
502           ++iter2) {
503        Element &element2 = *iter2;
504        while (elements_->size() <= element2.state_id)
505          elements_->push_back(0);
506        Element *element1 = (*elements_)[element2.state_id];
507        if (!element1 || element1->weight != element2.weight) {
508          // Mismatch found. Resets element vector before returning false.
509          for (typename Subset::iterator iter1 = subset1->begin();
510               iter1 != subset1->end();
511               ++iter1)
512            (*elements_)[iter1->state_id] = 0;
513          return false;
514        } else {
515          (*elements_)[element2.state_id] = 0;  // Clears entry
516        }
517      }
518      return true;
519    }
520   private:
521    vector<Element *> *elements_;
522  };
523
524  // Hash function for Subset to Fst states. Subset elements are not
525  // sorted in this implementation, so the hash must be invariant
526  // under subset reordering.
527  class SubsetKey {
528   public:
529    size_t operator()(const Subset* subset) const {
530      size_t hash = 0;
531      for (typename Subset::const_iterator iter = subset->begin();
532           iter != subset->end();
533           ++iter) {
534        const Element &element = *iter;
535        int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1;
536        int rshift = CHAR_BIT * sizeof(size_t) - lshift;
537        size_t n = element.state_id;
538        hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash();
539      }
540      return hash;
541    }
542  };
543
544  float delta_;                    // Quantization delta for subset weights
545  const vector<Weight> *in_dist_;  // Distance to final NFA states
546  vector<Weight> *out_dist_;       // Distance to final DFA states
547
548  D common_divisor_;
549
550  // Used to test equivalence of subsets.
551  vector<Element *> elements_;
552
553  // Maps from StateId to Subset.
554  vector<Subset *> subsets_;
555
556  // Hashes from Subset to its StateId in the output automaton.
557  typedef unordered_map<Subset *, StateId, SubsetKey, SubsetEqual>
558  SubsetHash;
559
560  // Hashes from Label to Subsets corr. to destination states of current state.
561  SubsetHash subset_hash_;
562
563  void operator=(const DeterminizeFsaImpl<A, D> &);  // disallow
564};
565
566
567// Implementation of delayed determinization for transducers.
568// Transducer determinization is implemented by mapping the input to
569// the Gallic semiring as an acceptor whose weights contain the output
570// strings and using acceptor determinization above to determinize
571// that acceptor.
572template <class A, StringType S>
573class DeterminizeFstImpl : public DeterminizeFstImplBase<A> {
574 public:
575  using FstImpl<A>::SetProperties;
576  using DeterminizeFstImplBase<A>::GetFst;
577  using CacheBaseImpl< CacheState<A> >::GetCacheGc;
578  using CacheBaseImpl< CacheState<A> >::GetCacheLimit;
579
580  typedef typename A::Label Label;
581  typedef typename A::Weight Weight;
582  typedef typename A::StateId StateId;
583
584  typedef ToGallicMapper<A, S> ToMapper;
585  typedef FromGallicMapper<A, S> FromMapper;
586
587  typedef typename ToMapper::ToArc ToArc;
588  typedef ArcMapFst<A, ToArc, ToMapper> ToFst;
589  typedef ArcMapFst<ToArc, A, FromMapper> FromFst;
590
591  typedef GallicCommonDivisor<Label, Weight, S> CommonDivisor;
592  typedef GallicFactor<Label, Weight, S> FactorIterator;
593
594  DeterminizeFstImpl(const Fst<A> &fst, const DeterminizeFstOptions<A> &opts)
595      : DeterminizeFstImplBase<A>(fst, opts),
596        delta_(opts.delta),
597        subsequential_label_(opts.subsequential_label) {
598     Init(GetFst());
599  }
600
601  DeterminizeFstImpl(const DeterminizeFstImpl<A, S> &impl)
602      : DeterminizeFstImplBase<A>(impl),
603        delta_(impl.delta_),
604        subsequential_label_(impl.subsequential_label_) {
605    Init(GetFst());
606  }
607
608  ~DeterminizeFstImpl() { delete from_fst_; }
609
610  virtual DeterminizeFstImpl<A, S> *Copy() {
611    return new DeterminizeFstImpl<A, S>(*this);
612  }
613
614  uint64 Properties() const { return Properties(kFstProperties); }
615
616  // Set error if found; return FST impl properties.
617  uint64 Properties(uint64 mask) const {
618    if ((mask & kError) && (GetFst().Properties(kError, false) ||
619                            from_fst_->Properties(kError, false)))
620      SetProperties(kError, kError);
621    return FstImpl<A>::Properties(mask);
622  }
623
624  virtual StateId ComputeStart() { return from_fst_->Start(); }
625
626  virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); }
627
628  virtual void Expand(StateId s) {
629    for (ArcIterator<FromFst> aiter(*from_fst_, s);
630         !aiter.Done();
631         aiter.Next())
632      CacheImpl<A>::PushArc(s, aiter.Value());
633    CacheImpl<A>::SetArcs(s);
634  }
635
636 private:
637  // Initialization of transducer determinization implementation, which
638  // is defined after DeterminizeFst since it calls it.
639  void Init(const Fst<A> &fst);
640
641  float delta_;
642  Label subsequential_label_;
643  FromFst *from_fst_;
644
645  void operator=(const DeterminizeFstImpl<A, S> &);  // disallow
646};
647
648
649// Determinizes a weighted transducer. This version is a delayed
650// Fst. The result will be an equivalent FST that has the property
651// that no state has two transitions with the same input label.
652// For this algorithm, epsilon transitions are treated as regular
653// symbols (cf. RmEpsilon).
654//
655// The transducer must be functional. The weights must be (weakly)
656// left divisible (valid for TropicalWeight and LogWeight for instance)
657// and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0.
658//
659// Complexity:
660// - Determinizable: exponential (polynomial in the size of the output)
661// - Non-determinizable) does not terminate
662//
663// The determinizable automata include all unweighted and all acyclic input.
664//
665// References:
666// - Mehryar Mohri, "Finite-State Transducers in Language and Speech
667//   Processing". Computational Linguistics, 23:2, 1997.
668//
669// This class attaches interface to implementation and handles
670// reference counting, delegating most methods to ImplToFst.
671template <class A>
672class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> >  {
673 public:
674  friend class ArcIterator< DeterminizeFst<A> >;
675  friend class StateIterator< DeterminizeFst<A> >;
676  template <class B, StringType S> friend class DeterminizeFstImpl;
677
678  typedef A Arc;
679  typedef typename A::Weight Weight;
680  typedef typename A::StateId StateId;
681  typedef typename A::Label Label;
682  typedef CacheState<A> State;
683  typedef DeterminizeFstImplBase<A> Impl;
684
685  using ImplToFst<Impl>::SetImpl;
686
687  explicit DeterminizeFst(
688      const Fst<A> &fst,
689      const DeterminizeFstOptions<A> &opts = DeterminizeFstOptions<A>()) {
690    if (fst.Properties(kAcceptor, true)) {
691      // Calls implementation for acceptors.
692      typedef DefaultCommonDivisor<Weight> D;
693      SetImpl(new DeterminizeFsaImpl<A, D>(fst, D(), 0, 0, opts));
694    } else {
695      // Calls implementation for transducers.
696      SetImpl(new DeterminizeFstImpl<A, STRING_LEFT_RESTRICT>(fst, opts));
697    }
698  }
699
700  // This acceptor-only version additionally computes the distance to
701  // final states in the output if provided with those distances for the
702  // input. Useful for e.g. unique N-shortest paths.
703  DeterminizeFst(
704      const Fst<A> &fst,
705      const vector<Weight> &in_dist, vector<Weight> *out_dist,
706      const DeterminizeFstOptions<A> &opts = DeterminizeFstOptions<A>()) {
707    if (!fst.Properties(kAcceptor, true)) {
708      FSTERROR() << "DeterminizeFst:"
709                 << " distance to final states computed for acceptors only";
710      GetImpl()->SetProperties(kError, kError);
711    }
712    typedef DefaultCommonDivisor<Weight> D;
713    SetImpl(new DeterminizeFsaImpl<A, D>(fst, D(), &in_dist, out_dist, opts));
714  }
715
716  // See Fst<>::Copy() for doc.
717  DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) {
718    if (safe)
719      SetImpl(fst.GetImpl()->Copy());
720    else
721      SetImpl(fst.GetImpl(), false);
722  }
723
724  // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc.
725  virtual DeterminizeFst<A> *Copy(bool safe = false) const {
726    return new DeterminizeFst<A>(*this, safe);
727  }
728
729  virtual inline void InitStateIterator(StateIteratorData<A> *data) const;
730
731  virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const {
732    GetImpl()->InitArcIterator(s, data);
733  }
734
735 private:
736  // This private version is for passing the common divisor to
737  // FSA determinization.
738  template <class D>
739  DeterminizeFst(const Fst<A> &fst, const D &common_div,
740                 const DeterminizeFstOptions<A> &opts)
741      :  ImplToFst<Impl>(
742          new DeterminizeFsaImpl<A, D>(fst, common_div, 0, 0, opts)) {}
743
744  // Makes visible to friends.
745  Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); }
746
747  void operator=(const DeterminizeFst<A> &fst);  // Disallow
748};
749
750
751// Initialization of transducer determinization implementation. which
752// is defined after DeterminizeFst since it calls it.
753template <class A, StringType S>
754void DeterminizeFstImpl<A, S>::Init(const Fst<A> &fst) {
755  // Mapper to an acceptor.
756  ToFst to_fst(fst, ToMapper());
757
758  // Determinize acceptor.
759  // This recursive call terminates since it passes the common divisor
760  // to a private constructor.
761  CacheOptions copts(GetCacheGc(), GetCacheLimit());
762  DeterminizeFstOptions<ToArc> dopts(copts, delta_);
763  DeterminizeFst<ToArc> det_fsa(to_fst, CommonDivisor(), dopts);
764
765  // Mapper back to transducer.
766  FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_,
767                                   kFactorFinalWeights,
768                                   subsequential_label_,
769                                   subsequential_label_);
770  FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts);
771  from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_));
772}
773
774
775// Specialization for DeterminizeFst.
776template <class A>
777class StateIterator< DeterminizeFst<A> >
778    : public CacheStateIterator< DeterminizeFst<A> > {
779 public:
780  explicit StateIterator(const DeterminizeFst<A> &fst)
781      : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {}
782};
783
784
785// Specialization for DeterminizeFst.
786template <class A>
787class ArcIterator< DeterminizeFst<A> >
788    : public CacheArcIterator< DeterminizeFst<A> > {
789 public:
790  typedef typename A::StateId StateId;
791
792  ArcIterator(const DeterminizeFst<A> &fst, StateId s)
793      : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) {
794    if (!fst.GetImpl()->HasArcs(s))
795      fst.GetImpl()->Expand(s);
796  }
797
798 private:
799  DISALLOW_COPY_AND_ASSIGN(ArcIterator);
800};
801
802
803template <class A> inline
804void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const
805{
806  data->base = new StateIterator< DeterminizeFst<A> >(*this);
807}
808
809
810// Useful aliases when using StdArc.
811typedef DeterminizeFst<StdArc> StdDeterminizeFst;
812
813
814template <class Arc>
815struct DeterminizeOptions {
816  typedef typename Arc::StateId StateId;
817  typedef typename Arc::Weight Weight;
818  typedef typename Arc::Label Label;
819
820  float delta;                // Quantization delta for subset weights.
821  Weight weight_threshold;    // Pruning weight threshold.
822  StateId state_threshold;    // Pruning state threshold.
823  Label subsequential_label;  // Label used for residual final output
824  // when producing subsequential transducers.
825
826  explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(),
827                              StateId n = kNoStateId, Label l = 0)
828      : delta(d), weight_threshold(w), state_threshold(n),
829        subsequential_label(l) {}
830};
831
832
833// Determinizes a weighted transducer.  This version writes the
834// determinized Fst to an output MutableFst.  The result will be an
835// equivalent FSt that has the property that no state has two
836// transitions with the same input label.  For this algorithm, epsilon
837// transitions are treated as regular symbols (cf. RmEpsilon).
838//
839// The transducer must be functional. The weights must be (weakly)
840// left divisible (valid for TropicalWeight and LogWeight).
841//
842// Complexity:
843// - Determinizable: exponential (polynomial in the size of the output)
844// - Non-determinizable: does not terminate
845//
846// The determinizable automata include all unweighted and all acyclic input.
847//
848// References:
849// - Mehryar Mohri, "Finite-State Transducers in Language and Speech
850//   Processing". Computational Linguistics, 23:2, 1997.
851template <class Arc>
852void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst,
853             const DeterminizeOptions<Arc> &opts
854                 = DeterminizeOptions<Arc>()) {
855  typedef typename Arc::StateId StateId;
856  typedef typename Arc::Weight Weight;
857
858  DeterminizeFstOptions<Arc> nopts;
859  nopts.delta = opts.delta;
860  nopts.subsequential_label = opts.subsequential_label;
861
862  nopts.gc_limit = 0;  // Cache only the last state for fastest copy.
863
864  if (opts.weight_threshold != Weight::Zero() ||
865      opts.state_threshold != kNoStateId) {
866    if (ifst.Properties(kAcceptor, false)) {
867      vector<Weight> idistance, odistance;
868      ShortestDistance(ifst, &idistance, true);
869      DeterminizeFst<Arc> dfst(ifst, idistance, &odistance, nopts);
870      PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold,
871                                                   opts.state_threshold,
872                                                   AnyArcFilter<Arc>(),
873                                                   &odistance);
874      Prune(dfst, ofst, popts);
875    } else {
876      *ofst = DeterminizeFst<Arc>(ifst, nopts);
877      Prune(ofst, opts.weight_threshold, opts.state_threshold);
878    }
879  } else {
880    *ofst = DeterminizeFst<Arc>(ifst, nopts);
881  }
882}
883
884
885}  // namespace fst
886
887#endif  // FST_LIB_DETERMINIZE_H__
888