determinize.h revision 5b6dc79427b8f7eeb6a7ff68034ab8548ce670ea
1// determinize.h
2
3
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//     http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// Copyright 2005-2010 Google, Inc.
17// Author: riley@google.com (Michael Riley)
18//
19// \file
20// Functions and classes to determinize an FST.
21
22#ifndef FST_LIB_DETERMINIZE_H__
23#define FST_LIB_DETERMINIZE_H__
24
25#include <algorithm>
26#include <climits>
27#include <tr1/unordered_map>
28using std::tr1::unordered_map;
29using std::tr1::unordered_multimap;
30#include <map>
31#include <fst/slist.h>
32#include <string>
33#include <vector>
34using std::vector;
35
36#include <fst/arc-map.h>
37#include <fst/cache.h>
38#include <fst/bi-table.h>
39#include <fst/factor-weight.h>
40#include <fst/prune.h>
41#include <fst/test-properties.h>
42
43
44namespace fst {
45
46//
47// COMMON DIVISORS - these are used in determinization to compute
48// the transition weights. In the simplest case, it is just the same
49// as the semiring Plus(). However, other choices permit more efficient
50// determinization when the output contains strings.
51//
52
53// The default common divisor uses the semiring Plus.
54template <class W>
55class DefaultCommonDivisor {
56 public:
57  typedef W Weight;
58
59  W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); }
60};
61
62
63// The label common divisor for a (left) string semiring selects a
64// single letter common prefix or the empty string. This is used in
65// the determinization of output strings so that at most a single
66// letter will appear in the output of a transtion.
67template <typename L, StringType S>
68class LabelCommonDivisor {
69 public:
70  typedef StringWeight<L, S> Weight;
71
72  Weight operator()(const Weight &w1, const Weight &w2) const {
73    StringWeightIterator<L, S> iter1(w1);
74    StringWeightIterator<L, S> iter2(w2);
75
76    if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) {
77      FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring";
78      return Weight::NoWeight();
79    } else if (w1.Size() == 0 || w2.Size() == 0) {
80      return Weight::One();
81    } else if (w1 == Weight::Zero()) {
82      return Weight(iter2.Value());
83    } else if (w2 == Weight::Zero()) {
84      return Weight(iter1.Value());
85    } else if (iter1.Value() == iter2.Value()) {
86      return Weight(iter1.Value());
87    } else {
88      return Weight::One();
89    }
90  }
91};
92
93
94// The gallic common divisor uses the label common divisor on the
95// string component and the template argument D common divisor on the
96// weight component, which defaults to the default common divisor.
97template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> >
98class GallicCommonDivisor {
99 public:
100  typedef GallicWeight<L, W, S> Weight;
101
102  Weight operator()(const Weight &w1, const Weight &w2) const {
103    return Weight(label_common_divisor_(w1.Value1(), w2.Value1()),
104                  weight_common_divisor_(w1.Value2(), w2.Value2()));
105  }
106
107 private:
108  LabelCommonDivisor<L, S> label_common_divisor_;
109  D weight_common_divisor_;
110};
111
112
113// Represents an element in a subset
114template <class A>
115struct DeterminizeElement {
116  typedef typename A::StateId StateId;
117  typedef typename A::Weight Weight;
118
119  DeterminizeElement() {}
120
121  DeterminizeElement(StateId s, Weight w) : state_id(s), weight(w) {}
122
123  bool operator==(const DeterminizeElement<A> & element) const {
124    return state_id == element.state_id && weight == element.weight;
125  }
126
127  bool operator<(const DeterminizeElement<A> & element) const {
128    return state_id < element.state_id ||
129        (state_id == element.state_id && weight == element.weight);
130  }
131
132  StateId state_id;  // Input state Id
133  Weight weight;     // Residual weight
134};
135
136
137//
138// DETERMINIZE FILTERS - these can be used in determinization to compute
139// transformations on the subsets prior to their being added as destination
140// states. The filter operates on a map between a label and the
141// corresponding destination subsets. The possibly modified map is
142// then used to construct the destination states for arcs exiting state 's'.
143// It must define the ordered map type LabelMap and have a default
144// and copy constructor.
145
146// A determinize filter that does not modify its input.
147template <class Arc>
148struct IdentityDeterminizeFilter {
149  typedef typename Arc::StateId StateId;
150  typedef typename Arc::Label Label;
151  typedef slist< DeterminizeElement<Arc> > Subset;
152  typedef map<Label, Subset*> LabelMap;
153
154  static uint64 Properties(uint64 props) { return props; }
155
156  void operator()(StateId s, LabelMap *label_map) {}
157};
158
159
160//
161// DETERMINIZATION STATE TABLES
162//
163// The determiziation state table has the form:
164//
165// template <class Arc>
166// class DeterminizeStateTable {
167//  public:
168//   typedef typename Arc::StateId StateId;
169//   typedef DeterminizeElement<Arc> Element;
170//   typedef slist<Element> Subset;
171//
172//   // Required constuctor
173//   DeterminizeStateTable();
174//
175//   // Required copy constructor that does not copy state
176//   DeterminizeStateTable(const DeterminizeStateTable<A,P> &table);
177//
178//   // Lookup state ID by subset (not depending of the element order).
179//   // If it doesn't exist, then add it.  FindState takes
180//   // ownership of the subset argument (so that it doesn't have to
181//   // copy it if it creates a new state).
182//   StateId FindState(Subset *subset);
183//
184//   // Lookup subset by ID.
185//   const Subset *FindSubset(StateId id) const;
186// };
187//
188
189// The default determinization state table based on the
190// compact hash bi-table.
191template <class Arc>
192class DefaultDeterminizeStateTable {
193 public:
194  typedef typename Arc::StateId StateId;
195  typedef typename Arc::Label Label;
196  typedef typename Arc::Weight Weight;
197  typedef DeterminizeElement<Arc> Element;
198  typedef slist<Element> Subset;
199
200  explicit DefaultDeterminizeStateTable(size_t table_size = 0)
201      : table_size_(table_size),
202        subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { }
203
204  DefaultDeterminizeStateTable(const DefaultDeterminizeStateTable<Arc> &table)
205      : table_size_(table.table_size_),
206        subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { }
207
208  ~DefaultDeterminizeStateTable() {
209    for (StateId s = 0; s < subsets_.Size(); ++s)
210      delete subsets_.FindEntry(s);
211  }
212
213  // Finds the state corresponding to a subset. Only creates a new
214  // state if the subset is not found. FindState takes ownership of
215  // the subset argument (so that it doesn't have to copy it if it
216  // creates a new state).
217  StateId FindState(Subset *subset) {
218    StateId ns = subsets_.Size();
219    StateId s = subsets_.FindId(subset);
220    if (s != ns) delete subset;  // subset found
221    return s;
222  }
223
224  const Subset* FindSubset(StateId s) { return subsets_.FindEntry(s); }
225
226 private:
227  // Comparison object for hashing Subset(s). Subsets are not sorted in this
228  // implementation, so ordering must not be assumed in the equivalence
229  // test.
230  class SubsetEqual {
231   public:
232    SubsetEqual() {  // needed for compilation but should never be called
233      FSTERROR() << "SubsetEqual: default constructor not implemented";
234    }
235
236    // Constructor takes vector needed to check equality. See immediately
237    // below for constraints on it.
238    explicit SubsetEqual(vector<Element *> *elements)
239        : elements_(elements) {}
240
241    // At each call to operator(), the elements_ vector should contain
242    // only NULLs. When this operator returns, elements_ will still
243    // have this property.
244    bool operator()(Subset* subset1, Subset* subset2) const {
245      if (!subset1 && !subset2)
246        return true;
247      if ((subset1 && !subset2) || (!subset1 && subset2))
248        return false;
249
250      if (subset1->size() != subset2->size())
251        return false;
252
253      // Loads first subset elements in element vector.
254      for (typename Subset::iterator iter1 = subset1->begin();
255           iter1 != subset1->end();
256           ++iter1) {
257        Element &element1 = *iter1;
258        while (elements_->size() <= element1.state_id)
259          elements_->push_back(0);
260        (*elements_)[element1.state_id] = &element1;
261      }
262
263      // Checks second subset matches first via element vector.
264      for (typename Subset::iterator iter2 = subset2->begin();
265           iter2 != subset2->end();
266           ++iter2) {
267        Element &element2 = *iter2;
268        while (elements_->size() <= element2.state_id)
269          elements_->push_back(0);
270        Element *element1 = (*elements_)[element2.state_id];
271        if (!element1 || element1->weight != element2.weight) {
272          // Mismatch found. Resets element vector before returning false.
273          for (typename Subset::iterator iter1 = subset1->begin();
274               iter1 != subset1->end();
275               ++iter1)
276            (*elements_)[iter1->state_id] = 0;
277          return false;
278        } else {
279          (*elements_)[element2.state_id] = 0;  // Clears entry
280        }
281      }
282      return true;
283    }
284   private:
285    vector<Element *> *elements_;
286  };
287
288  // Hash function for Subset to Fst states. Subset elements are not
289  // sorted in this implementation, so the hash must be invariant
290  // under subset reordering.
291  class SubsetKey {
292   public:
293    size_t operator()(const Subset* subset) const {
294      size_t hash = 0;
295      if (subset) {
296        for (typename Subset::const_iterator iter = subset->begin();
297             iter != subset->end();
298             ++iter) {
299          const Element &element = *iter;
300          int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1;
301          int rshift = CHAR_BIT * sizeof(size_t) - lshift;
302          size_t n = element.state_id;
303          hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash();
304        }
305      }
306      return hash;
307    }
308  };
309
310  size_t table_size_;
311
312  typedef CompactHashBiTable<StateId, Subset *,
313                             SubsetKey, SubsetEqual, HS_STL>  SubsetTable;
314
315  SubsetTable subsets_;
316  vector<Element *> elements_;
317
318  void operator=(const DefaultDeterminizeStateTable<Arc> &);  // disallow
319};
320
321// Options for finite-state transducer determinization templated on
322// the arc type, common divisor, the determinization filter and the
323// state table.  DeterminizeFst takes ownership of the determinization
324// filter and state table if provided.
325template <class Arc,
326          class D = DefaultCommonDivisor<typename Arc::Weight>,
327          class F = IdentityDeterminizeFilter<Arc>,
328          class T = DefaultDeterminizeStateTable<Arc> >
329struct DeterminizeFstOptions : CacheOptions {
330  typedef typename Arc::Label Label;
331  float delta;                // Quantization delta for subset weights
332  Label subsequential_label;  // Label used for residual final output
333                              // when producing subsequential transducers.
334  F *filter;                  // Determinization filter
335  T *state_table;             // Determinization state table
336
337  explicit DeterminizeFstOptions(const CacheOptions &opts,
338                                 float del = kDelta, Label lab = 0,
339                                 F *filt = 0,
340                                 T *table = 0)
341      : CacheOptions(opts), delta(del), subsequential_label(lab),
342        filter(filt), state_table(table) {}
343
344  explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0,
345                                 F *filt = 0, T *table = 0)
346      : delta(del), subsequential_label(lab), filter(filt),
347        state_table(table) {}
348};
349
350// Implementation of delayed DeterminizeFst. This base class is
351// common to the variants that implement acceptor and transducer
352// determinization.
353template <class A>
354class DeterminizeFstImplBase : public CacheImpl<A> {
355 public:
356  using FstImpl<A>::SetType;
357  using FstImpl<A>::SetProperties;
358  using FstImpl<A>::Properties;
359  using FstImpl<A>::SetInputSymbols;
360  using FstImpl<A>::SetOutputSymbols;
361
362  using CacheBaseImpl< CacheState<A> >::HasStart;
363  using CacheBaseImpl< CacheState<A> >::HasFinal;
364  using CacheBaseImpl< CacheState<A> >::HasArcs;
365  using CacheBaseImpl< CacheState<A> >::SetFinal;
366  using CacheBaseImpl< CacheState<A> >::SetStart;
367
368  typedef typename A::Label Label;
369  typedef typename A::Weight Weight;
370  typedef typename A::StateId StateId;
371  typedef CacheState<A> State;
372
373  template <class D, class F, class T>
374  DeterminizeFstImplBase(const Fst<A> &fst,
375                         const DeterminizeFstOptions<A, D, F, T> &opts)
376      : CacheImpl<A>(opts), fst_(fst.Copy()) {
377    SetType("determinize");
378    uint64 iprops = fst.Properties(kFstProperties, false);
379    uint64 dprops = DeterminizeProperties(iprops,
380                                          opts.subsequential_label != 0);
381    SetProperties(F::Properties(dprops), kCopyProperties);
382    SetInputSymbols(fst.InputSymbols());
383    SetOutputSymbols(fst.OutputSymbols());
384  }
385
386  DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl)
387      : CacheImpl<A>(impl),
388        fst_(impl.fst_->Copy(true)) {
389    SetType("determinize");
390    SetProperties(impl.Properties(), kCopyProperties);
391    SetInputSymbols(impl.InputSymbols());
392    SetOutputSymbols(impl.OutputSymbols());
393  }
394
395  virtual ~DeterminizeFstImplBase() { delete fst_; }
396
397  virtual DeterminizeFstImplBase<A> *Copy() = 0;
398
399  StateId Start() {
400    if (!HasStart()) {
401      StateId start = ComputeStart();
402      if (start != kNoStateId) {
403        SetStart(start);
404      }
405    }
406    return CacheImpl<A>::Start();
407  }
408
409  Weight Final(StateId s) {
410    if (!HasFinal(s)) {
411      Weight final = ComputeFinal(s);
412      SetFinal(s, final);
413    }
414    return CacheImpl<A>::Final(s);
415  }
416
417  virtual void Expand(StateId s) = 0;
418
419  size_t NumArcs(StateId s) {
420    if (!HasArcs(s))
421      Expand(s);
422    return CacheImpl<A>::NumArcs(s);
423  }
424
425  size_t NumInputEpsilons(StateId s) {
426    if (!HasArcs(s))
427      Expand(s);
428    return CacheImpl<A>::NumInputEpsilons(s);
429  }
430
431  size_t NumOutputEpsilons(StateId s) {
432    if (!HasArcs(s))
433      Expand(s);
434    return CacheImpl<A>::NumOutputEpsilons(s);
435  }
436
437  void InitArcIterator(StateId s, ArcIteratorData<A> *data) {
438    if (!HasArcs(s))
439      Expand(s);
440    CacheImpl<A>::InitArcIterator(s, data);
441  }
442
443  virtual StateId ComputeStart() = 0;
444
445  virtual Weight ComputeFinal(StateId s) = 0;
446
447  const Fst<A> &GetFst() const { return *fst_; }
448
449 private:
450  const Fst<A> *fst_;            // Input Fst
451
452  void operator=(const DeterminizeFstImplBase<A> &);  // disallow
453};
454
455
456// Implementation of delayed determinization for weighted acceptors.
457// It is templated on the arc type A and the common divisor D.
458template <class A, class D, class F, class T>
459class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> {
460 public:
461  using FstImpl<A>::SetProperties;
462  using DeterminizeFstImplBase<A>::GetFst;
463  using DeterminizeFstImplBase<A>::SetArcs;
464
465  typedef typename A::Label Label;
466  typedef typename A::Weight Weight;
467  typedef typename A::StateId StateId;
468  typedef DeterminizeElement<A> Element;
469  typedef slist<Element> Subset;
470  typedef typename F::LabelMap LabelMap;
471
472  DeterminizeFsaImpl(const Fst<A> &fst,
473                     const vector<Weight> *in_dist, vector<Weight> *out_dist,
474                     const DeterminizeFstOptions<A, D, F, T> &opts)
475      : DeterminizeFstImplBase<A>(fst, opts),
476        delta_(opts.delta),
477        in_dist_(in_dist),
478        out_dist_(out_dist),
479        filter_(opts.filter ? opts.filter : new F()),
480        state_table_(opts.state_table ? opts.state_table : new T()) {
481    if (!fst.Properties(kAcceptor, true)) {
482      FSTERROR()  << "DeterminizeFst: argument not an acceptor";
483      SetProperties(kError, kError);
484    }
485    if (!(Weight::Properties() & kLeftSemiring)) {
486      FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: "
487                 << Weight::Type();
488      SetProperties(kError, kError);
489    }
490    if (out_dist_)
491      out_dist_->clear();
492  }
493
494  DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D, F, T> &impl)
495      : DeterminizeFstImplBase<A>(impl),
496        delta_(impl.delta_),
497        in_dist_(0),
498        out_dist_(0),
499        filter_(new F(*impl.filter_)),
500        state_table_(new T(*impl.state_table_)) {
501    if (impl.out_dist_) {
502      FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector";
503      SetProperties(kError, kError);
504    }
505  }
506
507  virtual ~DeterminizeFsaImpl() {
508    delete filter_;
509    delete state_table_;
510  }
511
512  virtual DeterminizeFsaImpl<A, D, F, T> *Copy() {
513    return new DeterminizeFsaImpl<A, D, F, T>(*this);
514  }
515
516  uint64 Properties() const { return Properties(kFstProperties); }
517
518  // Set error if found; return FST impl properties.
519  uint64 Properties(uint64 mask) const {
520    if ((mask & kError) && (GetFst().Properties(kError, false)))
521      SetProperties(kError, kError);
522    return FstImpl<A>::Properties(mask);
523  }
524
525  virtual StateId ComputeStart() {
526    StateId s = GetFst().Start();
527    if (s == kNoStateId)
528      return kNoStateId;
529    Element element(s, Weight::One());
530    Subset *subset = new Subset;
531    subset->push_front(element);
532    return FindState(subset);
533  }
534
535  virtual Weight ComputeFinal(StateId s) {
536    const Subset *subset = state_table_->FindSubset(s);
537    Weight final = Weight::Zero();
538    for (typename Subset::const_iterator siter = subset->begin();
539         siter != subset->end();
540         ++siter) {
541      const Element &element = *siter;
542      final = Plus(final, Times(element.weight,
543                                GetFst().Final(element.state_id)));
544      if (!final.Member())
545        SetProperties(kError, kError);
546    }
547    return final;
548  }
549
550  StateId FindState(Subset *subset) {
551    StateId s = state_table_->FindState(subset);
552    if (in_dist_ && out_dist_->size() <= s)
553      out_dist_->push_back(ComputeDistance(subset));
554    return s;
555  }
556
557  // Compute distance from a state to the final states in the DFA
558  // given the distances in the NFA.
559  Weight ComputeDistance(const Subset *subset) {
560    Weight outd = Weight::Zero();
561    for (typename Subset::const_iterator siter = subset->begin();
562         siter != subset->end(); ++siter) {
563      const Element &element = *siter;
564      Weight ind = element.state_id < in_dist_->size() ?
565          (*in_dist_)[element.state_id] : Weight::Zero();
566      outd = Plus(outd, Times(element.weight, ind));
567    }
568    return outd;
569  }
570
571  // Computes the outgoing transitions from a state, creating new destination
572  // states as needed.
573  virtual void Expand(StateId s) {
574
575    LabelMap label_map;
576    LabelSubsets(s, &label_map);
577
578    for (typename LabelMap::iterator liter = label_map.begin();
579         liter != label_map.end();
580         ++liter)
581      AddArc(s, liter->first, liter->second);
582    SetArcs(s);
583  }
584
585 private:
586  // Constructs destination subsets per label. At return, subset
587  // element weights include the input automaton label weights and the
588  // subsets may contain duplicate states.
589  void LabelSubsets(StateId s, LabelMap *label_map) {
590    const Subset *src_subset = state_table_->FindSubset(s);
591
592    for (typename Subset::const_iterator siter = src_subset->begin();
593         siter != src_subset->end();
594         ++siter) {
595      const Element &src_element = *siter;
596      for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id);
597           !aiter.Done();
598           aiter.Next()) {
599        const A &arc = aiter.Value();
600        Element dest_element(arc.nextstate,
601                             Times(src_element.weight, arc.weight));
602
603        // The LabelMap may be a e.g. multimap with more complex
604        // determinization filters, so we insert efficiently w/o using [].
605        typename LabelMap::iterator liter = label_map->lower_bound(arc.ilabel);
606        Subset* dest_subset;
607        if (liter == label_map->end() || liter->first != arc.ilabel) {
608          dest_subset = new Subset;
609          label_map->insert(liter, make_pair(arc.ilabel, dest_subset));
610        } else {
611          dest_subset = liter->second;
612        }
613
614        dest_subset->push_front(dest_element);
615      }
616    }
617    // Applies the determinization filter
618    (*filter_)(s, label_map);
619  }
620
621  // Adds an arc from state S to the destination state associated
622  // with subset DEST_SUBSET (as created by LabelSubsets).
623  void AddArc(StateId s, Label label, Subset *dest_subset) {
624    A arc;
625    arc.ilabel = label;
626    arc.olabel = label;
627    arc.weight = Weight::Zero();
628
629    typename Subset::iterator oiter;
630    for (typename Subset::iterator diter = dest_subset->begin();
631         diter != dest_subset->end();) {
632      Element &dest_element = *diter;
633      // Computes label weight.
634      arc.weight = common_divisor_(arc.weight, dest_element.weight);
635
636      while (elements_.size() <= dest_element.state_id)
637        elements_.push_back(0);
638      Element *matching_element = elements_[dest_element.state_id];
639      if (matching_element) {
640        // Found duplicate state: sums state weight and deletes dup.
641        matching_element->weight = Plus(matching_element->weight,
642                                        dest_element.weight);
643        if (!matching_element->weight.Member())
644          SetProperties(kError, kError);
645        ++diter;
646        dest_subset->erase_after(oiter);
647      } else {
648        // Saves element so we can check for duplicate for this state.
649        elements_[dest_element.state_id] = &dest_element;
650        oiter = diter;
651        ++diter;
652      }
653    }
654
655    // Divides out label weight from destination subset elements.
656    // Quantizes to ensure comparisons are effective.
657    // Clears element vector.
658    for (typename Subset::iterator diter = dest_subset->begin();
659         diter != dest_subset->end();
660         ++diter) {
661      Element &dest_element = *diter;
662      dest_element.weight = Divide(dest_element.weight, arc.weight,
663                                   DIVIDE_LEFT);
664      dest_element.weight = dest_element.weight.Quantize(delta_);
665      elements_[dest_element.state_id] = 0;
666    }
667
668    arc.nextstate = FindState(dest_subset);
669    CacheImpl<A>::PushArc(s, arc);
670  }
671
672  float delta_;                    // Quantization delta for subset weights
673  const vector<Weight> *in_dist_;  // Distance to final NFA states
674  vector<Weight> *out_dist_;       // Distance to final DFA states
675
676  D common_divisor_;
677  F *filter_;
678  T *state_table_;
679
680  vector<Element *> elements_;
681
682  void operator=(const DeterminizeFsaImpl<A, D, F, T> &);  // disallow
683};
684
685
686// Implementation of delayed determinization for transducers.
687// Transducer determinization is implemented by mapping the input to
688// the Gallic semiring as an acceptor whose weights contain the output
689// strings and using acceptor determinization above to determinize
690// that acceptor.
691template <class A, StringType S, class D, class F, class T>
692class DeterminizeFstImpl : public DeterminizeFstImplBase<A> {
693 public:
694  using FstImpl<A>::SetProperties;
695  using DeterminizeFstImplBase<A>::GetFst;
696  using CacheBaseImpl< CacheState<A> >::GetCacheGc;
697  using CacheBaseImpl< CacheState<A> >::GetCacheLimit;
698
699  typedef typename A::Label Label;
700  typedef typename A::Weight Weight;
701  typedef typename A::StateId StateId;
702
703  typedef ToGallicMapper<A, S> ToMapper;
704  typedef FromGallicMapper<A, S> FromMapper;
705
706  typedef typename ToMapper::ToArc ToArc;
707  typedef ArcMapFst<A, ToArc, ToMapper> ToFst;
708  typedef ArcMapFst<ToArc, A, FromMapper> FromFst;
709
710  typedef GallicCommonDivisor<Label, Weight, S, D> CommonDivisor;
711  typedef GallicFactor<Label, Weight, S> FactorIterator;
712
713  DeterminizeFstImpl(const Fst<A> &fst,
714                     const DeterminizeFstOptions<A, D, F, T> &opts)
715      : DeterminizeFstImplBase<A>(fst, opts),
716        delta_(opts.delta),
717        subsequential_label_(opts.subsequential_label) {
718     Init(GetFst());
719  }
720
721  DeterminizeFstImpl(const DeterminizeFstImpl<A, S, D, F, T> &impl)
722      : DeterminizeFstImplBase<A>(impl),
723        delta_(impl.delta_),
724        subsequential_label_(impl.subsequential_label_) {
725    Init(GetFst());
726  }
727
728  ~DeterminizeFstImpl() { delete from_fst_; }
729
730  virtual DeterminizeFstImpl<A, S, D, F, T> *Copy() {
731    return new DeterminizeFstImpl<A, S, D, F, T>(*this);
732  }
733
734  uint64 Properties() const { return Properties(kFstProperties); }
735
736  // Set error if found; return FST impl properties.
737  uint64 Properties(uint64 mask) const {
738    if ((mask & kError) && (GetFst().Properties(kError, false) ||
739                            from_fst_->Properties(kError, false)))
740      SetProperties(kError, kError);
741    return FstImpl<A>::Properties(mask);
742  }
743
744  virtual StateId ComputeStart() { return from_fst_->Start(); }
745
746  virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); }
747
748  virtual void Expand(StateId s) {
749    for (ArcIterator<FromFst> aiter(*from_fst_, s);
750         !aiter.Done();
751         aiter.Next())
752      CacheImpl<A>::PushArc(s, aiter.Value());
753    CacheImpl<A>::SetArcs(s);
754  }
755
756 private:
757  // Initialization of transducer determinization implementation, which
758  // is defined after DeterminizeFst since it calls it.
759  void Init(const Fst<A> &fst);
760
761  float delta_;
762  Label subsequential_label_;
763  FromFst *from_fst_;
764
765  void operator=(const DeterminizeFstImpl<A, S, D, F, T> &);  // disallow
766};
767
768
769// Determinizes a weighted transducer. This version is a delayed
770// Fst. The result will be an equivalent FST that has the property
771// that no state has two transitions with the same input label.
772// For this algorithm, epsilon transitions are treated as regular
773// symbols (cf. RmEpsilon).
774//
775// The transducer must be functional. The weights must be (weakly)
776// left divisible (valid for TropicalWeight and LogWeight for instance)
777// and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0.
778//
779// Complexity:
780// - Determinizable: exponential (polynomial in the size of the output)
781// - Non-determinizable) does not terminate
782//
783// The determinizable automata include all unweighted and all acyclic input.
784//
785// References:
786// - Mehryar Mohri, "Finite-State Transducers in Language and Speech
787//   Processing". Computational Linguistics, 23:2, 1997.
788//
789// This class attaches interface to implementation and handles
790// reference counting, delegating most methods to ImplToFst.
791template <class A>
792class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> >  {
793 public:
794  friend class ArcIterator< DeterminizeFst<A> >;
795  friend class StateIterator< DeterminizeFst<A> >;
796  template <class B, StringType S, class D, class F, class T>
797  friend class DeterminizeFstImpl;
798
799  typedef A Arc;
800  typedef typename A::Weight Weight;
801  typedef typename A::StateId StateId;
802  typedef typename A::Label Label;
803  typedef CacheState<A> State;
804  typedef DeterminizeFstImplBase<A> Impl;
805
806  using ImplToFst<Impl>::SetImpl;
807
808  explicit DeterminizeFst(const Fst<A> &fst) {
809    typedef DefaultCommonDivisor<Weight> D;
810    typedef IdentityDeterminizeFilter<A> F;
811    typedef DefaultDeterminizeStateTable<A> T;
812    DeterminizeFstOptions<A, D, F, T> opts;
813    if (fst.Properties(kAcceptor, true)) {
814      // Calls implementation for acceptors.
815      SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts));
816    } else {
817      // Calls implementation for transducers.
818      SetImpl(new
819              DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts));
820    }
821  }
822
823  template <class D, class F, class T>
824  DeterminizeFst(const Fst<A> &fst,
825                 const DeterminizeFstOptions<A, D, F, T> &opts) {
826    if (fst.Properties(kAcceptor, true)) {
827      // Calls implementation for acceptors.
828      SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts));
829    } else {
830      // Calls implementation for transducers.
831      SetImpl(new
832              DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts));
833    }
834  }
835
836  // This acceptor-only version additionally computes the distance to
837  // final states in the output if provided with those distances for the
838  // input. Useful for e.g. unique N-shortest paths.
839  template <class D, class F, class T>
840  DeterminizeFst(const Fst<A> &fst,
841                 const vector<Weight> *in_dist, vector<Weight> *out_dist,
842                 const DeterminizeFstOptions<A, D, F, T> &opts) {
843    if (!fst.Properties(kAcceptor, true)) {
844      FSTERROR() << "DeterminizeFst:"
845                 << " distance to final states computed for acceptors only";
846      GetImpl()->SetProperties(kError, kError);
847    }
848    SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, in_dist, out_dist, opts));
849  }
850
851  // See Fst<>::Copy() for doc.
852  DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) {
853    if (safe)
854      SetImpl(fst.GetImpl()->Copy());
855    else
856      SetImpl(fst.GetImpl(), false);
857  }
858
859  // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc.
860  virtual DeterminizeFst<A> *Copy(bool safe = false) const {
861    return new DeterminizeFst<A>(*this, safe);
862  }
863
864  virtual inline void InitStateIterator(StateIteratorData<A> *data) const;
865
866  virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const {
867    GetImpl()->InitArcIterator(s, data);
868  }
869
870 private:
871  // Makes visible to friends.
872  Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); }
873
874  void operator=(const DeterminizeFst<A> &fst);  // Disallow
875};
876
877
878// Initialization of transducer determinization implementation. which
879// is defined after DeterminizeFst since it calls it.
880template <class A, StringType S, class D, class F, class T>
881void DeterminizeFstImpl<A, S, D, F, T>::Init(const Fst<A> &fst) {
882  // Mapper to an acceptor.
883  ToFst to_fst(fst, ToMapper());
884
885  // Determinizes acceptor.
886  // This recursive call terminates since it passes the common divisor
887  // to a private constructor.
888  CacheOptions copts(GetCacheGc(), GetCacheLimit());
889  DeterminizeFstOptions<ToArc, CommonDivisor> dopts(copts, delta_);
890  // Uses acceptor-only constructor to avoid template recursion
891  DeterminizeFst<ToArc> det_fsa(to_fst, 0, 0, dopts);
892
893  // Mapper back to transducer.
894  FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_,
895                                   kFactorFinalWeights,
896                                   subsequential_label_,
897                                   subsequential_label_);
898  FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts);
899  from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_));
900}
901
902
903// Specialization for DeterminizeFst.
904template <class A>
905class StateIterator< DeterminizeFst<A> >
906    : public CacheStateIterator< DeterminizeFst<A> > {
907 public:
908  explicit StateIterator(const DeterminizeFst<A> &fst)
909      : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {}
910};
911
912
913// Specialization for DeterminizeFst.
914template <class A>
915class ArcIterator< DeterminizeFst<A> >
916    : public CacheArcIterator< DeterminizeFst<A> > {
917 public:
918  typedef typename A::StateId StateId;
919
920  ArcIterator(const DeterminizeFst<A> &fst, StateId s)
921      : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) {
922    if (!fst.GetImpl()->HasArcs(s))
923      fst.GetImpl()->Expand(s);
924  }
925
926 private:
927  DISALLOW_COPY_AND_ASSIGN(ArcIterator);
928};
929
930
931template <class A> inline
932void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const
933{
934  data->base = new StateIterator< DeterminizeFst<A> >(*this);
935}
936
937
938// Useful aliases when using StdArc.
939typedef DeterminizeFst<StdArc> StdDeterminizeFst;
940
941
942template <class Arc>
943struct DeterminizeOptions {
944  typedef typename Arc::StateId StateId;
945  typedef typename Arc::Weight Weight;
946  typedef typename Arc::Label Label;
947
948  float delta;                // Quantization delta for subset weights.
949  Weight weight_threshold;    // Pruning weight threshold.
950  StateId state_threshold;    // Pruning state threshold.
951  Label subsequential_label;  // Label used for residual final output
952  // when producing subsequential transducers.
953
954  explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(),
955                              StateId n = kNoStateId, Label l = 0)
956      : delta(d), weight_threshold(w), state_threshold(n),
957        subsequential_label(l) {}
958};
959
960
961// Determinizes a weighted transducer.  This version writes the
962// determinized Fst to an output MutableFst.  The result will be an
963// equivalent FST that has the property that no state has two
964// transitions with the same input label.  For this algorithm, epsilon
965// transitions are treated as regular symbols (cf. RmEpsilon).
966//
967// The transducer must be functional. The weights must be (weakly)
968// left divisible (valid for TropicalWeight and LogWeight).
969//
970// Complexity:
971// - Determinizable: exponential (polynomial in the size of the output)
972// - Non-determinizable: does not terminate
973//
974// The determinizable automata include all unweighted and all acyclic input.
975//
976// References:
977// - Mehryar Mohri, "Finite-State Transducers in Language and Speech
978//   Processing". Computational Linguistics, 23:2, 1997.
979template <class Arc>
980void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst,
981             const DeterminizeOptions<Arc> &opts
982                 = DeterminizeOptions<Arc>()) {
983  typedef typename Arc::StateId StateId;
984  typedef typename Arc::Weight Weight;
985
986  DeterminizeFstOptions<Arc> nopts;
987  nopts.delta = opts.delta;
988  nopts.subsequential_label = opts.subsequential_label;
989
990  nopts.gc_limit = 0;  // Cache only the last state for fastest copy.
991
992  if (opts.weight_threshold != Weight::Zero() ||
993      opts.state_threshold != kNoStateId) {
994    if (ifst.Properties(kAcceptor, false)) {
995      vector<Weight> idistance, odistance;
996      ShortestDistance(ifst, &idistance, true);
997      DeterminizeFst<Arc> dfst(ifst, &idistance, &odistance, nopts);
998      PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold,
999                                                   opts.state_threshold,
1000                                                   AnyArcFilter<Arc>(),
1001                                                   &odistance);
1002      Prune(dfst, ofst, popts);
1003    } else {
1004      *ofst = DeterminizeFst<Arc>(ifst, nopts);
1005      Prune(ofst, opts.weight_threshold, opts.state_threshold);
1006    }
1007  } else {
1008    *ofst = DeterminizeFst<Arc>(ifst, nopts);
1009  }
1010}
1011
1012
1013}  // namespace fst
1014
1015#endif  // FST_LIB_DETERMINIZE_H__
1016