1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author: sorenj@google.com (Jeffrey Sorensen)
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_SYMBOL_TABLE_OPS_H_
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_SYMBOL_TABLE_OPS_H_
19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string>
233da1eb108d36da35333b2d655202791af854996bPrzemyslaw Szczepaniak#include <tr1/unordered_set>
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::tr1::unordered_set;
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::tr1::unordered_multiset;
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/fst.h>
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/symbol-table.h>
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Returns a minimal symbol table containing only symbols referenced by the
35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// passed fst.  Symbols preserve their original numbering, so fst does not
36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// require relabeling.
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate<class Arc>
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *PruneSymbolTable(const Fst<Arc> &fst, const SymbolTable &syms,
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                              bool input) {
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  unordered_set<typename Arc::Label> seen;
41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  seen.insert(0);  // Always keep epslion
42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  StateIterator<Fst<Arc> > siter(fst);
43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  for (; !siter.Done(); siter.Next()) {
44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ArcIterator<Fst<Arc> > aiter(fst, siter.Value());
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (; !aiter.Done(); aiter.Next()) {
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      typename Arc::Label sym = (input) ? aiter.Value().ilabel :
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                          aiter.Value().olabel;
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      seen.insert(sym);
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable *pruned = new SymbolTable(syms.Name() + "_pruned");
52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  for (SymbolTableIterator stiter(syms); !stiter.Done(); stiter.Next()) {
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    typename Arc::Label label = stiter.Value();
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (seen.find(label) != seen.end()) {
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      pruned->AddSymbol(stiter.Symbol(), stiter.Value());
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  return pruned;
59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Relabels a symbol table to make it a contiguous mapping.
62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *CompactSymbolTable(const SymbolTable &syms);
63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Merges two SymbolTables, all symbols from left will be merged into right
65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// with the same ids.  Symbols in right that have conflicting ids with those
66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// in left will be assigned to value assigned from the left SymbolTable.
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// The returned symbol table will never modify symbol assignments from the left
68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// side, but may do so on the right.  If right_relabel_output is non-NULL, it
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// will be assigned true if the symbols from the right table needed to be
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// reassigned.
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// A potential use case is to Compose two Fst's that have different symbol
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// tables.  You can reconcile them in the following way:
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   Fst<Arc> a, b;
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   bool relabel;
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   SymbolTable *bnew = MergeSymbolTable(a.OutputSymbols(),
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//                                        b.InputSymbols(), &relabel);
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   if (relabel) {
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     Relabel(b, bnew, NULL);
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   }
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   b.SetInputSymbols(bnew);
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   delete bnew;
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *MergeSymbolTable(const SymbolTable &left, const SymbolTable &right,
83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                              bool *right_relabel_output = 0);
84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Read the symbol table from any Fst::Read()able file, without loading the
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// corresponding Fst.  Returns NULL if the Fst does not contain a symbol table
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// or the symbol table cannot be read.
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *FstReadSymbols(const string &filename, bool input);
89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_LIB_SYMBOL_TABLE_OPS_H_
92