symbol-table.h revision 8fc5a7f51e62cb4ae44a27bdf4176d04adc80ede
1// symbol-table.h
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15//
16// \file
17// Classes to provide symbol-to-integer and integer-to-symbol mappings.
18
19#ifndef FST_LIB_SYMBOL_TABLE_H__
20#define FST_LIB_SYMBOL_TABLE_H__
21
22#include <ext/hash_map>
23using __gnu_cxx::hash_map;
24#include <fstream>
25#include <iostream>
26#include <string>
27#include <vector>
28
29#include "fst/lib/compat.h"
30
31
32
33DECLARE_bool(fst_compat_symbols);
34
35namespace fst {
36
37class SymbolTableImpl {
38  friend class SymbolTableIterator;
39 public:
40  SymbolTableImpl(const string &name)
41      : name_(name), available_key_(0), ref_count_(1),
42        check_sum_finalized_(false) {}
43  ~SymbolTableImpl() {
44    for (size_t i = 0; i < symbols_.size(); ++i)
45      delete[] symbols_[i];
46  }
47
48  int64 AddSymbol(const string& symbol, int64 key);
49
50  int64 AddSymbol(const string& symbol) {
51    int64 key = Find(symbol);
52    return (key == -1) ? AddSymbol(symbol, available_key_++) : key;
53  }
54
55  void AddTable(SymbolTableImpl* table) {
56    for (size_t i = 0; i < table->symbols_.size(); ++i) {
57      AddSymbol(table->symbols_[i]);
58    }
59  }
60
61  static SymbolTableImpl* ReadText(const string& filename);
62
63  static SymbolTableImpl* Read(istream &strm, const string& source);
64
65  bool Write(ostream &strm) const;
66
67  bool WriteText(ostream &strm) const;
68
69  //
70  // Return the string associated with the key. If the key is out of
71  // range (<0, >max), return an empty string.
72  string Find(int64 key) const {
73    hash_map<int64, string>::const_iterator it =
74      key_map_.find(key);
75    if (it == key_map_.end()) {
76      return "";
77    }
78    return it->second;
79  }
80
81  //
82  // Return the key associated with the symbol. If the symbol
83  // does not exists, return -1.
84  int64 Find(const string& symbol) const {
85    return Find(symbol.c_str());
86  }
87
88  //
89  // Return the key associated with the symbol. If the symbol
90  // does not exists, return -1.
91  int64 Find(const char* symbol) const {
92    hash_map<string, int64>::const_iterator it =
93      symbol_map_.find(symbol);
94    if (it == symbol_map_.end()) {
95      return -1;
96    }
97    return it->second;
98  }
99
100  const string& Name() const { return name_; }
101
102  int IncrRefCount() const {
103    return ++ref_count_;
104  }
105  int DecrRefCount() const {
106    return --ref_count_;
107  }
108
109  string CheckSum() const {
110    if (!check_sum_finalized_) {
111      RecomputeCheckSum();
112      check_sum_string_ = check_sum_.Digest();
113    }
114    return check_sum_string_;
115  }
116
117  int64 AvailableKey() const {
118    return available_key_;
119  }
120
121  // private support methods
122 private:
123  void RecomputeCheckSum() const;
124  static SymbolTableImpl* Read1(istream &, const string &);
125
126  string name_;
127  int64 available_key_;
128  vector<const char *> symbols_;
129  hash_map<int64, string> key_map_;
130  hash_map<string, int64> symbol_map_;
131
132  mutable int ref_count_;
133  mutable bool check_sum_finalized_;
134  mutable MD5 check_sum_;
135  mutable string check_sum_string_;
136
137  DISALLOW_EVIL_CONSTRUCTORS(SymbolTableImpl);
138};
139
140
141class SymbolTableIterator;
142
143//
144// \class SymbolTable
145// \brief Symbol (string) to int and reverse mapping
146//
147// The SymbolTable implements the mappings of labels to strings and reverse.
148// SymbolTables are used to describe the alphabet of the input and output
149// labels for arcs in a Finite State Transducer.
150//
151// SymbolTables are reference counted and can therefore be shared across
152// multiple machines. For example a language model grammar G, with a
153// SymbolTable for the words in the language model can share this symbol
154// table with the lexical representation L o G.
155//
156class SymbolTable {
157  friend class SymbolTableIterator;
158 public:
159  static const int64 kNoSymbol = -1;
160
161  // Construct symbol table with a unique name.
162  SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {}
163
164  // Create a reference counted copy.
165  SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
166    impl_->IncrRefCount();
167  }
168
169  // Derefence implentation object. When reference count hits 0, delete
170  // implementation.
171  ~SymbolTable() {
172    if (!impl_->DecrRefCount()) delete impl_;
173  }
174
175  // create a reference counted copy
176  SymbolTable* Copy() const {
177    return new SymbolTable(*this);
178  }
179
180  // Add a symbol with given key to table. A symbol table also
181  // keeps track of the last available key (highest key value in
182  // the symbol table).
183  //
184  // \param symbol string symbol to add
185  // \param key associated key for string symbol
186  // \return the key created by the symbol table. Symbols allready added to
187  //         the symbol table will not get a different key.
188  int64 AddSymbol(const string& symbol, int64 key) {
189    return impl_->AddSymbol(symbol, key);
190  }
191
192  // Add a symbol to the table. The associated value key is automatically
193  // assigned by the symbol table.
194  //
195  // \param symbol string to add to the table
196  // \return the value key assigned to the associated string symbol
197  int64 AddSymbol(const string& symbol) {
198    return impl_->AddSymbol(symbol);
199  }
200
201  // Add another symbol table to this table. All key values will be offset
202  // by the current available key (highest key value in the symbol table).
203  // Note string symbols with the same key value with still have the same
204  // key value after the symbol table has been merged, but a different
205  // value. Adding symbol tables do not result in changes in the base table.
206  //
207  // Merging N symbol tables is often useful when combining the various
208  // name spaces of transducers to a unified representation.
209  //
210  // \param table the symbol table to add to this table
211  void AddTable(const SymbolTable& table) {
212    return impl_->AddTable(table.impl_);
213  }
214
215  // return the name of the symbol table
216  const string& Name() const {
217    return impl_->Name();
218  }
219
220  // return the MD5 check-sum for this table. All new symbols added to
221  // the table will result in an updated checksum.
222  string CheckSum() const {
223    return impl_->CheckSum();
224  }
225
226  // read an ascii representation of the symbol table
227  static SymbolTable* ReadText(const string& filename) {
228    SymbolTableImpl* impl = SymbolTableImpl::ReadText(filename);
229    if (!impl)
230      return 0;
231    else
232      return new SymbolTable(impl);
233  }
234
235  // read a binary dump of the symbol table
236  static SymbolTable* Read(istream &strm, const string& source) {
237    SymbolTableImpl* impl = SymbolTableImpl::Read(strm, source);
238    if (!impl)
239      return 0;
240    else
241      return new SymbolTable(impl);
242  }
243
244  // read a binary dump of the symbol table
245  static SymbolTable* Read(const string& filename) {
246    ifstream strm(filename.c_str());
247    if (!strm) {
248      LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
249      return 0;
250    }
251    return Read(strm, filename);
252  }
253
254  bool Write(ostream  &strm) const {
255    return impl_->Write(strm);
256  }
257
258  bool Write(const string& filename) const {
259    ofstream strm(filename.c_str());
260    if (!strm) {
261      LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
262      return false;
263    }
264    return Write(strm);
265  }
266
267  // Dump an ascii text representation of the symbol table
268  bool WriteText(ostream &strm) const {
269    return impl_->WriteText(strm);
270  }
271
272  // Dump an ascii text representation of the symbol table
273  bool WriteText(const string& filename) const {
274    ofstream strm(filename.c_str());
275    if (!strm) {
276      LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
277      return false;
278    }
279    return WriteText(strm);
280  }
281
282  // Return the string associated with the key. If the key is out of
283  // range (<0, >max), log error and return an empty string.
284  string Find(int64 key) const {
285    return impl_->Find(key);
286  }
287
288  // Return the key associated with the symbol. If the symbol
289  // does not exists, log error and  return -1
290  int64 Find(const string& symbol) const {
291    return impl_->Find(symbol);
292  }
293
294  // Return the key associated with the symbol. If the symbol
295  // does not exists, log error and  return -1
296  int64 Find(const char* symbol) const {
297    return impl_->Find(symbol);
298  }
299
300  // return the current available key (i.e highest key number) in
301  // the symbol table
302  int64 AvailableKey(void) const {
303    return impl_->AvailableKey();
304  }
305
306 protected:
307  explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {}
308
309  const SymbolTableImpl* Impl() const {
310    return impl_;
311  }
312
313 private:
314  SymbolTableImpl* impl_;
315
316
317  void operator=(const SymbolTable &table);  // disallow
318};
319
320
321//
322// \class SymbolTableIterator
323// \brief Iterator class for symbols in a symbol table
324class SymbolTableIterator {
325 public:
326  // Constructor creates a refcounted copy of underlying implementation
327  SymbolTableIterator(const SymbolTable& symbol_table) {
328    impl_ = symbol_table.Impl();
329    impl_->IncrRefCount();
330    pos_ = 0;
331    size_ = impl_->symbols_.size();
332  }
333
334  // decrement implementation refcount, and delete if 0
335  ~SymbolTableIterator() {
336    if (!impl_->DecrRefCount()) delete impl_;
337  }
338
339  // is iterator done
340  bool Done(void) {
341    return (pos_ == size_);
342  }
343
344  // return the Value() of the current symbol (in64 key)
345  int64 Value(void) {
346    return impl_->Find(impl_->symbols_[pos_]);
347  }
348
349  // return the string of the current symbol
350  const char* Symbol(void) {
351    return impl_->symbols_[pos_];
352  }
353
354  // advance iterator forward
355  void Next(void) {
356    if (Done()) return;
357    ++pos_;
358  }
359
360  // reset iterator
361  void Reset(void) {
362    pos_ = 0;
363  }
364
365 private:
366  const SymbolTableImpl* impl_;
367  size_t pos_;
368  size_t size_;
369};
370
371
372// Tests compatibilty between two sets of symbol tables
373inline bool CompatSymbols(const SymbolTable *syms1,
374                          const SymbolTable *syms2) {
375  if (!FLAGS_fst_compat_symbols)
376    return true;
377  else if (!syms1 && !syms2)
378    return true;
379  else if (syms1 && !syms2 || !syms1 && syms2)
380    return false;
381  else
382    return syms1->CheckSum() == syms2->CheckSum();
383}
384
385}  // namespace fst
386
387#endif  // FST_LIB_SYMBOL_TABLE_H__
388