symbol-table.h revision f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2
1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// All Rights Reserved.
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author : Johan Schalkwyk
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \file
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Classes to provide symbol-to-integer and integer-to-symbol mappings.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_SYMBOL_TABLE_H__
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_SYMBOL_TABLE_H__
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <cstring>
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string>
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <utility>
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::pair; using std::make_pair;
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/compat.h>
34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <iostream>
35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fstream>
36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <map>
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonDECLARE_bool(fst_compat_symbols);
41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WARNING: Reading via symbol table read options should
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//          not be used. This is a temporary work around for
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//          reading symbol ranges of previously stored symbol sets.
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonstruct SymbolTableReadOptions {
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableReadOptions() { }
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableReadOptions(vector<pair<int64, int64> > string_hash_ranges_,
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                         const string& source_)
52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : string_hash_ranges(string_hash_ranges_),
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        source(source_) { }
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  vector<pair<int64, int64> > string_hash_ranges;
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string source;
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableImpl {
60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableImpl(const string &name)
62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : name_(name),
63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        available_key_(0),
64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        dense_key_limit_(0),
65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        check_sum_finalized_(false) {}
66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  explicit SymbolTableImpl(const SymbolTableImpl& impl)
68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : name_(impl.name_),
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        available_key_(0),
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        dense_key_limit_(0),
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        check_sum_finalized_(false) {
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (size_t i = 0; i < impl.symbols_.size(); ++i) {
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      AddSymbol(impl.symbols_[i], impl.Find(impl.symbols_[i]));
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  ~SymbolTableImpl() {
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (size_t i = 0; i < symbols_.size(); ++i)
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      delete[] symbols_[i];
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // TODO(johans): Add flag to specify whether the symbol
83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //               should be indexed as string or int or both.
84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 AddSymbol(const string& symbol, int64 key);
85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 AddSymbol(const string& symbol) {
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    int64 key = Find(symbol);
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return (key == -1) ? AddSymbol(symbol, available_key_++) : key;
89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTableImpl* ReadText(istream &strm,
92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                   const string &name,
93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                   bool allow_negative = false);
94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
95f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTableImpl* Read(istream &strm,
96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                               const SymbolTableReadOptions& opts);
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Write(ostream &strm) const;
99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //
101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the string associated with the key. If the key is out of
102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // range (<0, >max), return an empty string.
103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string Find(int64 key) const {
104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (key >=0 && key < dense_key_limit_)
105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return string(symbols_[key]);
106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    map<int64, const char*>::const_iterator it =
108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        key_map_.find(key);
109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (it == key_map_.end()) {
110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return "";
111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return string(it->second);
113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //
116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, return SymbolTable::kNoSymbol.
118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 Find(const string& symbol) const {
119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Find(symbol.c_str());
120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //
123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, return SymbolTable::kNoSymbol.
125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 Find(const char* symbol) const {
126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    map<const char *, int64, StrCmp>::const_iterator it =
127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        symbol_map_.find(symbol);
128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (it == symbol_map_.end()) {
129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return -1;
130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return it->second;
132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 GetNthKey(ssize_t pos) const {
135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if ((pos < 0) || (pos >= symbols_.size())) return -1;
136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else return Find(symbols_[pos]);
137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const string& Name() const { return name_; }
140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int IncrRefCount() const {
142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ref_count_.Incr();
143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int DecrRefCount() const {
145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ref_count_.Decr();
146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int RefCount() const {
148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ref_count_.count();
149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string CheckSum() const {
152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MutexLock check_sum_lock(&check_sum_mutex_);
153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MaybeRecomputeCheckSum();
154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return check_sum_string_;
155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string LabeledCheckSum() const {
158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MutexLock check_sum_lock(&check_sum_mutex_);
159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MaybeRecomputeCheckSum();
160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return labeled_check_sum_string_;
161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 AvailableKey() const {
164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return available_key_;
165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  size_t NumSymbols() const {
168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return symbols_.size();
169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Recomputes the checksums (both of them) if we've had changes since the last
173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // computation (i.e., if check_sum_finalized_ is false).
174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void MaybeRecomputeCheckSum() const;
175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  struct StrCmp {
177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    bool operator()(const char *s1, const char *s2) const {
178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return strcmp(s1, s2) < 0;
179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  };
181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string name_;
183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 available_key_;
184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 dense_key_limit_;
185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  vector<const char *> symbols_;
186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  map<int64, const char*> key_map_;
187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  map<const char *, int64, StrCmp> symbol_map_;
188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable RefCounter ref_count_;
190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable bool check_sum_finalized_;
191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable CheckSummer check_sum_;
192f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable CheckSummer labeled_check_sum_;
193f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable string check_sum_string_;
194f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable string labeled_check_sum_string_;
195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable Mutex check_sum_mutex_;
196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTable
200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Symbol (string) to int and reverse mapping
201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// The SymbolTable implements the mappings of labels to strings and reverse.
203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are used to describe the alphabet of the input and output
204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// labels for arcs in a Finite State Transducer.
205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are reference counted and can therefore be shared across
207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// multiple machines. For example a language model grammar G, with a
208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTable for the words in the language model can share this symbol
209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// table with the lexical representation L o G.
210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTable {
212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static const int64 kNoSymbol = -1;
214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Construct symbol table with a unique name.
216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {}
217f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
218f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Create a reference counted copy.
219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
220f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    impl_->IncrRefCount();
221f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
222f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
223f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Derefence implentation object. When reference count hits 0, delete
224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // implementation.
225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual ~SymbolTable() {
226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!impl_->DecrRefCount()) delete impl_;
227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Read an ascii representation of the symbol table from an istream. Pass a
230f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // name to give the resulting SymbolTable.
231f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* ReadText(istream &strm,
232f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                               const string& name,
233f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                               bool allow_negative = false) {
234f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    SymbolTableImpl* impl = SymbolTableImpl::ReadText(strm,
235f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                                      name,
236f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                                      allow_negative);
237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!impl)
238f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
239f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else
240f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return new SymbolTable(impl);
241f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
242f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
243f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // read an ascii representation of the symbol table
244f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* ReadText(const string& filename,
245f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                               bool allow_negative = false) {
246f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ifstream strm(filename.c_str(), ifstream::in);
247f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::ReadText: Can't open file " << filename;
249f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
250f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
251f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ReadText(strm, filename, allow_negative);
252f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
253f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
255f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // WARNING: Reading via symbol table read options should
256f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //          not be used. This is a temporary work around.
257f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* Read(istream &strm,
258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                           const SymbolTableReadOptions& opts) {
259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    SymbolTableImpl* impl = SymbolTableImpl::Read(strm, opts);
260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!impl)
261f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else
263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return new SymbolTable(impl);
264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // read a binary dump of the symbol table from a stream
267f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* Read(istream &strm, const string& source) {
268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    SymbolTableReadOptions opts;
269f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    opts.source = source;
270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Read(strm, opts);
271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
272f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
273f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // read a binary dump of the symbol table
274f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* Read(const string& filename) {
275f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
276f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
277f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
278f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
279f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
280f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Read(strm, filename);
281f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
282f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
283f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //--------------------------------------------------------
284f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Derivable Interface (final)
285f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //--------------------------------------------------------
286f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // create a reference counted copy
287f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual SymbolTable* Copy() const {
288f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return new SymbolTable(*this);
289f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
290f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
291f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Add a symbol with given key to table. A symbol table also
292f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // keeps track of the last available key (highest key value in
293f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // the symbol table).
294f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 AddSymbol(const string& symbol, int64 key) {
295f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MutateCheck();
296f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->AddSymbol(symbol, key);
297f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
298f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
299f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Add a symbol to the table. The associated value key is automatically
300f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // assigned by the symbol table.
301f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 AddSymbol(const string& symbol) {
302f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MutateCheck();
303f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->AddSymbol(symbol);
304f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
305f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
306f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Add another symbol table to this table. All key values will be offset
307f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // by the current available key (highest key value in the symbol table).
308f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Note string symbols with the same key value with still have the same
309f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // key value after the symbol table has been merged, but a different
310f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // value. Adding symbol tables do not result in changes in the base table.
311f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual void AddTable(const SymbolTable& table);
312f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
313f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // return the name of the symbol table
314f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual const string& Name() const {
315f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Name();
316f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
317f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
318f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the label-agnostic MD5 check-sum for this table.  All new symbols
319f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // added to the table will result in an updated checksum.
320f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // DEPRECATED.
321f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual string CheckSum() const {
322f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->CheckSum();
323f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
324f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
325f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Same as CheckSum(), but this returns an label-dependent version.
326f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual string LabeledCheckSum() const {
327f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->LabeledCheckSum();
328f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
329f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
330f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual bool Write(ostream &strm) const {
331f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Write(strm);
332f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
333f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
334f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Write(const string& filename) const {
335f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ofstream strm(filename.c_str(), ofstream::out | ofstream::binary);
336f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
337f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
338f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
339f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
340f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Write(strm);
341f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
342f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
343f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Dump an ascii text representation of the symbol table via a stream
344f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual bool WriteText(ostream &strm) const;
345f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
346f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Dump an ascii text representation of the symbol table
347f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool WriteText(const string& filename) const {
348f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ofstream strm(filename.c_str());
349f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
350f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
351f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
352f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
353f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return WriteText(strm);
354f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
355f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
356f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the string associated with the key. If the key is out of
357f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // range (<0, >max), log error and return an empty string.
358f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual string Find(int64 key) const {
359f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Find(key);
360f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
361f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
362f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
363f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, log error and  return SymbolTable::kNoSymbol
364f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 Find(const string& symbol) const {
365f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Find(symbol);
366f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
367f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
368f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
369f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, log error and  return SymbolTable::kNoSymbol
370f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 Find(const char* symbol) const {
371f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Find(symbol);
372f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
373f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
374f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the current available key (i.e highest key number+1) in
375f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // the symbol table
376f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 AvailableKey(void) const {
377f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->AvailableKey();
378f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
379f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
380f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the current number of symbols in table (not necessarily
381f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // equal to AvailableKey())
382f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual size_t NumSymbols(void) const {
383f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->NumSymbols();
384f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
385f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
386f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 GetNthKey(ssize_t pos) const {
387f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->GetNthKey(pos);
388f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
389f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
390f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
391f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {}
392f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
393f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void MutateCheck() {
394f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    // Copy on write
395f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (impl_->RefCount() > 1) {
396f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      impl_->DecrRefCount();
397f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      impl_ = new SymbolTableImpl(*impl_);
398f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
399f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
400f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
401f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTableImpl* Impl() const {
402f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_;
403f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
404f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
405f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
406f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableImpl* impl_;
407f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
408f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void operator=(const SymbolTable &table);  // disallow
409f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
410f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
411f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
412f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
413f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTableIterator
414f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Iterator class for symbols in a symbol table
415f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableIterator {
416f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
417f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableIterator(const SymbolTable& table)
418f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : table_(table),
419f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        pos_(0),
420f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        nsymbols_(table.NumSymbols()),
421f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        key_(table.GetNthKey(0)) { }
422f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
423f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  ~SymbolTableIterator() { }
424f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
425f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // is iterator done
426f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Done(void) {
427f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return (pos_ == nsymbols_);
428f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
429f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
430f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // return the Value() of the current symbol (int64 key)
431f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 Value(void) {
432f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return key_;
433f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
434f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
435f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // return the string of the current symbol
436f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string Symbol(void) {
437f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return table_.Find(key_);
438f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
439f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
440f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // advance iterator forward
441f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void Next(void) {
442f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ++pos_;
443f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (pos_ < nsymbols_) key_ = table_.GetNthKey(pos_);
444f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
445f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
446f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // reset iterator
447f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void Reset(void) {
448f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    pos_ = 0;
449f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    key_ = table_.GetNthKey(0);
450f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
451f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
452f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
453f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTable& table_;
454f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  ssize_t pos_;
455f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  size_t nsymbols_;
456f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 key_;
457f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
458f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
459f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
460f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Tests compatibilty between two sets of symbol tables
461f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsoninline bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2,
462f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                          bool warning = true) {
463f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!FLAGS_fst_compat_symbols) {
464f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
465f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (!syms1 && !syms2) {
466f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
467f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (syms1 && !syms2) {
468f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (warning)
469f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(WARNING) <<
470f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          "CompatSymbols: first symbol table present but second missing";
471f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return false;
472f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (!syms1 && syms2) {
473f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (warning)
474f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(WARNING) <<
475f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          "CompatSymbols: second symbol table present but first missing";
476f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return false;
477f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (syms1->LabeledCheckSum() != syms2->LabeledCheckSum()) {
478f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (warning)
479f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(WARNING) << "CompatSymbols: Symbol table check sums do not match";
480f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return false;
481f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
482f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
483f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
484f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
485f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
486f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
487f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Relabels a symbol table as specified by the input vector of pairs
488f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// (old label, new label). The new symbol table only retains symbols
489f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// for which a relabeling is *explicitely* specified.
490f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// TODO(allauzen): consider adding options to allow for some form
491f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// of implicit identity relabeling.
492f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Label>
493f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *RelabelSymbolTable(const SymbolTable *table,
494f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                const vector<pair<Label, Label> > &pairs) {
495f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable *new_table = new SymbolTable(
496f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      table->Name().empty() ? string() :
497f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      (string("relabeled_") + table->Name()));
498f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
499f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  for (size_t i = 0; i < pairs.size(); ++i)
500f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    new_table->AddSymbol(table->Find(pairs[i].first), pairs[i].second);
501f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
502f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  return new_table;
503f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
504f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
505f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
506f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
507f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_LIB_SYMBOL_TABLE_H__
508