1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// All Rights Reserved.
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author : Johan Schalkwyk
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \file
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Classes to provide symbol-to-integer and integer-to-symbol mappings.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_SYMBOL_TABLE_H__
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_SYMBOL_TABLE_H__
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <cstring>
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string>
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <utility>
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::pair; using std::make_pair;
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/compat.h>
34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <iostream>
35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fstream>
36dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <sstream>
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <map>
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonDECLARE_bool(fst_compat_symbols);
42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WARNING: Reading via symbol table read options should
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//          not be used. This is a temporary work around for
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//          reading symbol ranges of previously stored symbol sets.
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonstruct SymbolTableReadOptions {
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableReadOptions() { }
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableReadOptions(vector<pair<int64, int64> > string_hash_ranges_,
52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                         const string& source_)
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : string_hash_ranges(string_hash_ranges_),
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        source(source_) { }
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  vector<pair<int64, int64> > string_hash_ranges;
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string source;
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
// Options accepted by the text-format entry points declared in this header
// (SymbolTableImpl::ReadText, SymbolTable::ReadText, SymbolTable::WriteText).
60dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinstruct SymbolTableTextOptions {
  // Defined out of line; the initial field values are not visible here.
61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  SymbolTableTextOptions();
62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
  // NOTE(review): presumably whether negative keys are accepted in the text
  // format - confirm against the reader.
63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  bool allow_negative;
  // NOTE(review): presumably the character(s) separating the fields of each
  // text line - confirm against the reader/writer.
64dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  string fst_field_separator;
65dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin};
66dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableImpl {
68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableImpl(const string &name)
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : name_(name),
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        available_key_(0),
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        dense_key_limit_(0),
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        check_sum_finalized_(false) {}
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  explicit SymbolTableImpl(const SymbolTableImpl& impl)
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : name_(impl.name_),
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        available_key_(0),
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        dense_key_limit_(0),
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        check_sum_finalized_(false) {
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (size_t i = 0; i < impl.symbols_.size(); ++i) {
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      AddSymbol(impl.symbols_[i], impl.Find(impl.symbols_[i]));
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  ~SymbolTableImpl() {
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (size_t i = 0; i < symbols_.size(); ++i)
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      delete[] symbols_[i];
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // TODO(johans): Add flag to specify whether the symbol
91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //               should be indexed as string or int or both.
92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 AddSymbol(const string& symbol, int64 key);
93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 AddSymbol(const string& symbol) {
95f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    int64 key = Find(symbol);
96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return (key == -1) ? AddSymbol(symbol, available_key_++) : key;
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
99dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  static SymbolTableImpl* ReadText(
100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      istream &strm, const string &name,
101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      const SymbolTableTextOptions &opts = SymbolTableTextOptions());
102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTableImpl* Read(istream &strm,
104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                               const SymbolTableReadOptions& opts);
105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Write(ostream &strm) const;
107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //
109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the string associated with the key. If the key is out of
110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // range (<0, >max), return an empty string.
111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string Find(int64 key) const {
112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (key >=0 && key < dense_key_limit_)
113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return string(symbols_[key]);
114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    map<int64, const char*>::const_iterator it =
116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        key_map_.find(key);
117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (it == key_map_.end()) {
118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return "";
119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return string(it->second);
121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, return SymbolTable::kNoSymbol.
126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 Find(const string& symbol) const {
127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Find(symbol.c_str());
128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //
131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, return SymbolTable::kNoSymbol.
133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 Find(const char* symbol) const {
134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    map<const char *, int64, StrCmp>::const_iterator it =
135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        symbol_map_.find(symbol);
136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (it == symbol_map_.end()) {
137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return -1;
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return it->second;
140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 GetNthKey(ssize_t pos) const {
143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if ((pos < 0) || (pos >= symbols_.size())) return -1;
144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else return Find(symbols_[pos]);
145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const string& Name() const { return name_; }
148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int IncrRefCount() const {
150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ref_count_.Incr();
151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int DecrRefCount() const {
153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ref_count_.Decr();
154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int RefCount() const {
156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return ref_count_.count();
157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string CheckSum() const {
160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MaybeRecomputeCheckSum();
161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return check_sum_string_;
162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string LabeledCheckSum() const {
165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MaybeRecomputeCheckSum();
166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return labeled_check_sum_string_;
167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 AvailableKey() const {
170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return available_key_;
171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  size_t NumSymbols() const {
174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return symbols_.size();
175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Recomputes the checksums (both of them) if we've had changes since the last
179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // computation (i.e., if check_sum_finalized_ is false).
180dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // Takes ~2.5 microseconds (dbg) or ~230 nanoseconds (opt) on a 2.67GHz Xeon
181dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // if the checksum is up-to-date (requiring no recomputation).
182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void MaybeRecomputeCheckSum() const;
183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  struct StrCmp {
185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    bool operator()(const char *s1, const char *s2) const {
186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return strcmp(s1, s2) < 0;
187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  };
189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string name_;
191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 available_key_;
192f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 dense_key_limit_;
193f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  vector<const char *> symbols_;
194f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  map<int64, const char*> key_map_;
195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  map<const char *, int64, StrCmp> symbol_map_;
196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable RefCounter ref_count_;
198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable bool check_sum_finalized_;
199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable string check_sum_string_;
200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable string labeled_check_sum_string_;
201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  mutable Mutex check_sum_mutex_;
202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTable
206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Symbol (string) to int and reverse mapping
207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// The SymbolTable implements the mappings of labels to strings and reverse.
209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are used to describe the alphabet of the input and output
210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// labels for arcs in a Finite State Transducer.
211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are reference counted and can therefore be shared across
213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// multiple machines. For example a language model grammar G, with a
214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTable for the words in the language model can share this symbol
215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// table with the lexical representation L o G.
216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
217f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTable {
218f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static const int64 kNoSymbol = -1;
220f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
221dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // Constructs a symbol table backed by a new, empty implementation object
  // named "<unspecified>".
222dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  SymbolTable() : impl_(new SymbolTableImpl("<unspecified>")) {}
223dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Constructs a symbol table backed by a new, empty implementation object
  // named `name`. Not declared explicit, so a string converts implicitly.
225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {}
226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Create a reference counted copy.
228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    impl_->IncrRefCount();
230f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
231f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
232f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Derefence implentation object. When reference count hits 0, delete
233f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // implementation.
234f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual ~SymbolTable() {
235f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!impl_->DecrRefCount()) delete impl_;
236f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
238dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // Copys the implemenation from one symbol table to another.
239dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  void operator=(const SymbolTable &st) {
240dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (impl_ != st.impl_) {
241dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      st.impl_->IncrRefCount();
242dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      if (!impl_->DecrRefCount()) delete impl_;
243dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      impl_ = st.impl_;
244dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    }
245dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  }
246dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
247f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Read an ascii representation of the symbol table from an istream. Pass a
248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // name to give the resulting SymbolTable.
249dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  static SymbolTable* ReadText(
250dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      istream &strm, const string& name,
251dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      const SymbolTableTextOptions &opts = SymbolTableTextOptions()) {
252dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    SymbolTableImpl* impl = SymbolTableImpl::ReadText(strm, name, opts);
253f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!impl)
254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
255f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else
256f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return new SymbolTable(impl);
257f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // read an ascii representation of the symbol table
260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* ReadText(const string& filename,
261dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      const SymbolTableTextOptions &opts = SymbolTableTextOptions()) {
262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ifstream strm(filename.c_str(), ifstream::in);
263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::ReadText: Can't open file " << filename;
265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
267dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    return ReadText(strm, filename, opts);
268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
269f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // WARNING: Reading via symbol table read options should
272f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //          not be used. This is a temporary work around.
273f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* Read(istream &strm,
274f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                           const SymbolTableReadOptions& opts) {
275f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    SymbolTableImpl* impl = SymbolTableImpl::Read(strm, opts);
276f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!impl)
277f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
278f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else
279f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return new SymbolTable(impl);
280f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
281f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
282f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // read a binary dump of the symbol table from a stream
283f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* Read(istream &strm, const string& source) {
284f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    SymbolTableReadOptions opts;
285f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    opts.source = source;
286f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Read(strm, opts);
287f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
288f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
289f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // read a binary dump of the symbol table
290f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static SymbolTable* Read(const string& filename) {
291f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
292f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
293f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
294f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return 0;
295f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
296f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Read(strm, filename);
297f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
298f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
299f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //--------------------------------------------------------
300f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Derivable Interface (final)
301f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  //--------------------------------------------------------
302f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // create a reference counted copy
303f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual SymbolTable* Copy() const {
304f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return new SymbolTable(*this);
305f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
306f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
307f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Add a symbol with given key to table. A symbol table also
308f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // keeps track of the last available key (highest key value in
309f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // the symbol table).
310f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 AddSymbol(const string& symbol, int64 key) {
311f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MutateCheck();
312f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->AddSymbol(symbol, key);
313f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
314f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
315f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Add a symbol to the table. The associated value key is automatically
316f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // assigned by the symbol table.
317f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 AddSymbol(const string& symbol) {
318f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    MutateCheck();
319f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->AddSymbol(symbol);
320f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
321f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
322f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Add another symbol table to this table. All key values will be offset
323f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // by the current available key (highest key value in the symbol table).
324f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Note string symbols with the same key value will still have the same
325f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // key value after the symbol table has been merged, but a different
326f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // value. Adding symbol tables does not result in changes in the base table.
327f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual void AddTable(const SymbolTable& table);
328f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
329f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // return the name of the symbol table
330f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual const string& Name() const {
331f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Name();
332f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
333f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
334f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the label-agnostic MD5 check-sum for this table.  All new symbols
335f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // added to the table will result in an updated checksum.
336f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // DEPRECATED.
337f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual string CheckSum() const {
338f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->CheckSum();
339f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
340f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
341f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Same as CheckSum(), but this returns an label-dependent version.
342f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual string LabeledCheckSum() const {
343f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->LabeledCheckSum();
344f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
345f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
346f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual bool Write(ostream &strm) const {
347f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Write(strm);
348f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
349f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
350f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Write(const string& filename) const {
351f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ofstream strm(filename.c_str(), ofstream::out | ofstream::binary);
352f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
353f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
354f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
355f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
356f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return Write(strm);
357f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
358f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
359f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Dump an ascii text representation of the symbol table via a stream
360dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  virtual bool WriteText(
361dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      ostream &strm,
362dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      const SymbolTableTextOptions &opts = SymbolTableTextOptions()) const;
363f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
364f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Dump an ascii text representation of the symbol table
365f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool WriteText(const string& filename) const {
366f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ofstream strm(filename.c_str());
367f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm) {
368f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
369f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
370f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
371f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return WriteText(strm);
372f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
373f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
374f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the string associated with the key. If the key is out of
375f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // range (<0, >max), log error and return an empty string.
376f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual string Find(int64 key) const {
377f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Find(key);
378f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
379f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
380f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
381f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, log error and  return SymbolTable::kNoSymbol
382f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 Find(const string& symbol) const {
383f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Find(symbol);
384f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
385f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
386f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the key associated with the symbol. If the symbol
387f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // does not exists, log error and  return SymbolTable::kNoSymbol
388f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 Find(const char* symbol) const {
389f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->Find(symbol);
390f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
391f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
392f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the current available key (i.e highest key number+1) in
393f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // the symbol table
394f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 AvailableKey(void) const {
395f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->AvailableKey();
396f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
397f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
398f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Return the current number of symbols in table (not necessarily
399f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // equal to AvailableKey())
400f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual size_t NumSymbols(void) const {
401f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->NumSymbols();
402f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
403f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
404f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  virtual int64 GetNthKey(ssize_t pos) const {
405f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_->GetNthKey(pos);
406f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
407f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
408f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
409f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {}
410f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
411f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void MutateCheck() {
412f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    // Copy on write
413f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (impl_->RefCount() > 1) {
414f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      impl_->DecrRefCount();
415f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      impl_ = new SymbolTableImpl(*impl_);
416f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
417f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
418f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
419f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTableImpl* Impl() const {
420f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return impl_;
421f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
422f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
423f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
424f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableImpl* impl_;
425f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
426f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
427f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
428f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
429f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTableIterator
430f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Iterator class for symbols in a symbol table
431f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableIterator {
432f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
433f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTableIterator(const SymbolTable& table)
434f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : table_(table),
435f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        pos_(0),
436f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        nsymbols_(table.NumSymbols()),
437f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        key_(table.GetNthKey(0)) { }
438f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
439f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  ~SymbolTableIterator() { }
440f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
441f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // is iterator done
442f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Done(void) {
443f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return (pos_ == nsymbols_);
444f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
445f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
446f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // return the Value() of the current symbol (int64 key)
447f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 Value(void) {
448f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return key_;
449f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
450f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
451f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // return the string of the current symbol
452f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string Symbol(void) {
453f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return table_.Find(key_);
454f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
455f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
456f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // advance iterator forward
457f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void Next(void) {
458f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    ++pos_;
459f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (pos_ < nsymbols_) key_ = table_.GetNthKey(pos_);
460f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
461f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
462f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // reset iterator
463f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void Reset(void) {
464f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    pos_ = 0;
465f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    key_ = table_.GetNthKey(0);
466f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
467f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
468f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
469f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTable& table_;
470f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  ssize_t pos_;
471f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  size_t nsymbols_;
472f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int64 key_;
473f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
474f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
475f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
476f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Tests compatibilty between two sets of symbol tables
477f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsoninline bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2,
478f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                          bool warning = true) {
479f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!FLAGS_fst_compat_symbols) {
480f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
481f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (!syms1 && !syms2) {
482f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
483f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (syms1 && !syms2) {
484f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (warning)
485f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(WARNING) <<
486f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          "CompatSymbols: first symbol table present but second missing";
487f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return false;
488f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (!syms1 && syms2) {
489f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (warning)
490f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(WARNING) <<
491f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          "CompatSymbols: second symbol table present but first missing";
492f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return false;
493f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (syms1->LabeledCheckSum() != syms2->LabeledCheckSum()) {
494f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (warning)
495f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      LOG(WARNING) << "CompatSymbols: Symbol table check sums do not match";
496f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return false;
497f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
498f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
499f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
500f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
501f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
502f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
503f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Relabels a symbol table as specified by the input vector of pairs
504f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// (old label, new label). The new symbol table only retains symbols
505f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// for which a relabeling is *explicitely* specified.
506f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// TODO(allauzen): consider adding options to allow for some form
507f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// of implicit identity relabeling.
508f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Label>
509f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *RelabelSymbolTable(const SymbolTable *table,
510f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                                const vector<pair<Label, Label> > &pairs) {
511f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SymbolTable *new_table = new SymbolTable(
512f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      table->Name().empty() ? string() :
513f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      (string("relabeled_") + table->Name()));
514f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
515f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  for (size_t i = 0; i < pairs.size(); ++i)
516f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    new_table->AddSymbol(table->Find(pairs[i].first), pairs[i].second);
517f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
518f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  return new_table;
519f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
520f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
521dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Symbol Table Serialization
522dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline void SymbolTableToString(const SymbolTable *table, string *result) {
523dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  ostringstream ostrm;
524dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  table->Write(ostrm);
525dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  *result = ostrm.str();
526dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
527dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
528dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline SymbolTable *StringToSymbolTable(const string &s) {
529dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  istringstream istrm(s);
530dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  return SymbolTable::Read(istrm, SymbolTableReadOptions());
531dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}
532dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
533dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
534dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
535f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
536f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
537f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_LIB_SYMBOL_TABLE_H__
538