
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// All Rights Reserved.
//
// Author : Johan Schalkwyk
//
// \file
// Classes to provide symbol-to-integer and integer-to-symbol mappings.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_SYMBOL_TABLE_H__ 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_SYMBOL_TABLE_H__ 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <cstring> 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string> 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <utility> 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::pair; using std::make_pair; 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector> 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector; 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/compat.h> 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <iostream> 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fstream> 36dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <sstream> 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <map> 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonDECLARE_bool(fst_compat_symbols); 42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WARNING: Reading via symbol table read options should 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// not be used. This is a temporary work around for 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// reading symbol ranges of previously stored symbol sets. 
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonstruct SymbolTableReadOptions { 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableReadOptions() { } 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableReadOptions(vector<pair<int64, int64> > string_hash_ranges_, 52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string& source_) 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : string_hash_ranges(string_hash_ranges_), 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson source(source_) { } 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<pair<int64, int64> > string_hash_ranges; 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string source; 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 60dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkinstruct SymbolTableTextOptions { 61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SymbolTableTextOptions(); 62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool allow_negative; 64dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin string fst_field_separator; 65dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin}; 66dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableImpl { 68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl(const string &name) 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : name_(name), 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson available_key_(0), 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson dense_key_limit_(0), 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson check_sum_finalized_(false) {} 
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson explicit SymbolTableImpl(const SymbolTableImpl& impl) 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : name_(impl.name_), 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson available_key_(0), 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson dense_key_limit_(0), 79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson check_sum_finalized_(false) { 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < impl.symbols_.size(); ++i) { 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson AddSymbol(impl.symbols_[i], impl.Find(impl.symbols_[i])); 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ~SymbolTableImpl() { 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < symbols_.size(); ++i) 87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete[] symbols_[i]; 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // TODO(johans): Add flag to specify whether the symbol 91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // should be indexed as string or int or both. 92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 AddSymbol(const string& symbol, int64 key); 93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 AddSymbol(const string& symbol) { 95f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 key = Find(symbol); 96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return (key == -1) ? 
AddSymbol(symbol, available_key_++) : key; 97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 99dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static SymbolTableImpl* ReadText( 100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin istream &strm, const string &name, 101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const SymbolTableTextOptions &opts = SymbolTableTextOptions()); 102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTableImpl* Read(istream &strm, 104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTableReadOptions& opts); 105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Write(ostream &strm) const; 107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // 109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the string associated with the key. If the key is out of 110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // range (<0, >max), return an empty string. 
111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string Find(int64 key) const { 112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (key >=0 && key < dense_key_limit_) 113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return string(symbols_[key]); 114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<int64, const char*>::const_iterator it = 116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key_map_.find(key); 117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (it == key_map_.end()) { 118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ""; 119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return string(it->second); 121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, return SymbolTable::kNoSymbol. 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 Find(const string& symbol) const { 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Find(symbol.c_str()); 128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // 131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, return SymbolTable::kNoSymbol. 
133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 Find(const char* symbol) const { 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<const char *, int64, StrCmp>::const_iterator it = 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson symbol_map_.find(symbol); 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (it == symbol_map_.end()) { 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return -1; 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return it->second; 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 GetNthKey(ssize_t pos) const { 143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if ((pos < 0) || (pos >= symbols_.size())) return -1; 144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else return Find(symbols_[pos]); 145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string& Name() const { return name_; } 148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int IncrRefCount() const { 150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ref_count_.Incr(); 151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int DecrRefCount() const { 153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ref_count_.Decr(); 154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int RefCount() const { 156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ref_count_.count(); 157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string CheckSum() const 
{ 160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MaybeRecomputeCheckSum(); 161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return check_sum_string_; 162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string LabeledCheckSum() const { 165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MaybeRecomputeCheckSum(); 166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return labeled_check_sum_string_; 167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 AvailableKey() const { 170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return available_key_; 171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t NumSymbols() const { 174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return symbols_.size(); 175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Recomputes the checksums (both of them) if we've had changes since the last 179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // computation (i.e., if check_sum_finalized_ is false). 180dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Takes ~2.5 microseconds (dbg) or ~230 nanoseconds (opt) on a 2.67GHz Xeon 181dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // if the checksum is up-to-date (requiring no recomputation). 
182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void MaybeRecomputeCheckSum() const; 183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson struct StrCmp { 185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool operator()(const char *s1, const char *s2) const { 186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return strcmp(s1, s2) < 0; 187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson }; 189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string name_; 191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 available_key_; 192f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 dense_key_limit_; 193f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<const char *> symbols_; 194f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<int64, const char*> key_map_; 195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<const char *, int64, StrCmp> symbol_map_; 196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable RefCounter ref_count_; 198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable bool check_sum_finalized_; 199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable string check_sum_string_; 200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable string labeled_check_sum_string_; 201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable Mutex check_sum_mutex_; 202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTable 206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Symbol (string) to int and reverse mapping 207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 
208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// The SymbolTable implements the mappings of labels to strings and reverse. 209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are used to describe the alphabet of the input and output 210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// labels for arcs in a Finite State Transducer. 211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are reference counted and can therefore be shared across 213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// multiple machines. For example a language model grammar G, with a 214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTable for the words in the language model can share this symbol 215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// table with the lexical representation L o G. 216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 217f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTable { 218f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static const int64 kNoSymbol = -1; 220f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 221dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Construct symbol table with an unspecified name. 222dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SymbolTable() : impl_(new SymbolTableImpl("<unspecified>")) {} 223dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Construct symbol table with a unique name. 225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {} 226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Create a reference counted copy. 
228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable(const SymbolTable& table) : impl_(table.impl_) { 229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson impl_->IncrRefCount(); 230f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 231f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 232f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Derefence implentation object. When reference count hits 0, delete 233f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // implementation. 234f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual ~SymbolTable() { 235f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!impl_->DecrRefCount()) delete impl_; 236f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 238dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin // Copys the implemenation from one symbol table to another. 239dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin void operator=(const SymbolTable &st) { 240dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (impl_ != st.impl_) { 241dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin st.impl_->IncrRefCount(); 242dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!impl_->DecrRefCount()) delete impl_; 243dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin impl_ = st.impl_; 244dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 245dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 246dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 247f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Read an ascii representation of the symbol table from an istream. Pass a 248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // name to give the resulting SymbolTable. 
249dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin static SymbolTable* ReadText( 250dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin istream &strm, const string& name, 251dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const SymbolTableTextOptions &opts = SymbolTableTextOptions()) { 252dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SymbolTableImpl* impl = SymbolTableImpl::ReadText(strm, name, opts); 253f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!impl) 254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 255f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else 256f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new SymbolTable(impl); 257f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // read an ascii representation of the symbol table 260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* ReadText(const string& filename, 261dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const SymbolTableTextOptions &opts = SymbolTableTextOptions()) { 262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ifstream strm(filename.c_str(), ifstream::in); 263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::ReadText: Can't open file " << filename; 265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 267dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return ReadText(strm, filename, opts); 268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 269f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // WARNING: Reading via symbol table read options should 272f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 
// not be used. This is a temporary work around. 273f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* Read(istream &strm, 274f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTableReadOptions& opts) { 275f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl* impl = SymbolTableImpl::Read(strm, opts); 276f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!impl) 277f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 278f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else 279f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new SymbolTable(impl); 280f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 281f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 282f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // read a binary dump of the symbol table from a stream 283f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* Read(istream &strm, const string& source) { 284f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableReadOptions opts; 285f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson opts.source = source; 286f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Read(strm, opts); 287f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 288f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 289f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // read a binary dump of the symbol table 290f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* Read(const string& filename) { 291f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ifstream strm(filename.c_str(), ifstream::in | ifstream::binary); 292f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 293f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename; 294f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 295f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 296f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 
Read(strm, filename); 297f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 298f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 299f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson //-------------------------------------------------------- 300f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Derivable Interface (final) 301f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson //-------------------------------------------------------- 302f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // create a reference counted copy 303f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual SymbolTable* Copy() const { 304f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new SymbolTable(*this); 305f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 306f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 307f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add a symbol with given key to table. A symbol table also 308f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // keeps track of the last available key (highest key value in 309f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // the symbol table). 310f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 AddSymbol(const string& symbol, int64 key) { 311f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutateCheck(); 312f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->AddSymbol(symbol, key); 313f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 314f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 315f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add a symbol to the table. The associated value key is automatically 316f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // assigned by the symbol table. 
317f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 AddSymbol(const string& symbol) { 318f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutateCheck(); 319f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->AddSymbol(symbol); 320f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 321f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 322f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add another symbol table to this table. All key values will be offset 323f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // by the current available key (highest key value in the symbol table). 324f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Note string symbols with the same key value with still have the same 325f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // key value after the symbol table has been merged, but a different 326f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // value. Adding symbol tables do not result in changes in the base table. 327f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual void AddTable(const SymbolTable& table); 328f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 329f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // return the name of the symbol table 330f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual const string& Name() const { 331f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Name(); 332f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 333f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 334f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the label-agnostic MD5 check-sum for this table. All new symbols 335f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // added to the table will result in an updated checksum. 336f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // DEPRECATED. 
337f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual string CheckSum() const { 338f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->CheckSum(); 339f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 340f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 341f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Same as CheckSum(), but this returns an label-dependent version. 342f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual string LabeledCheckSum() const { 343f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->LabeledCheckSum(); 344f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 345f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 346f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual bool Write(ostream &strm) const { 347f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Write(strm); 348f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 349f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 350f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Write(const string& filename) const { 351f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofstream strm(filename.c_str(), ofstream::out | ofstream::binary); 352f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 353f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename; 354f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 355f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 356f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Write(strm); 357f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 358f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 359f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Dump an ascii text representation of the symbol table via a stream 360dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin virtual bool WriteText( 361dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ostream &strm, 
362dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const SymbolTableTextOptions &opts = SymbolTableTextOptions()) const; 363f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 364f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Dump an ascii text representation of the symbol table 365f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool WriteText(const string& filename) const { 366f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofstream strm(filename.c_str()); 367f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 368f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename; 369f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 370f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 371f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return WriteText(strm); 372f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 373f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 374f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the string associated with the key. If the key is out of 375f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // range (<0, >max), log error and return an empty string. 376f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual string Find(int64 key) const { 377f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Find(key); 378f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 379f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 380f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. 
If the symbol 381f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, log error and return SymbolTable::kNoSymbol 382f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 Find(const string& symbol) const { 383f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Find(symbol); 384f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 385f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 386f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 387f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, log error and return SymbolTable::kNoSymbol 388f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 Find(const char* symbol) const { 389f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Find(symbol); 390f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 391f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 392f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the current available key (i.e highest key number+1) in 393f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // the symbol table 394f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 AvailableKey(void) const { 395f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->AvailableKey(); 396f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 397f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 398f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the current number of symbols in table (not necessarily 399f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // equal to AvailableKey()) 400f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual size_t NumSymbols(void) const { 401f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->NumSymbols(); 402f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 403f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 404f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 
GetNthKey(ssize_t pos) const { 405f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->GetNthKey(pos); 406f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 407f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 408f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 409f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {} 410f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 411f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void MutateCheck() { 412f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Copy on write 413f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (impl_->RefCount() > 1) { 414f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson impl_->DecrRefCount(); 415f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson impl_ = new SymbolTableImpl(*impl_); 416f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 417f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 418f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 419f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTableImpl* Impl() const { 420f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_; 421f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 422f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 423f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 424f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl* impl_; 425f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 426f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 427f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 428f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 429f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTableIterator 430f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Iterator class for symbols in a symbol table 431f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableIterator { 432f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 
433f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableIterator(const SymbolTable& table) 434f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : table_(table), 435f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson pos_(0), 436f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson nsymbols_(table.NumSymbols()), 437f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key_(table.GetNthKey(0)) { } 438f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 439f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ~SymbolTableIterator() { } 440f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 441f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // is iterator done 442f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Done(void) { 443f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return (pos_ == nsymbols_); 444f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 445f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 446f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // return the Value() of the current symbol (int64 key) 447f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 Value(void) { 448f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return key_; 449f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 450f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 451f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // return the string of the current symbol 452f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string Symbol(void) { 453f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return table_.Find(key_); 454f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 455f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 456f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // advance iterator forward 457f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void Next(void) { 458f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ++pos_; 459f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (pos_ < nsymbols_) key_ = table_.GetNthKey(pos_); 
460f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 461f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 462f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // reset iterator 463f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void Reset(void) { 464f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson pos_ = 0; 465f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key_ = table_.GetNthKey(0); 466f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 467f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 468f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 469f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable& table_; 470f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ssize_t pos_; 471f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t nsymbols_; 472f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 key_; 473f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 474f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 475f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 476f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Tests compatibilty between two sets of symbol tables 477f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsoninline bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2, 478f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool warning = true) { 479f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!FLAGS_fst_compat_symbols) { 480f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 481f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (!syms1 && !syms2) { 482f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 483f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (syms1 && !syms2) { 484f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (warning) 485f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(WARNING) << 486f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson "CompatSymbols: first symbol table present but second missing"; 
487f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 488f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (!syms1 && syms2) { 489f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (warning) 490f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(WARNING) << 491f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson "CompatSymbols: second symbol table present but first missing"; 492f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 493f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (syms1->LabeledCheckSum() != syms2->LabeledCheckSum()) { 494f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (warning) 495f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(WARNING) << "CompatSymbols: Symbol table check sums do not match"; 496f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 497f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 498f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 499f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 500f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 501f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 502f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 503f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Relabels a symbol table as specified by the input vector of pairs 504f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// (old label, new label). The new symbol table only retains symbols 505f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// for which a relabeling is *explicitely* specified. 506f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// TODO(allauzen): consider adding options to allow for some form 507f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// of implicit identity relabeling. 
508f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Label> 509f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *RelabelSymbolTable(const SymbolTable *table, 510f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const vector<pair<Label, Label> > &pairs) { 511f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable *new_table = new SymbolTable( 512f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson table->Name().empty() ? string() : 513f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson (string("relabeled_") + table->Name())); 514f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 515f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < pairs.size(); ++i) 516f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson new_table->AddSymbol(table->Find(pairs[i].first), pairs[i].second); 517f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 518f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new_table; 519f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 520f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 521dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Symbol Table Serialization 522dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline void SymbolTableToString(const SymbolTable *table, string *result) { 523dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin ostringstream ostrm; 524dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin table->Write(ostrm); 525dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin *result = ostrm.str(); 526dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 527dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 528dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkininline SymbolTable *StringToSymbolTable(const string &s) { 529dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin istringstream istrm(s); 530dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return SymbolTable::Read(istrm, SymbolTableReadOptions()); 
531dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin} 532dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 533dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 534dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 535f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 536f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 537f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_LIB_SYMBOL_TABLE_H__ 538