symbol-table.h revision f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2
1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License"); 3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License. 4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at 5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// http://www.apache.org/licenses/LICENSE-2.0 7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software 9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS, 10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and 12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License. 13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc. 15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// All Rights Reserved. 16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author : Johan Schalkwyk 18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \file 20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Classes to provide symbol-to-integer and integer-to-symbol mappings. 21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_SYMBOL_TABLE_H__ 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_SYMBOL_TABLE_H__ 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <cstring> 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string> 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <utility> 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::pair; using std::make_pair; 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector> 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector; 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/compat.h> 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <iostream> 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fstream> 36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <map> 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonDECLARE_bool(fst_compat_symbols); 41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WARNING: Reading via symbol table read options should 45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// not be used. This is a temporary work around for 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// reading symbol ranges of previously stored symbol sets. 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonstruct SymbolTableReadOptions { 48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableReadOptions() { } 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableReadOptions(vector<pair<int64, int64> > string_hash_ranges_, 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string& source_) 52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : string_hash_ranges(string_hash_ranges_), 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson source(source_) { } 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<pair<int64, int64> > string_hash_ranges; 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string source; 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableImpl { 60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl(const string &name) 62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : name_(name), 63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson available_key_(0), 64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson dense_key_limit_(0), 65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson check_sum_finalized_(false) {} 66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson explicit SymbolTableImpl(const SymbolTableImpl& impl) 68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : name_(impl.name_), 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson available_key_(0), 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson dense_key_limit_(0), 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson check_sum_finalized_(false) { 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < impl.symbols_.size(); ++i) { 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson AddSymbol(impl.symbols_[i], impl.Find(impl.symbols_[i])); 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ~SymbolTableImpl() { 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < symbols_.size(); ++i) 79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete[] symbols_[i]; 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // TODO(johans): Add flag to specify whether the symbol 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // should be indexed as string or int or both. 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 AddSymbol(const string& symbol, int64 key); 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 AddSymbol(const string& symbol) { 87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 key = Find(symbol); 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return (key == -1) ? AddSymbol(symbol, available_key_++) : key; 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTableImpl* ReadText(istream &strm, 92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &name, 93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative = false); 94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 95f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTableImpl* Read(istream &strm, 96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTableReadOptions& opts); 97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Write(ostream &strm) const; 99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // 101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the string associated with the key. If the key is out of 102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // range (<0, >max), return an empty string. 103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string Find(int64 key) const { 104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (key >=0 && key < dense_key_limit_) 105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return string(symbols_[key]); 106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<int64, const char*>::const_iterator it = 108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key_map_.find(key); 109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (it == key_map_.end()) { 110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ""; 111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return string(it->second); 113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // 116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, return SymbolTable::kNoSymbol. 118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 Find(const string& symbol) const { 119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Find(symbol.c_str()); 120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // 123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, return SymbolTable::kNoSymbol. 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 Find(const char* symbol) const { 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<const char *, int64, StrCmp>::const_iterator it = 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson symbol_map_.find(symbol); 128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (it == symbol_map_.end()) { 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return -1; 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return it->second; 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 GetNthKey(ssize_t pos) const { 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if ((pos < 0) || (pos >= symbols_.size())) return -1; 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else return Find(symbols_[pos]); 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string& Name() const { return name_; } 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int IncrRefCount() const { 142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ref_count_.Incr(); 143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int DecrRefCount() const { 145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ref_count_.Decr(); 146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int RefCount() const { 148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ref_count_.count(); 149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string CheckSum() const { 152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutexLock check_sum_lock(&check_sum_mutex_); 153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MaybeRecomputeCheckSum(); 154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return check_sum_string_; 155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string LabeledCheckSum() const { 158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutexLock check_sum_lock(&check_sum_mutex_); 159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MaybeRecomputeCheckSum(); 160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return labeled_check_sum_string_; 161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 AvailableKey() const { 164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return available_key_; 165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t NumSymbols() const { 168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return symbols_.size(); 169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Recomputes the checksums (both of them) if we've had changes since the last 173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // computation (i.e., if check_sum_finalized_ is false). 174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void MaybeRecomputeCheckSum() const; 175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson struct StrCmp { 177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool operator()(const char *s1, const char *s2) const { 178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return strcmp(s1, s2) < 0; 179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson }; 181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string name_; 183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 available_key_; 184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 dense_key_limit_; 185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<const char *> symbols_; 186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<int64, const char*> key_map_; 187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<const char *, int64, StrCmp> symbol_map_; 188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable RefCounter ref_count_; 190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable bool check_sum_finalized_; 191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable CheckSummer check_sum_; 192f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable CheckSummer labeled_check_sum_; 193f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable string check_sum_string_; 194f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable string labeled_check_sum_string_; 195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson mutable Mutex check_sum_mutex_; 196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTable 200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Symbol (string) to int and reverse mapping 201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// The SymbolTable implements the mappings of labels to strings and reverse. 203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are used to describe the alphabet of the input and output 204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// labels for arcs in a Finite State Transducer. 205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTables are reference counted and can therefore be shared across 207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// multiple machines. For example a language model grammar G, with a 208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// SymbolTable for the words in the language model can share this symbol 209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// table with the lexical representation L o G. 210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTable { 212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static const int64 kNoSymbol = -1; 214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Construct symbol table with a unique name. 216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {} 217f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 218f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Create a reference counted copy. 219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable(const SymbolTable& table) : impl_(table.impl_) { 220f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson impl_->IncrRefCount(); 221f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 222f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 223f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Derefence implentation object. When reference count hits 0, delete 224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // implementation. 225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual ~SymbolTable() { 226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!impl_->DecrRefCount()) delete impl_; 227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Read an ascii representation of the symbol table from an istream. Pass a 230f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // name to give the resulting SymbolTable. 231f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* ReadText(istream &strm, 232f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string& name, 233f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative = false) { 234f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl* impl = SymbolTableImpl::ReadText(strm, 235f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson name, 236f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson allow_negative); 237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!impl) 238f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 239f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else 240f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new SymbolTable(impl); 241f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 242f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 243f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // read an ascii representation of the symbol table 244f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* ReadText(const string& filename, 245f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative = false) { 246f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ifstream strm(filename.c_str(), ifstream::in); 247f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::ReadText: Can't open file " << filename; 249f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 250f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 251f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return ReadText(strm, filename, allow_negative); 252f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 253f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 255f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // WARNING: Reading via symbol table read options should 256f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // not be used. This is a temporary work around. 257f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* Read(istream &strm, 258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTableReadOptions& opts) { 259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl* impl = SymbolTableImpl::Read(strm, opts); 260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!impl) 261f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else 263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new SymbolTable(impl); 264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // read a binary dump of the symbol table from a stream 267f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* Read(istream &strm, const string& source) { 268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableReadOptions opts; 269f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson opts.source = source; 270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Read(strm, opts); 271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 272f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 273f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // read a binary dump of the symbol table 274f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson static SymbolTable* Read(const string& filename) { 275f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ifstream strm(filename.c_str(), ifstream::in | ifstream::binary); 276f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 277f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename; 278f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return 0; 279f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 280f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Read(strm, filename); 281f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 282f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 283f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson //-------------------------------------------------------- 284f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Derivable Interface (final) 285f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson //-------------------------------------------------------- 286f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // create a reference counted copy 287f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual SymbolTable* Copy() const { 288f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new SymbolTable(*this); 289f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 290f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 291f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add a symbol with given key to table. A symbol table also 292f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // keeps track of the last available key (highest key value in 293f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // the symbol table). 294f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 AddSymbol(const string& symbol, int64 key) { 295f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutateCheck(); 296f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->AddSymbol(symbol, key); 297f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 298f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 299f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add a symbol to the table. The associated value key is automatically 300f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // assigned by the symbol table. 301f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 AddSymbol(const string& symbol) { 302f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson MutateCheck(); 303f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->AddSymbol(symbol); 304f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 305f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 306f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add another symbol table to this table. All key values will be offset 307f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // by the current available key (highest key value in the symbol table). 308f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Note string symbols with the same key value with still have the same 309f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // key value after the symbol table has been merged, but a different 310f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // value. Adding symbol tables do not result in changes in the base table. 311f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual void AddTable(const SymbolTable& table); 312f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 313f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // return the name of the symbol table 314f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual const string& Name() const { 315f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Name(); 316f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 317f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 318f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the label-agnostic MD5 check-sum for this table. All new symbols 319f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // added to the table will result in an updated checksum. 320f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // DEPRECATED. 321f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual string CheckSum() const { 322f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->CheckSum(); 323f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 324f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 325f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Same as CheckSum(), but this returns an label-dependent version. 326f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual string LabeledCheckSum() const { 327f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->LabeledCheckSum(); 328f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 329f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 330f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual bool Write(ostream &strm) const { 331f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Write(strm); 332f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 333f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 334f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Write(const string& filename) const { 335f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofstream strm(filename.c_str(), ofstream::out | ofstream::binary); 336f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 337f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename; 338f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 339f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 340f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return Write(strm); 341f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 342f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 343f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Dump an ascii text representation of the symbol table via a stream 344f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual bool WriteText(ostream &strm) const; 345f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 346f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Dump an ascii text representation of the symbol table 347f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool WriteText(const string& filename) const { 348f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofstream strm(filename.c_str()); 349f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm) { 350f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename; 351f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 352f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 353f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return WriteText(strm); 354f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 355f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 356f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the string associated with the key. If the key is out of 357f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // range (<0, >max), log error and return an empty string. 358f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual string Find(int64 key) const { 359f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Find(key); 360f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 361f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 362f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 363f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, log error and return SymbolTable::kNoSymbol 364f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 Find(const string& symbol) const { 365f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Find(symbol); 366f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 367f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 368f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the key associated with the symbol. If the symbol 369f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // does not exists, log error and return SymbolTable::kNoSymbol 370f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 Find(const char* symbol) const { 371f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->Find(symbol); 372f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 373f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 374f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the current available key (i.e highest key number+1) in 375f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // the symbol table 376f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 AvailableKey(void) const { 377f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->AvailableKey(); 378f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 379f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 380f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Return the current number of symbols in table (not necessarily 381f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // equal to AvailableKey()) 382f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual size_t NumSymbols(void) const { 383f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->NumSymbols(); 384f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 385f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 386f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson virtual int64 GetNthKey(ssize_t pos) const { 387f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_->GetNthKey(pos); 388f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 389f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 390f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 391f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {} 392f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 393f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void MutateCheck() { 394f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Copy on write 395f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (impl_->RefCount() > 1) { 396f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson impl_->DecrRefCount(); 397f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson impl_ = new SymbolTableImpl(*impl_); 398f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 399f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 400f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 401f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTableImpl* Impl() const { 402f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return impl_; 403f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 404f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 405f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 406f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableImpl* impl_; 407f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 408f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void operator=(const SymbolTable &table); // disallow 409f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 410f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 411f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 412f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 413f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \class SymbolTableIterator 414f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \brief Iterator class for symbols in a symbol table 415f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass SymbolTableIterator { 416f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 417f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableIterator(const SymbolTable& table) 418f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : table_(table), 419f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson pos_(0), 420f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson nsymbols_(table.NumSymbols()), 421f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key_(table.GetNthKey(0)) { } 422f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 423f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ~SymbolTableIterator() { } 424f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 425f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // is iterator done 426f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Done(void) { 427f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return (pos_ == nsymbols_); 428f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 429f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 430f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // return the Value() of the current symbol (int64 key) 431f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 Value(void) { 432f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return key_; 433f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 434f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 435f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // return the string of the current symbol 436f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string Symbol(void) { 437f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return table_.Find(key_); 438f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 439f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 440f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // advance iterator forward 441f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void Next(void) { 442f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ++pos_; 443f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (pos_ < nsymbols_) key_ = table_.GetNthKey(pos_); 444f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 445f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 446f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // reset iterator 447f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void Reset(void) { 448f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson pos_ = 0; 449f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key_ = table_.GetNthKey(0); 450f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 451f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 452f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 453f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable& table_; 454f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ssize_t pos_; 455f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t nsymbols_; 456f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 key_; 457f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 458f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 459f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 460f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Tests compatibilty between two sets of symbol tables 461f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsoninline bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2, 462f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool warning = true) { 463f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!FLAGS_fst_compat_symbols) { 464f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 465f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (!syms1 && !syms2) { 466f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 467f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (syms1 && !syms2) { 468f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (warning) 469f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(WARNING) << 470f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson "CompatSymbols: first symbol table present but second missing"; 471f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 472f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (!syms1 && syms2) { 473f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (warning) 474f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(WARNING) << 475f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson "CompatSymbols: second symbol table present but first missing"; 476f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 477f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (syms1->LabeledCheckSum() != syms2->LabeledCheckSum()) { 478f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (warning) 479f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(WARNING) << "CompatSymbols: Symbol table check sums do not match"; 480f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 481f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 482f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 483f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 484f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 485f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 486f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 487f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Relabels a symbol table as specified by the input vector of pairs 488f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// (old label, new label). The new symbol table only retains symbols 489f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// for which a relabeling is *explicitely* specified. 490f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// TODO(allauzen): consider adding options to allow for some form 491f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// of implicit identity relabeling. 492f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Label> 493f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *RelabelSymbolTable(const SymbolTable *table, 494f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const vector<pair<Label, Label> > &pairs) { 495f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable *new_table = new SymbolTable( 496f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson table->Name().empty() ? string() : 497f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson (string("relabeled_") + table->Name())); 498f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 499f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < pairs.size(); ++i) 500f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson new_table->AddSymbol(table->Find(pairs[i].first), pairs[i].second); 501f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 502f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return new_table; 503f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 504f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 505f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 506f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 507f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_LIB_SYMBOL_TABLE_H__ 508