1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License"); 3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License. 4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at 5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// http://www.apache.org/licenses/LICENSE-2.0 7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software 9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS, 10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and 12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License. 13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc. 15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author: sorenj@google.com (Jeffrey Sorensen) 16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/symbol-table-ops.h> 18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *MergeSymbolTable(const SymbolTable &left, const SymbolTable &right, 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool *right_relabel_output) { 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // MergeSymbolTable detects several special cases. It will return a reference 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // copied version of SymbolTable of left or right if either symbol table is 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // a superset of the other. 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable *merged = new SymbolTable("merge_" + left.Name() + "_" + 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson right.Name()); 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // copy everything from the left symbol table 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool left_has_all = true, right_has_all = true, relabel = false; 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableIterator liter(left); 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (; !liter.Done(); liter.Next()) { 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson merged->AddSymbol(liter.Symbol(), liter.Value()); 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (right_has_all) { 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 key = right.Find(liter.Symbol()); 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (key == -1) { 36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson right_has_all = false; 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (!relabel && key != liter.Value()) { 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson relabel = true; 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (right_has_all) { 43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete merged; 44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (right_relabel_output != NULL) { 45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson *right_relabel_output = relabel; 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return right.Copy(); 48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // add all symbols we can from right symbol table 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<string> conflicts; 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableIterator riter(right); 52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (; !riter.Done(); riter.Next()) { 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 key = merged->Find(riter.Symbol()); 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (key != -1) { 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Symbol already exists, maybe with different value 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (key != riter.Value()) { 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson relabel = true; 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson continue; 60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Symbol doesn't exist from left 62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson left_has_all = false; 63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!merged->Find(riter.Value()).empty()) { 64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // we can't add this where we want to, add it later, in order 65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson conflicts.push_back(riter.Symbol()); 66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson continue; 67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // there is a hole and we can add this symbol with its id 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson merged->AddSymbol(riter.Symbol(), riter.Value()); 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (right_relabel_output != NULL) { 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson *right_relabel_output = relabel; 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (left_has_all) { 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete merged; 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return left.Copy(); 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Add all symbols that conflicted, in order 79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (int i= 0; i < conflicts.size(); ++i) { 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson merged->AddSymbol(conflicts[i]); 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return merged; 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *CompactSymbolTable(const SymbolTable &syms) { 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson map<int, string> sorted; 87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTableIterator stiter(syms); 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (; !stiter.Done(); stiter.Next()) { 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson sorted[stiter.Value()] = stiter.Symbol(); 90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable *compact = new SymbolTable(syms.Name() + "_compact"); 92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson uint64 newkey = 0; 93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (map<int, string>::const_iterator si = sorted.begin(); 94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson si != sorted.end(); ++si) { 95f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson compact->AddSymbol(si->second, newkey++); 96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return compact; 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonSymbolTable *FstReadSymbols(const string &filename, bool input_symbols) { 101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ifstream in(filename.c_str(), ifstream::in | ifstream::binary); 102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!in) { 103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "FstReadSymbols: Can't open file " << filename; 104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FstHeader hdr; 107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!hdr.Read(in, filename)) { 108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "FstReadSymbols: Couldn't read header from " << filename; 109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (hdr.GetFlags() & FstHeader::HAS_ISYMBOLS) { 112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable *isymbols = SymbolTable::Read(in, filename); 113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (isymbols == NULL) { 114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "FstReadSymbols: Could not read input symbols from " 115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << filename; 116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (input_symbols) { 119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return isymbols; 120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete isymbols; 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (hdr.GetFlags() & FstHeader::HAS_OSYMBOLS) { 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SymbolTable *osymbols = SymbolTable::Read(in, filename); 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (osymbols == NULL) { 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "FstReadSymbols: Could not read output symbols from " 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << filename; 128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!input_symbols) { 131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return osymbols; 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete osymbols; 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson LOG(ERROR) << "FstReadSymbols: The file " << filename 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << " doesn't contain the requested symbols"; 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 141