symbol-table.cpp revision a8a167d6883e4acee42619e0bbfd811984f6e94d
1// symbol-table.cc
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15//
16// \file
17// Classes to provide symbol-to-integer and integer-to-symbol mappings.
18
19#include "fst/lib/symbol-table.h"
20#include "fst/lib/util.h"
21
22#include <string.h>
23
// Defines the boolean flag `fst_compat_symbols` (default: true) together with
// its help text. The flag is only defined here; nothing in the visible part
// of this file reads it.
DEFINE_bool(fst_compat_symbols, true,
            "Require symbol tables to match when appropriate");
26
27namespace fst {
28
// Maximum line length, in chars, of one line of a textual symbols file.
// When used as the size of a line buffer this count includes the
// terminating NUL.
const int kLineLen = 8096;
31
// Identifies stream data as a symbol table (and its endianness).
// It is the first value emitted by SymbolTableImpl::Write and the first
// value checked by SymbolTableImpl::Read.
static const int32 kSymbolTableMagicNumber = 2125658996;
34
35SymbolTableImpl* SymbolTableImpl::ReadText(const string &filename) {
36  ifstream strm(filename.c_str());
37  if (!strm) {
38    LOG(ERROR) << "SymbolTable::ReadText: Can't open symbol file: "
39               << filename;
40    return 0;
41  }
42
43  SymbolTableImpl* impl = new SymbolTableImpl(filename);
44
45  int64 nline = 0;
46  char line[kLineLen];
47  while (strm.getline(line, kLineLen)) {
48    ++nline;
49    vector<char *> col;
50    SplitToVector(line, "\n\t ", &col, true);
51    if (col.size() == 0)  // empty line
52      continue;
53    if (col.size() != 2) {
54      LOG(ERROR) << "SymbolTable::ReadText: Bad number of columns (skipping), "
55                 << "file = " << filename << ", line = " << nline;
56      continue;
57    }
58    const char *symbol = col[0];
59    const char *value = col[1];
60    char *p;
61    int64 key = strtoll(value, &p, 10);
62    if (p < value + strlen(value) || key < 0) {
63      LOG(ERROR) << "SymbolTable::ReadText: Bad non-negative integer \""
64                 << value << "\" (skipping), "
65                 << "file = " << filename << ", line = " << nline;
66      continue;
67    }
68    impl->AddSymbol(symbol, key);
69  }
70
71  return impl;
72}
73
74void SymbolTableImpl::RecomputeCheckSum() const {
75  check_sum_.Reset();
76  for (size_t i = 0; i < symbols_.size(); ++i) {
77    check_sum_.Update(symbols_[i], strlen(symbols_[i])+1);
78  }
79  check_sum_finalized_ = true;
80}
81
82int64 SymbolTableImpl::AddSymbol(const string& symbol, int64 key) {
83  std::unordered_map<string, int64>::const_iterator it =
84    symbol_map_.find(symbol);
85  if (it == symbol_map_.end()) {  // only add if not in table
86    check_sum_finalized_ = false;
87
88    char *csymbol = new char[symbol.size() + 1];
89    strcpy(csymbol, symbol.c_str());
90    symbols_.push_back(csymbol);
91    key_map_[key] = csymbol;
92    symbol_map_[csymbol] = key;
93
94    if (key >= available_key_) {
95      available_key_ = key + 1;
96    }
97  }
98
99  return key;
100}
101
102SymbolTableImpl* SymbolTableImpl::Read(istream &strm,
103                                       const string &source) {
104  int32 magic_number = 0;
105  ReadType(strm, &magic_number);
106  if (magic_number != kSymbolTableMagicNumber) {
107    LOG(ERROR) << "SymbolTable::Read: read failed";
108    return 0;
109  }
110  string name;
111  ReadType(strm, &name);
112  SymbolTableImpl* impl = new SymbolTableImpl(name);
113  ReadType(strm, &impl->available_key_);
114  int64 size;
115  ReadType(strm, &size);
116  string symbol;
117  int64 key = 0;
118  for (size_t i = 0; i < size; ++i) {
119    ReadType(strm, &symbol);
120    ReadType(strm, &key);
121    impl->AddSymbol(symbol, key);
122  }
123  if (!strm)
124    LOG(ERROR) << "SymbolTable::Read: read failed";
125  return impl;
126}
127
128bool SymbolTableImpl::Write(ostream &strm) const {
129  WriteType(strm, kSymbolTableMagicNumber);
130  WriteType(strm, name_);
131  WriteType(strm, available_key_);
132  int64 size = symbols_.size();
133  WriteType(strm, size);
134  for (size_t i = 0; i < symbols_.size(); ++i) {
135    const string symbol = symbols_[i];
136    WriteType(strm, symbol);
137    std::unordered_map<string, int64>::const_iterator it = symbol_map_.find(symbol);
138    WriteType(strm, it->second);
139  }
140  strm.flush();
141  if (!strm)
142    LOG(ERROR) << "SymbolTable::Write: write failed";
143  return strm;
144}
145
146bool SymbolTableImpl::WriteText(ostream &strm) const {
147  for (size_t i = 0; i < symbols_.size(); ++i) {
148    char line[kLineLen];
149    snprintf(line, kLineLen, "%s\t%lld\n", symbols_[i], Find(symbols_[i]));
150    strm.write(line, strlen(line));
151  }
152  strm.flush();
153  if (!strm)
154    LOG(ERROR) << "SymbolTable::WriteText: write failed";
155  return strm;
156}
157
158}  // namespace fst
159