// compile-strings.h, revision f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2
1 2// Licensed under the Apache License, Version 2.0 (the "License"); 3// you may not use this file except in compliance with the License. 4// You may obtain a copy of the License at 5// 6// http://www.apache.org/licenses/LICENSE-2.0 7// 8// Unless required by applicable law or agreed to in writing, software 9// distributed under the License is distributed on an "AS IS" BASIS, 10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11// See the License for the specific language governing permissions and 12// limitations under the License. 13// 14// Copyright 2005-2010 Google, Inc. 15// Authors: allauzen@google.com (Cyril Allauzen) 16// ttai@google.com (Terry Tai) 17// jpr@google.com (Jake Ratkiewicz) 18 19 20#ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ 21#define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ 22 23#include <libgen.h> 24#include <string> 25#include <vector> 26using std::vector; 27 28#include <fst/extensions/far/far.h> 29#include <fst/string.h> 30 31namespace fst { 32 33// Construct a reader that provides FSTs from a file (stream) either on a 34// line-by-line basis or on a per-stream basis. Note that the freshly 35// constructed reader is already set to the first input. 
36// 37// Sample Usage: 38// for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) { 39// Fst *fst = reader.GetVectorFst(); 40// } 41template <class A> 42class StringReader { 43 public: 44 typedef A Arc; 45 typedef typename A::Label Label; 46 typedef typename A::Weight Weight; 47 typedef typename StringCompiler<A>::TokenType TokenType; 48 49 enum EntryType { LINE = 1, FILE = 2 }; 50 51 StringReader(istream &istrm, 52 const string &source, 53 EntryType entry_type, 54 TokenType token_type, 55 bool allow_negative_labels, 56 const SymbolTable *syms = 0, 57 Label unknown_label = kNoStateId) 58 : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type), 59 token_type_(token_type), done_(false), 60 compiler_(token_type, syms, unknown_label, allow_negative_labels) { 61 Next(); // Initialize the reader to the first input. 62 } 63 64 bool Done() { 65 return done_; 66 } 67 68 void Next() { 69 VLOG(1) << "Processing source " << source_ << " at line " << nline_; 70 if (!strm_) { // We're done if we have no more input. 71 done_ = true; 72 return; 73 } 74 if (entry_type_ == LINE) { 75 getline(strm_, content_); 76 ++nline_; 77 } else { 78 content_.clear(); 79 string line; 80 while (getline(strm_, line)) { 81 ++nline_; 82 content_.append(line); 83 content_.append("\n"); 84 } 85 } 86 if (!strm_ && content_.empty()) // We're also done if we read off all the 87 done_ = true; // whitespace at the end of a file. 
88 } 89 90 VectorFst<A> *GetVectorFst() { 91 VectorFst<A> *fst = new VectorFst<A>; 92 if (compiler_(content_, fst)) { 93 return fst; 94 } else { 95 delete fst; 96 return NULL; 97 } 98 } 99 100 CompactFst<A, StringCompactor<A> > *GetCompactFst() { 101 CompactFst<A, StringCompactor<A> > *fst = 102 new CompactFst<A, StringCompactor<A> >; 103 if (compiler_(content_, fst)) { 104 return fst; 105 } else { 106 delete fst; 107 return NULL; 108 } 109 } 110 111 private: 112 size_t nline_; 113 istream &strm_; 114 string source_; 115 EntryType entry_type_; 116 TokenType token_type_; 117 bool done_; 118 StringCompiler<A> compiler_; 119 string content_; // The actual content of the input stream's next FST. 120 121 DISALLOW_COPY_AND_ASSIGN(StringReader); 122}; 123 124// Compute the minimal length required to encode each line number as a decimal 125// number. 126int KeySize(const char *filename); 127 128template <class Arc> 129void FarCompileStrings(const vector<string> &in_fnames, 130 const string &out_fname, 131 const string &fst_type, 132 const FarType &far_type, 133 int32 generate_keys, 134 FarEntryType fet, 135 FarTokenType tt, 136 const string &symbols_fname, 137 const string &unknown_symbol, 138 bool allow_negative_labels, 139 bool file_list_input, 140 const string &key_prefix, 141 const string &key_suffix) { 142 typename StringReader<Arc>::EntryType entry_type; 143 if (fet == FET_LINE) { 144 entry_type = StringReader<Arc>::LINE; 145 } else if (fet == FET_FILE) { 146 entry_type = StringReader<Arc>::FILE; 147 } else { 148 FSTERROR() << "FarCompileStrings: unknown entry type"; 149 return; 150 } 151 152 typename StringCompiler<Arc>::TokenType token_type; 153 if (tt == FTT_SYMBOL) { 154 token_type = StringCompiler<Arc>::SYMBOL; 155 } else if (tt == FTT_BYTE) { 156 token_type = StringCompiler<Arc>::BYTE; 157 } else if (tt == FTT_UTF8) { 158 token_type = StringCompiler<Arc>::UTF8; 159 } else { 160 FSTERROR() << "FarCompileStrings: unknown token type"; 161 return; 162 } 163 164 
bool compact; 165 if (fst_type.empty() || (fst_type == "vector")) { 166 compact = false; 167 } else if (fst_type == "compact") { 168 compact = true; 169 } else { 170 FSTERROR() << "FarCompileStrings: unknown fst type: " 171 << fst_type; 172 return; 173 } 174 175 const SymbolTable *syms = 0; 176 typename Arc::Label unknown_label = kNoLabel; 177 if (!symbols_fname.empty()) { 178 syms = SymbolTable::ReadText(symbols_fname, 179 allow_negative_labels); 180 if (!syms) { 181 FSTERROR() << "FarCompileStrings: error reading symbol table: " 182 << symbols_fname; 183 return; 184 } 185 if (!unknown_symbol.empty()) { 186 unknown_label = syms->Find(unknown_symbol); 187 if (unknown_label == kNoLabel) { 188 FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_label 189 << "\" missing from symbol table: " << symbols_fname; 190 return; 191 } 192 } 193 } 194 195 FarWriter<Arc> *far_writer = 196 FarWriter<Arc>::Create(out_fname, far_type); 197 if (!far_writer) return; 198 199 vector<string> inputs; 200 if (file_list_input) { 201 for (int i = 1; i < in_fnames.size(); ++i) { 202 ifstream istrm(in_fnames[i].c_str()); 203 string str; 204 while (getline(istrm, str)) 205 inputs.push_back(str); 206 } 207 } else { 208 inputs = in_fnames; 209 } 210 211 for (int i = 0, n = 0; i < inputs.size(); ++i) { 212 int key_size = generate_keys ? generate_keys : 213 (entry_type == StringReader<Arc>::FILE ? 1 : 214 KeySize(inputs[i].c_str())); 215 ifstream istrm(inputs[i].c_str()); 216 217 for (StringReader<Arc> reader( 218 istrm, inputs[i], entry_type, token_type, 219 allow_negative_labels, syms, unknown_label); 220 !reader.Done(); 221 reader.Next()) { 222 ++n; 223 const Fst<Arc> *fst; 224 if (compact) 225 fst = reader.GetCompactFst(); 226 else 227 fst = reader.GetVectorFst(); 228 if (!fst) { 229 FSTERROR() << "FarCompileStrings: compiling string number " << n 230 << " in file " << inputs[i] << " failed with token_type = " 231 << (tt == FTT_BYTE ? "byte" : 232 (tt == FTT_UTF8 ? 
"utf8" : 233 (tt == FTT_SYMBOL ? "symbol" : "unknown"))) 234 << " and entry_type = " 235 << (fet == FET_LINE ? "line" : 236 (fet == FET_FILE ? "file" : "unknown")); 237 delete far_writer; 238 delete syms; 239 return; 240 } 241 ostringstream keybuf; 242 keybuf.width(key_size); 243 keybuf.fill('0'); 244 keybuf << n; 245 string key; 246 if (generate_keys > 0) { 247 key = keybuf.str(); 248 } else { 249 char* filename = new char[inputs[i].size() + 1]; 250 strcpy(filename, inputs[i].c_str()); 251 key = basename(filename); 252 if (entry_type != StringReader<Arc>::FILE) { 253 key += "-"; 254 key += keybuf.str(); 255 } 256 delete[] filename; 257 } 258 far_writer->Add(key_prefix + key + key_suffix, *fst); 259 delete fst; 260 } 261 if (generate_keys == 0) 262 n = 0; 263 } 264 265 delete far_writer; 266} 267 268} // namespace fst 269 270 271#endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ 272