// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Authors: allauzen@google.com (Cyril Allauzen) 16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// ttai@google.com (Terry Tai) 17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// jpr@google.com (Jake Ratkiewicz) 18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ 21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <libgen.h> 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string> 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector> 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector; 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/extensions/far/far.h> 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/string.h> 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Construct a reader that provides FSTs from a file (stream) either on a 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// line-by-line basis or on a per-stream basis. Note that the freshly 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// constructed reader is already set to the first input. 
36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Sample Usage: 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) { 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Fst *fst = reader.GetVectorFst(); 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// } 41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class A> 42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass StringReader { 43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef A Arc; 45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::Label Label; 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::Weight Weight; 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename StringCompiler<A>::TokenType TokenType; 48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson enum EntryType { LINE = 1, FILE = 2 }; 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StringReader(istream &istrm, 52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &source, 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson EntryType entry_type, 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson TokenType token_type, 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative_labels, 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable *syms = 0, 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label unknown_label = kNoStateId) 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type), 59dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin token_type_(token_type), symbols_(syms), done_(false), 60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 
compiler_(token_type, syms, unknown_label, allow_negative_labels) { 61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Next(); // Initialize the reader to the first input. 62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool Done() { 65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return done_; 66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson void Next() { 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(1) << "Processing source " << source_ << " at line " << nline_; 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm_) { // We're done if we have no more input. 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson done_ = true; 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (entry_type_ == LINE) { 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson getline(strm_, content_); 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ++nline_; 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson content_.clear(); 79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string line; 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson while (getline(strm_, line)) { 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ++nline_; 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson content_.append(line); 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson content_.append("\n"); 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!strm_ && content_.empty()) // We're also done if we read off all the 
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson done_ = true; // whitespace at the end of a file. 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin VectorFst<A> *GetVectorFst(bool keep_symbols = false) { 91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VectorFst<A> *fst = new VectorFst<A>; 92dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (keep_symbols) { 93dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst->SetInputSymbols(symbols_); 94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst->SetOutputSymbols(symbols_); 95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (compiler_(content_, fst)) { 97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return fst; 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete fst; 100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin CompactFst<A, StringCompactor<A> > *GetCompactFst(bool keep_symbols = false) { 105dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin CompactFst<A, StringCompactor<A> > *fst; 106dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (keep_symbols) { 107dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin VectorFst<A> tmp; 108dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin tmp.SetInputSymbols(symbols_); 109dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin tmp.SetOutputSymbols(symbols_); 110dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst = new CompactFst<A, StringCompactor<A> >(tmp); 111dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander 
Gutkin } else { 112dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst = new CompactFst<A, StringCompactor<A> >; 113dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin } 114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (compiler_(content_, fst)) { 115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return fst; 116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete fst; 118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return NULL; 119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson size_t nline_; 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson istream &strm_; 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string source_; 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson EntryType entry_type_; 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson TokenType token_type_; 128dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const SymbolTable *symbols_; 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool done_; 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StringCompiler<A> compiler_; 131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string content_; // The actual content of the input stream's next FST. 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson DISALLOW_COPY_AND_ASSIGN(StringReader); 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Compute the minimal length required to encode each line number as a decimal 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// number. 
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonint KeySize(const char *filename); 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Arc> 141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid FarCompileStrings(const vector<string> &in_fnames, 142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &out_fname, 143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &fst_type, 144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const FarType &far_type, 145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int32 generate_keys, 146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FarEntryType fet, 147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FarTokenType tt, 148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &symbols_fname, 149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &unknown_symbol, 150dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool keep_symbols, 151dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool initial_symbols, 152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative_labels, 153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool file_list_input, 154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &key_prefix, 155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &key_suffix) { 156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename StringReader<Arc>::EntryType entry_type; 157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (fet == FET_LINE) { 158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson entry_type = StringReader<Arc>::LINE; 159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (fet == FET_FILE) { 160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson entry_type = StringReader<Arc>::FILE; 161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 
FSTERROR() << "FarCompileStrings: unknown entry type"; 163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename StringCompiler<Arc>::TokenType token_type; 167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (tt == FTT_SYMBOL) { 168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson token_type = StringCompiler<Arc>::SYMBOL; 169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (tt == FTT_BYTE) { 170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson token_type = StringCompiler<Arc>::BYTE; 171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (tt == FTT_UTF8) { 172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson token_type = StringCompiler<Arc>::UTF8; 173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarCompileStrings: unknown token type"; 175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool compact; 179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (fst_type.empty() || (fst_type == "vector")) { 180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson compact = false; 181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (fst_type == "compact") { 182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson compact = true; 183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarCompileStrings: unknown fst type: " 185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << fst_type; 186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 
188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable *syms = 0; 190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename Arc::Label unknown_label = kNoLabel; 191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!symbols_fname.empty()) { 192dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SymbolTableTextOptions opts; 193dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin opts.allow_negative = allow_negative_labels; 194dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin syms = SymbolTable::ReadText(symbols_fname, opts); 195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!syms) { 196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarCompileStrings: error reading symbol table: " 197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << symbols_fname; 198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!unknown_symbol.empty()) { 201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson unknown_label = syms->Find(unknown_symbol); 202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (unknown_label == kNoLabel) { 203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_label 204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "\" missing from symbol table: " << symbols_fname; 205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FarWriter<Arc> *far_writer = 211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FarWriter<Arc>::Create(out_fname, far_type); 
212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!far_writer) return; 213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<string> inputs; 215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (file_list_input) { 216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (int i = 1; i < in_fnames.size(); ++i) { 217dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin istream *istrm = in_fnames.empty() ? &cin : 218dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin new ifstream(in_fnames[i].c_str()); 219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string str; 220dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin while (getline(*istrm, str)) 221f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson inputs.push_back(str); 222dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!in_fnames.empty()) 223dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete istrm; 224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson inputs = in_fnames; 227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (int i = 0, n = 0; i < inputs.size(); ++i) { 230dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (generate_keys == 0 && inputs[i].empty()) { 231dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin FSTERROR() << "FarCompileStrings: read from a file instead of stdin or" 232dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin << " set the --generate_keys flags."; 233dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete far_writer; 234dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete syms; 235dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin return; 236dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 
} 237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int key_size = generate_keys ? generate_keys : 238f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson (entry_type == StringReader<Arc>::FILE ? 1 : 239f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson KeySize(inputs[i].c_str())); 240dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin istream *istrm = inputs[i].empty() ? &cin : 241dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin new ifstream(inputs[i].c_str()); 242f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 243dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool keep_syms = keep_symbols; 244f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (StringReader<Arc> reader( 245dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin *istrm, inputs[i].empty() ? "stdin" : inputs[i], 246dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin entry_type, token_type, allow_negative_labels, 247dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin syms, unknown_label); 248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson !reader.Done(); 249f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson reader.Next()) { 250f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ++n; 251f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const Fst<Arc> *fst; 252f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (compact) 253dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst = reader.GetCompactFst(keep_syms); 254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else 255dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin fst = reader.GetVectorFst(keep_syms); 256dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (initial_symbols) 257dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin keep_syms = false; 258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!fst) { 259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarCompileStrings: compiling string number " << n 260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian 
Hodson << " in file " << inputs[i] << " failed with token_type = " 261f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << (tt == FTT_BYTE ? "byte" : 262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson (tt == FTT_UTF8 ? "utf8" : 263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson (tt == FTT_SYMBOL ? "symbol" : "unknown"))) 264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << " and entry_type = " 265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << (fet == FET_LINE ? "line" : 266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson (fet == FET_FILE ? "file" : "unknown")); 267f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete far_writer; 268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete syms; 269dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!inputs[i].empty()) delete istrm; 270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 272f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ostringstream keybuf; 273f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson keybuf.width(key_size); 274f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson keybuf.fill('0'); 275f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson keybuf << n; 276f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string key; 277f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (generate_keys > 0) { 278f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key = keybuf.str(); 279f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 280f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson char* filename = new char[inputs[i].size() + 1]; 281f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson strcpy(filename, inputs[i].c_str()); 282f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key = basename(filename); 283f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (entry_type != StringReader<Arc>::FILE) { 284f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key += "-"; 
285f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson key += keybuf.str(); 286f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 287f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete[] filename; 288f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 289f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson far_writer->Add(key_prefix + key + key_suffix, *fst); 290f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete fst; 291f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 292f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (generate_keys == 0) 293f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson n = 0; 294dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (!inputs[i].empty()) 295dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete istrm; 296f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 297f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 298f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete far_writer; 299f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 300f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 301f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 302f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 303f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 304f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ 305