// printstrings-main.h

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: allauzen@google.com (Cyril Allauzen)
// Modified by: jpr@google.com (Jake Ratkiewicz)
//
// \file
// Output as strings the string FSTs in a finite-state archive.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_FAR_PRINT_STRINGS_H__ 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_FAR_PRINT_STRINGS_H__ 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string> 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector> 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector; 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/extensions/far/far.h> 30dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/shortest-distance.h> 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/string.h> 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 33dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander GutkinDECLARE_string(far_field_separator); 34dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Arc> 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid FarPrintStrings( 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const vector<string> &ifilenames, const FarEntryType entry_type, 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const FarTokenType far_token_type, const string &begin_key, 41dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const string &end_key, const bool print_key, const bool print_weight, 42dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const string &symbols_fname, const bool initial_symbols, 43dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const int32 generate_filenames, 44dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin const string &filename_prefix, const string &filename_suffix) { 
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typename StringPrinter<Arc>::TokenType token_type; 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (far_token_type == FTT_SYMBOL) { 48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson token_type = StringPrinter<Arc>::SYMBOL; 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (far_token_type == FTT_BYTE) { 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson token_type = StringPrinter<Arc>::BYTE; 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (far_token_type == FTT_UTF8) { 52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson token_type = StringPrinter<Arc>::UTF8; 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarPrintStrings: unknown token type"; 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable *syms = 0; 59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!symbols_fname.empty()) { 60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // allow negative flag? 
61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin SymbolTableTextOptions opts; 62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin opts.allow_negative = true; 63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin syms = SymbolTable::ReadText(symbols_fname, opts); 64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!syms) { 65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarPrintStrings: error reading symbol table: " 66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << symbols_fname; 67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FarReader<Arc> *far_reader = FarReader<Arc>::Open(ifilenames); 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!far_reader) return; 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!begin_key.empty()) 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson far_reader->Find(begin_key); 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string okey; 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int nrep = 0; 79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) { 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string key = far_reader->GetKey(); 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!end_key.empty() && end_key < key) 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson break; 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (okey == key) 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ++nrep; 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson else 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson nrep = 0; 
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson okey = key; 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const Fst<Arc> &fst = far_reader->GetFst(); 90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (i == 1 && initial_symbols && syms == 0 && fst.InputSymbols() != 0) 91dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin syms = fst.InputSymbols()->Copy(); 92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string str; 93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(2) << "Handling key: " << key; 94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin StringPrinter<Arc> string_printer( 95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin token_type, syms ? syms : fst.InputSymbols()); 96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string_printer(fst, &str); 97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (entry_type == FET_LINE) { 99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (print_key) 100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin cout << key << FLAGS_far_field_separator[0]; 101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin cout << str; 102dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin if (print_weight) 103dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin cout << FLAGS_far_field_separator[0] << ShortestDistance(fst); 104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin cout << endl; 105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (entry_type == FET_FILE) { 106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson stringstream sstrm; 107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (generate_filenames) { 108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson sstrm.fill('0'); 109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson sstrm << std::right << setw(generate_filenames) << i; 
110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson sstrm << key; 112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (nrep > 0) 113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson sstrm << "." << nrep; 114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string filename; 117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson filename = filename_prefix + sstrm.str() + filename_suffix; 118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ofstream ostrm(filename.c_str()); 120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!ostrm) { 121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FSTERROR() << "FarPrintStrings: Can't open file:" << filename; 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete syms; 123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete far_reader; 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ostrm << str; 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (token_type == StringPrinter<Arc>::SYMBOL) 128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ostrm << "\n"; 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 131dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin delete syms; 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // 
FST_EXTENSIONS_FAR_PRINT_STRINGS_H__ 139