1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// printstrings-main.h
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author: allauzen@google.com (Cyril Allauzen)
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Modified by: jpr@google.com (Jake Ratkiewicz)
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \file
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Output as strings the string FSTs in a finite-state archive.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_FAR_PRINT_STRINGS_H__
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_FAR_PRINT_STRINGS_H__
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string>
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/extensions/far/far.h>
30dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/shortest-distance.h>
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/string.h>
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
33dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander GutkinDECLARE_string(far_field_separator);
34dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Arc>
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid FarPrintStrings(
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    const vector<string> &ifilenames, const FarEntryType entry_type,
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    const FarTokenType far_token_type, const string &begin_key,
41dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    const string &end_key, const bool print_key, const bool print_weight,
42dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    const string &symbols_fname, const bool initial_symbols,
43dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    const int32 generate_filenames,
44dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    const string &filename_prefix, const string &filename_suffix) {
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typename StringPrinter<Arc>::TokenType token_type;
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (far_token_type == FTT_SYMBOL) {
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    token_type = StringPrinter<Arc>::SYMBOL;
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (far_token_type == FTT_BYTE) {
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    token_type = StringPrinter<Arc>::BYTE;
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (far_token_type == FTT_UTF8) {
52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    token_type = StringPrinter<Arc>::UTF8;
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    FSTERROR() << "FarPrintStrings: unknown token type";
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return;
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTable *syms = 0;
59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!symbols_fname.empty()) {
60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    // allow negative flag?
61dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    SymbolTableTextOptions opts;
62dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    opts.allow_negative = true;
63dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    syms = SymbolTable::ReadText(symbols_fname, opts);
64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!syms) {
65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      FSTERROR() << "FarPrintStrings: error reading symbol table: "
66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                 << symbols_fname;
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return;
68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  FarReader<Arc> *far_reader = FarReader<Arc>::Open(ifilenames);
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!far_reader) return;
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!begin_key.empty())
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    far_reader->Find(begin_key);
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string okey;
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int nrep = 0;
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) {
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    string key = far_reader->GetKey();
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!end_key.empty() && end_key < key)
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      break;
83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (okey == key)
84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ++nrep;
85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    else
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      nrep = 0;
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    okey = key;
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    const Fst<Arc> &fst = far_reader->GetFst();
90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (i == 1 && initial_symbols && syms == 0 && fst.InputSymbols() != 0)
91dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      syms = fst.InputSymbols()->Copy();
92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    string str;
93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    VLOG(2) << "Handling key: " << key;
94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    StringPrinter<Arc> string_printer(
95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        token_type, syms ? syms : fst.InputSymbols());
96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    string_printer(fst, &str);
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (entry_type == FET_LINE) {
99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (print_key)
100dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        cout << key << FLAGS_far_field_separator[0];
101dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      cout << str;
102dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      if (print_weight)
103dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        cout << FLAGS_far_field_separator[0] << ShortestDistance(fst);
104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      cout << endl;
105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else if (entry_type == FET_FILE) {
106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      stringstream sstrm;
107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (generate_filenames) {
108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        sstrm.fill('0');
109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        sstrm << std::right << setw(generate_filenames) << i;
110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      } else {
111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        sstrm << key;
112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        if (nrep > 0)
113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          sstrm << "." << nrep;
114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      string filename;
117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      filename = filename_prefix +  sstrm.str() + filename_suffix;
118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ofstream ostrm(filename.c_str());
120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (!ostrm) {
121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        FSTERROR() << "FarPrintStrings: Can't open file:" << filename;
122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        delete syms;
123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        delete far_reader;
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return;
125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ostrm << str;
127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (token_type == StringPrinter<Arc>::SYMBOL)
128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        ostrm << "\n";
129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
131dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  delete syms;
132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_EXTENSIONS_FAR_PRINT_STRINGS_H__
139