1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Authors: allauzen@google.com (Cyril Allauzen)
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//          ttai@google.com (Terry Tai)
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//          jpr@google.com (Jake Ratkiewicz)
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <libgen.h>
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string>
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/extensions/far/far.h>
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/string.h>
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Construct a reader that provides FSTs from a file (stream) either on a
34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// line-by-line basis or on a per-stream basis.  Note that the freshly
35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// constructed reader is already set to the first input.
36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Sample Usage:
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) {
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     Fst *fst = reader.GetVectorFst();
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//   }
41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class A>
42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass StringReader {
43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef A Arc;
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::Label Label;
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::Weight Weight;
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename StringCompiler<A>::TokenType TokenType;
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  enum EntryType { LINE = 1, FILE = 2 };
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  StringReader(istream &istrm,
52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               const string &source,
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               EntryType entry_type,
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               TokenType token_type,
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               bool allow_negative_labels,
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               const SymbolTable *syms = 0,
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               Label unknown_label = kNoStateId)
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type),
59dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        token_type_(token_type), symbols_(syms), done_(false),
60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        compiler_(token_type, syms, unknown_label, allow_negative_labels) {
61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    Next();  // Initialize the reader to the first input.
62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool Done() {
65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return done_;
66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  void Next() {
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    VLOG(1) << "Processing source " << source_ << " at line " << nline_;
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm_) {                    // We're done if we have no more input.
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      done_ = true;
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return;
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (entry_type_ == LINE) {
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      getline(strm_, content_);
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ++nline_;
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      content_.clear();
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      string line;
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      while (getline(strm_, line)) {
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        ++nline_;
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        content_.append(line);
83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        content_.append("\n");
84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!strm_ && content_.empty())  // We're also done if we read off all the
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      done_ = true;                  // whitespace at the end of a file.
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  VectorFst<A> *GetVectorFst(bool keep_symbols = false) {
91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    VectorFst<A> *fst = new VectorFst<A>;
92dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (keep_symbols) {
93dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      fst->SetInputSymbols(symbols_);
94dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      fst->SetOutputSymbols(symbols_);
95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    }
96f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (compiler_(content_, fst)) {
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return fst;
98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      delete fst;
100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return NULL;
101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
104dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  CompactFst<A, StringCompactor<A> > *GetCompactFst(bool keep_symbols = false) {
105dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    CompactFst<A, StringCompactor<A> > *fst;
106dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (keep_symbols) {
107dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      VectorFst<A> tmp;
108dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      tmp.SetInputSymbols(symbols_);
109dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      tmp.SetOutputSymbols(symbols_);
110dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      fst = new CompactFst<A, StringCompactor<A> >(tmp);
111dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    } else {
112dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      fst = new CompactFst<A, StringCompactor<A> >;
113dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    }
114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (compiler_(content_, fst)) {
115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return fst;
116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      delete fst;
118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return NULL;
119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
121f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  size_t nline_;
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  istream &strm_;
125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string source_;
126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  EntryType entry_type_;
127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  TokenType token_type_;
128dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  const SymbolTable *symbols_;
129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool done_;
130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  StringCompiler<A> compiler_;
131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string content_;  // The actual content of the input stream's next FST.
132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  DISALLOW_COPY_AND_ASSIGN(StringReader);
134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Compute the minimal length required to encode each line number as a decimal
137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// number.
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonint KeySize(const char *filename);
139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class Arc>
141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid FarCompileStrings(const vector<string> &in_fnames,
142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const string &out_fname,
143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const string &fst_type,
144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const FarType &far_type,
145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       int32 generate_keys,
146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       FarEntryType fet,
147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       FarTokenType tt,
148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const string &symbols_fname,
149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const string &unknown_symbol,
150dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                       bool keep_symbols,
151dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                       bool initial_symbols,
152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       bool allow_negative_labels,
153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       bool file_list_input,
154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const string &key_prefix,
155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       const string &key_suffix) {
156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typename StringReader<Arc>::EntryType entry_type;
157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (fet == FET_LINE) {
158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    entry_type = StringReader<Arc>::LINE;
159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (fet == FET_FILE) {
160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    entry_type = StringReader<Arc>::FILE;
161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    FSTERROR() << "FarCompileStrings: unknown entry type";
163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return;
164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typename StringCompiler<Arc>::TokenType token_type;
167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (tt == FTT_SYMBOL) {
168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    token_type = StringCompiler<Arc>::SYMBOL;
169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (tt == FTT_BYTE) {
170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    token_type = StringCompiler<Arc>::BYTE;
171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (tt == FTT_UTF8) {
172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    token_type = StringCompiler<Arc>::UTF8;
173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    FSTERROR() << "FarCompileStrings: unknown token type";
175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return;
176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool compact;
179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (fst_type.empty() || (fst_type == "vector")) {
180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    compact = false;
181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else if (fst_type == "compact") {
182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    compact = true;
183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    FSTERROR() << "FarCompileStrings: unknown fst type: "
185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson               << fst_type;
186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return;
187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTable *syms = 0;
190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typename Arc::Label unknown_label = kNoLabel;
191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!symbols_fname.empty()) {
192dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    SymbolTableTextOptions opts;
193dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    opts.allow_negative = allow_negative_labels;
194dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    syms = SymbolTable::ReadText(symbols_fname, opts);
195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!syms) {
196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      FSTERROR() << "FarCompileStrings: error reading symbol table: "
197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                 << symbols_fname;
198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return;
199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!unknown_symbol.empty()) {
201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      unknown_label = syms->Find(unknown_symbol);
202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (unknown_label == kNoLabel) {
203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_label
204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                   << "\" missing from symbol table: " << symbols_fname;
205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return;
206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  FarWriter<Arc> *far_writer =
211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      FarWriter<Arc>::Create(out_fname, far_type);
212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!far_writer) return;
213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  vector<string> inputs;
215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (file_list_input) {
216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (int i = 1; i < in_fnames.size(); ++i) {
217dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      istream *istrm = in_fnames.empty() ? &cin :
218dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin          new ifstream(in_fnames[i].c_str());
219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      string str;
220dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      while (getline(*istrm, str))
221f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        inputs.push_back(str);
222dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      if (!in_fnames.empty())
223dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        delete istrm;
224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  } else {
226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    inputs = in_fnames;
227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  for (int i = 0, n = 0; i < inputs.size(); ++i) {
230dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (generate_keys == 0 && inputs[i].empty()) {
231dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      FSTERROR() << "FarCompileStrings: read from a file instead of stdin or"
232dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin                 << " set the --generate_keys flags.";
233dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      delete far_writer;
234dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      delete syms;
235dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      return;
236dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    }
237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    int key_size = generate_keys ? generate_keys :
238f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        (entry_type == StringReader<Arc>::FILE ? 1 :
239f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson         KeySize(inputs[i].c_str()));
240dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    istream *istrm = inputs[i].empty() ? &cin :
241dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        new ifstream(inputs[i].c_str());
242f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
243dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    bool keep_syms = keep_symbols;
244f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (StringReader<Arc> reader(
245dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin             *istrm, inputs[i].empty() ? "stdin" : inputs[i],
246dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin             entry_type, token_type, allow_negative_labels,
247dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin             syms, unknown_label);
248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson         !reader.Done();
249f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson         reader.Next()) {
250f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ++n;
251f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      const Fst<Arc> *fst;
252f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (compact)
253dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        fst = reader.GetCompactFst(keep_syms);
254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      else
255dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        fst = reader.GetVectorFst(keep_syms);
256dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      if (initial_symbols)
257dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        keep_syms = false;
258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (!fst) {
259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        FSTERROR() << "FarCompileStrings: compiling string number " << n
260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                   << " in file " << inputs[i] << " failed with token_type = "
261f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                   << (tt == FTT_BYTE ? "byte" :
262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       (tt == FTT_UTF8 ? "utf8" :
263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                        (tt == FTT_SYMBOL ? "symbol" : "unknown")))
264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                   << " and entry_type = "
265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                   << (fet == FET_LINE ? "line" :
266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                       (fet == FET_FILE ? "file" : "unknown"));
267f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        delete far_writer;
268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        delete syms;
269dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin        if (!inputs[i].empty()) delete istrm;
270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return;
271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
272f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ostringstream keybuf;
273f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      keybuf.width(key_size);
274f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      keybuf.fill('0');
275f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      keybuf << n;
276f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      string key;
277f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (generate_keys > 0) {
278f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        key = keybuf.str();
279f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      } else {
280f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        char* filename = new char[inputs[i].size() + 1];
281f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        strcpy(filename, inputs[i].c_str());
282f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        key = basename(filename);
283f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        if (entry_type != StringReader<Arc>::FILE) {
284f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          key += "-";
285f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          key += keybuf.str();
286f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        }
287f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        delete[] filename;
288f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
289f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      far_writer->Add(key_prefix + key + key_suffix, *fst);
290f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      delete fst;
291f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
292f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (generate_keys == 0)
293f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      n = 0;
294dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (!inputs[i].empty())
295dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      delete istrm;
296f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
297f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
298f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  delete far_writer;
299f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
300f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
301f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
302f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
303f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
304f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
305