stlist.h revision 5b6dc79427b8f7eeb6a7ff68034ab8548ce670ea
1
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6//     http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
14// Copyright 2005-2010 Google, Inc.
15// Author: allauzen@google.com (Cyril Allauzen)
16//
17// \file
18// A generic (string,type) list file format.
19//
20// This is a stripped-down version of STTable that does
21// not support the Find() operation but that does support
// reading/writing from standard in/out.
23
24#ifndef FST_EXTENSIONS_FAR_STLIST_H_
25#define FST_EXTENSIONS_FAR_STLIST_H_
26
27#include <iostream>
28#include <fstream>
29#include <sstream>
30#include <fst/util.h>
31
32#include <algorithm>
33#include <functional>
34#include <queue>
35#include <string>
36#include <utility>
37using std::pair; using std::make_pair;
38#include <vector>
39using std::vector;
40
41namespace fst {
42
// Magic number written at the very start of every STList stream; readers
// compare the first value they read against it to recognize the file type.
static const int32 kSTListMagicNumber = 5656924;
// Format version written right after the magic number; readers reject any
// stream whose version differs from this value.
static const int32 kSTListFileVersion = 1;
45
// String-type list writing class for objects of type 'T' using functor 'W'
// to write an object of type 'T' to a stream. 'W' must conform to the
48// following interface:
49//
50//   struct Writer {
51//     void operator()(ostream &, const T &) const;
52//   };
53//
54template <class T, class W>
55class STListWriter {
56 public:
57  typedef T EntryType;
58  typedef W EntryWriter;
59
60  explicit STListWriter(const string filename)
61      : stream_(
62          filename.empty() ? &cout :
63          new ofstream(filename.c_str(), ofstream::out | ofstream::binary)),
64        error_(false) {
65    WriteType(*stream_, kSTListMagicNumber);
66    WriteType(*stream_, kSTListFileVersion);
67    if (!stream_) {
68      FSTERROR() << "STListWriter::STListWriter: error writing to file: "
69                 << filename;
70      error_ = true;
71    }
72  }
73
74  static STListWriter<T, W> *Create(const string &filename) {
75    return new STListWriter<T, W>(filename);
76  }
77
78  void Add(const string &key, const T &t) {
79    if (key == "") {
80      FSTERROR() << "STListWriter::Add: key empty: " << key;
81      error_ = true;
82    } else if (key < last_key_) {
83      FSTERROR() << "STListWriter::Add: key disorder: " << key;
84      error_ = true;
85    }
86    if (error_) return;
87    last_key_ = key;
88    WriteType(*stream_, key);
89    entry_writer_(*stream_, t);
90  }
91
92  bool Error() const { return error_; }
93
94  ~STListWriter() {
95    WriteType(*stream_, string());
96    if (stream_ != &cout)
97      delete stream_;
98  }
99
100 private:
101  EntryWriter entry_writer_;  // Write functor for 'EntryType'
102  ostream *stream_;           // Output stream
103  string last_key_;           // Last key
104  bool error_;
105
106  DISALLOW_COPY_AND_ASSIGN(STListWriter);
107};
108
109
// String-type list reading class for objects of type 'T' using functor 'R'
// to read an object of type 'T' from a stream. 'R' must conform to the
112// following interface:
113//
114//   struct Reader {
115//     T *operator()(istream &) const;
116//   };
117//
118template <class T, class R>
119class STListReader {
120 public:
121  typedef T EntryType;
122  typedef R EntryReader;
123
124  explicit STListReader(const vector<string> &filenames)
125      : sources_(filenames), entry_(0), error_(false) {
126    streams_.resize(filenames.size(), 0);
127    bool has_stdin = false;
128    for (size_t i = 0; i < filenames.size(); ++i) {
129      if (filenames[i].empty()) {
130        if (!has_stdin) {
131          streams_[i] = &cin;
132          sources_[i] = "stdin";
133          has_stdin = true;
134        } else {
135          FSTERROR() << "STListReader::STListReader: stdin should only "
136                     << "appear once in the input file list.";
137          error_ = true;
138          return;
139        }
140      } else {
141        streams_[i] = new ifstream(
142            filenames[i].c_str(), ifstream::in | ifstream::binary);
143      }
144      int32 magic_number = 0, file_version = 0;
145      ReadType(*streams_[i], &magic_number);
146      ReadType(*streams_[i], &file_version);
147      if (magic_number != kSTListMagicNumber) {
148        FSTERROR() << "STListReader::STListReader: wrong file type: "
149                   << filenames[i];
150        error_ = true;
151        return;
152      }
153      if (file_version != kSTListFileVersion) {
154        FSTERROR() << "STListReader::STListReader: wrong file version: "
155                   << filenames[i];
156        error_ = true;
157        return;
158      }
159      string key;
160      ReadType(*streams_[i], &key);
161      if (!key.empty())
162        heap_.push(make_pair(key, i));
163      if (!*streams_[i]) {
164        FSTERROR() << "STListReader: error reading file: " << sources_[i];
165        error_ = true;
166        return;
167      }
168    }
169    if (heap_.empty()) return;
170    size_t current = heap_.top().second;
171    entry_ = entry_reader_(*streams_[current]);
172    if (!entry_ || !*streams_[current]) {
173      FSTERROR() << "STListReader: error reading entry for key: "
174                 << heap_.top().first << ", file: " << sources_[current];
175      error_ = true;
176    }
177  }
178
179  ~STListReader() {
180    for (size_t i = 0; i < streams_.size(); ++i) {
181      if (streams_[i] != &cin)
182        delete streams_[i];
183    }
184    if (entry_)
185      delete entry_;
186  }
187
188  static STListReader<T, R> *Open(const string &filename) {
189    vector<string> filenames;
190    filenames.push_back(filename);
191    return new STListReader<T, R>(filenames);
192  }
193
194  static STListReader<T, R> *Open(const vector<string> &filenames) {
195    return new STListReader<T, R>(filenames);
196  }
197
198  void Reset() {
199    FSTERROR()
200        << "STListReader::Reset: stlist does not support reset operation";
201    error_ = true;
202  }
203
204  bool Find(const string &key) {
205    FSTERROR()
206        << "STListReader::Find: stlist does not support find operation";
207    error_ = true;
208    return false;
209  }
210
211  bool Done() const {
212    return error_ || heap_.empty();
213  }
214
215  void Next() {
216    if (error_) return;
217    size_t current = heap_.top().second;
218    string key;
219    heap_.pop();
220    ReadType(*(streams_[current]), &key);
221    if (!*streams_[current]) {
222      FSTERROR() << "STListReader: error reading file: "
223                 << sources_[current];
224      error_ = true;
225      return;
226    }
227    if (!key.empty())
228      heap_.push(make_pair(key, current));
229
230    if(!heap_.empty()) {
231      current = heap_.top().second;
232      if (entry_)
233        delete entry_;
234      entry_ = entry_reader_(*streams_[current]);
235      if (!entry_ || !*streams_[current]) {
236        FSTERROR() << "STListReader: error reading entry for key: "
237                   << heap_.top().first << ", file: " << sources_[current];
238        error_ = true;
239      }
240    }
241  }
242
243  const string &GetKey() const {
244    return heap_.top().first;
245  }
246
247  const EntryType &GetEntry() const {
248    return *entry_;
249  }
250
251  bool Error() const { return error_; }
252
253 private:
254  EntryReader entry_reader_;   // Read functor for 'EntryType'
255  vector<istream*> streams_;   // Input streams
256  vector<string> sources_;     // and corresponding file names
257  priority_queue<
258    pair<string, size_t>, vector<pair<string, size_t> >,
259    greater<pair<string, size_t> > > heap_;  // (Key, stream id) heap
260  mutable EntryType *entry_;   // Pointer to the currently read entry
261  bool error_;
262
263  DISALLOW_COPY_AND_ASSIGN(STListReader);
264};
265
266
267// String-type list header reading function template on the entry header
268// type 'H' having a member function:
269//   Read(istream &strm, const string &filename);
// Checks that 'filename' is an STList and calls H::Read() on the first
// entry in the STList.
272// Does not support reading from stdin.
273template <class H>
274bool ReadSTListHeader(const string &filename, H *header) {
275  if (filename.empty()) {
276    LOG(ERROR) << "ReadSTListHeader: reading header not supported on stdin";
277    return false;
278  }
279  ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
280  int32 magic_number = 0, file_version = 0;
281  ReadType(strm, &magic_number);
282  ReadType(strm, &file_version);
283  if (magic_number != kSTListMagicNumber) {
284    LOG(ERROR) << "ReadSTListHeader: wrong file type: " << filename;
285    return false;
286  }
287  if (file_version != kSTListFileVersion) {
288    LOG(ERROR) << "ReadSTListHeader: wrong file version: " << filename;
289    return false;
290  }
291  string key;
292  ReadType(strm, &key);
293  header->Read(strm, filename + ":" + key);
294  if (!strm) {
295    LOG(ERROR) << "ReadSTListHeader: error reading file: " << filename;
296    return false;
297  }
298  return true;
299}
300
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably reports whether 'filename' names a file in the
// STList format -- confirm against the definition.
bool IsSTList(const string &filename);
302
303}  // namespace fst
304
305#endif  // FST_EXTENSIONS_FAR_STLIST_H_
306