util.h revision f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2
158537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// util.h
258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// Licensed under the Apache License, Version 2.0 (the "License");
458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// you may not use this file except in compliance with the License.
558537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// You may obtain a copy of the License at
658537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//
758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//     http://www.apache.org/licenses/LICENSE-2.0
858537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//
9116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch// Unless required by applicable law or agreed to in writing, software
1058537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// distributed under the License is distributed on an "AS IS" BASIS,
1158537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// See the License for the specific language governing permissions and
1358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// limitations under the License.
1458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//
1558537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// Copyright 2005-2010 Google, Inc.
1658537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// Author: riley@google.com (Michael Riley)
1758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//
18a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// \file
195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// FST utility inline definitions.
205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#ifndef FST_LIB_UTIL_H__
225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define FST_LIB_UTIL_H__
235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <unordered_map>
255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)using std::tr1::unordered_map;
26a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochusing std::tr1::unordered_multimap;
275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <unordered_set>
285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)using std::tr1::unordered_set;
2958537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)using std::tr1::unordered_multiset;
3058537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#include <list>
3158537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#include <map>
3258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#include <set>
3358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#include <sstream>
3458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#include <string>
355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <vector>
365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)using std::vector;
375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
39a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include <fst/compat.h>
405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <fst/types.h>
415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include <iostream>
4358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#include <fstream>
4458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
4558537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//
4658537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// UTILITY FOR ERROR HANDLING
4758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)//
4858537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
4958537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)DECLARE_bool(fst_error_fatal);
50a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch
515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define FSTERROR() (FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace fst {
545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// UTILITIES FOR TYPE I/O
575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
5858537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
59a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Read some types from an input stream.
60010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
61010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Generic case.
62010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)template <typename T>
6358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)inline istream &ReadType(istream &strm, T *t) {
6458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)  return t->Read(strm);
6558537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)}
6658537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
6758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// Fixed size, contiguous memory read.
6858537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)#define READ_POD_TYPE(T)                                    \
6958537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)inline istream &ReadType(istream &strm, T *t) {             \
7058537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)  return strm.read(reinterpret_cast<char *>(t), sizeof(T)); \
7158537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)}
7258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
7358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(bool);
7458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(char);
7558537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(signed char);
7658537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(unsigned char);
7758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(short);
7858537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(unsigned short);
7958537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(int);
8058537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(unsigned int);
8158537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(long);
8258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(unsigned long);
8358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(long long);
8458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(unsigned long long);
8558537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)READ_POD_TYPE(float);
86a02191e04bc25c4935f804f2c080ae28663d096dBen MurdochREAD_POD_TYPE(double);
8758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
88a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// String case.
8958537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)inline istream &ReadType(istream &strm, string *s) {
9058537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)  s->clear();
91a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch  int32 ns = 0;
9258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)  strm.read(reinterpret_cast<char *>(&ns), sizeof(ns));
935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (int i = 0; i < ns; ++i) {
945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    char c;
955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    strm.read(&c, 1);
965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    *s += c;
97116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  }
98116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  return strm;
99116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch}
100116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
101116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch// Pair case.
102116680a4aac90f2aa7413d9095a592090648e557Ben Murdochtemplate <typename S, typename T>
103010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)inline istream &ReadType(istream &strm, pair<S, T> *p) {
104010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  ReadType(strm, &p->first);
1056d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)  ReadType(strm, &p->second);
106116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  return strm;
10758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)}
10823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
1091320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tuccitemplate <typename S, typename T>
1101320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucciinline istream &ReadType(istream &strm, pair<const S, T> *p) {
1111320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  ReadType(strm, const_cast<S *>(&p->first));
1121320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  ReadType(strm, &p->second);
11323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  return strm;
11423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}
11523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
116116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch// General case - no-op.
117116680a4aac90f2aa7413d9095a592090648e557Ben Murdochtemplate <typename C>
118a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)void StlReserve(C *c, int64 n) {}
119a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
12058537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)// Specialization for vectors.
12158537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)template <typename S, typename T>
12258537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)void StlReserve(vector<S, T> *c, int64 n) {
123  c->reserve(n);
124}
125
126// STL sequence container.
127#define READ_STL_SEQ_TYPE(C)                             \
128template <typename S, typename T>                        \
129inline istream &ReadType(istream &strm, C<S, T> *c) {    \
130  c->clear();                                            \
131  int64 n = 0;                                           \
132  strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
133  StlReserve(c, n);                                      \
134  for (ssize_t i = 0; i < n; ++i) {                      \
135    typename C<S, T>::value_type value;                  \
136    ReadType(strm, &value);                              \
137    c->insert(c->end(), value);                          \
138  }                                                      \
139  return strm;                                           \
140}
141
142READ_STL_SEQ_TYPE(vector);
143READ_STL_SEQ_TYPE(list);
144
145// STL associative container.
146#define READ_STL_ASSOC_TYPE(C)                           \
147template <typename S, typename T, typename U>            \
148inline istream &ReadType(istream &strm, C<S, T, U> *c) { \
149  c->clear();                                            \
150  int64 n = 0;                                           \
151  strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
152  for (ssize_t i = 0; i < n; ++i) {                      \
153    typename C<S, T, U>::value_type value;               \
154    ReadType(strm, &value);                              \
155    c->insert(value);                                    \
156  }                                                      \
157  return strm;                                           \
158}
159
160READ_STL_ASSOC_TYPE(set);
161READ_STL_ASSOC_TYPE(unordered_set);
162READ_STL_ASSOC_TYPE(map);
163READ_STL_ASSOC_TYPE(unordered_map);
164
165// Write some types to an output stream.
166
167// Generic case.
168template <typename T>
169inline ostream &WriteType(ostream &strm, const T t) {
170  t.Write(strm);
171  return strm;
172}
173
174// Fixed size, contiguous memory write.
175#define WRITE_POD_TYPE(T)                                           \
176inline ostream &WriteType(ostream &strm, const T t) {               \
177  return strm.write(reinterpret_cast<const char *>(&t), sizeof(T)); \
178}
179
180WRITE_POD_TYPE(bool);
181WRITE_POD_TYPE(char);
182WRITE_POD_TYPE(signed char);
183WRITE_POD_TYPE(unsigned char);
184WRITE_POD_TYPE(short);
185WRITE_POD_TYPE(unsigned short);
186WRITE_POD_TYPE(int);
187WRITE_POD_TYPE(unsigned int);
188WRITE_POD_TYPE(long);
189WRITE_POD_TYPE(unsigned long);
190WRITE_POD_TYPE(long long);
191WRITE_POD_TYPE(unsigned long long);
192WRITE_POD_TYPE(float);
193WRITE_POD_TYPE(double);
194
195// String case.
196inline ostream &WriteType(ostream &strm, const string &s) {
197  int32 ns = s.size();
198  strm.write(reinterpret_cast<const char *>(&ns), sizeof(ns));
199  return strm.write(s.data(), ns);
200}
201
202// Pair case.
203template <typename S, typename T>
204inline ostream &WriteType(ostream &strm, const pair<S, T> &p) {
205  WriteType(strm, p.first);
206  WriteType(strm, p.second);
207  return strm;
208}
209
210// STL sequence container.
211#define WRITE_STL_SEQ_TYPE(C)                                                \
212template <typename S, typename T>                                            \
213inline ostream &WriteType(ostream &strm, const C<S, T> &c) {                 \
214  int64 n = c.size();                                                        \
215  strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
216  for (typename C<S, T>::const_iterator it = c.begin();                      \
217       it != c.end(); ++it)                                                  \
218     WriteType(strm, *it);                                                   \
219  return strm;                                                               \
220}
221
222WRITE_STL_SEQ_TYPE(vector);
223WRITE_STL_SEQ_TYPE(list);
224
225// STL associative container.
226#define WRITE_STL_ASSOC_TYPE(C)                                              \
227template <typename S, typename T, typename U>                                \
228inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) {              \
229  int64 n = c.size();                                                        \
230  strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
231  for (typename C<S, T, U>::const_iterator it = c.begin();                   \
232       it != c.end(); ++it)                                                  \
233     WriteType(strm, *it);                                                   \
234  return strm;                                                               \
235}
236
237WRITE_STL_ASSOC_TYPE(set);
238WRITE_STL_ASSOC_TYPE(unordered_set);
239WRITE_STL_ASSOC_TYPE(map);
240WRITE_STL_ASSOC_TYPE(unordered_map);
241
242// Utilities for converting between int64 or Weight and string.
243
244int64 StrToInt64(const string &s, const string &src, size_t nline,
245                 bool allow_negative, bool *error = 0);
246
247template <typename Weight>
248Weight StrToWeight(const string &s, const string &src, size_t nline) {
249  Weight w;
250  istringstream strm(s);
251  strm >> w;
252  if (!strm) {
253    FSTERROR() << "StrToWeight: Bad weight = \"" << s
254               << "\", source = " << src << ", line = " << nline;
255    return Weight::NoWeight();
256  }
257  return w;
258}
259
260void Int64ToStr(int64 n, string *s);
261
262template <typename Weight>
263void WeightToStr(Weight w, string *s) {
264  ostringstream strm;
265  strm.precision(9);
266  strm << w;
267  *s += strm.str();
268}
269
270// Utilities for reading/writing label pairs
271
272// Returns true on success
273template <typename Label>
274bool ReadLabelPairs(const string& filename,
275                    vector<pair<Label, Label> >* pairs,
276                    bool allow_negative = false) {
277  ifstream strm(filename.c_str());
278
279  if (!strm) {
280    LOG(ERROR) << "ReadLabelPairs: Can't open file: " << filename;
281    return false;
282  }
283
284  const int kLineLen = 8096;
285  char line[kLineLen];
286  size_t nline = 0;
287
288  pairs->clear();
289  while (strm.getline(line, kLineLen)) {
290    ++nline;
291    vector<char *> col;
292    SplitToVector(line, "\n\t ", &col, true);
293    if (col.size() == 0 || col[0][0] == '\0')  // empty line
294      continue;
295    if (col.size() != 2) {
296      LOG(ERROR) << "ReadLabelPairs: Bad number of columns, "
297                 << "file = " << filename << ", line = " << nline;
298      return false;
299    }
300
301    bool err;
302    Label frmlabel = StrToInt64(col[0], filename, nline, allow_negative, &err);
303    if (err) return false;
304    Label tolabel = StrToInt64(col[1], filename, nline, allow_negative, &err);
305    if (err) return false;
306    pairs->push_back(make_pair(frmlabel, tolabel));
307  }
308  return true;
309}
310
311// Returns true on success
312template <typename Label>
313bool WriteLabelPairs(const string& filename,
314                     const vector<pair<Label, Label> >& pairs) {
315  ostream *strm = &std::cout;
316  if (!filename.empty()) {
317    strm = new ofstream(filename.c_str());
318    if (!*strm) {
319      LOG(ERROR) << "WriteLabelPairs: Can't open file: " << filename;
320      return false;
321    }
322  }
323
324  for (ssize_t n = 0; n < pairs.size(); ++n)
325    *strm << pairs[n].first << "\t" << pairs[n].second << "\n";
326
327  if (!*strm) {
328    LOG(ERROR) << "WriteLabelPairs: Write failed: "
329               << (filename.empty() ? "standard output" : filename);
330    return false;
331  }
332  if (strm != &std::cout)
333    delete strm;
334  return true;
335}
336
337// Utilities for converting a type name to a legal C symbol.
338
339void ConvertToLegalCSymbol(string *s);
340
341
342//
343// UTILITIES FOR STREAM I/O
344//
345
346bool AlignInput(istream &strm, int align);
347bool AlignOutput(ostream &strm, int align);
348
349//
350// UTILITIES FOR PROTOCOL BUFFER I/O
351//
352
353
354// An associative container for which testing membership is
355// faster than an STL set if members are restricted to an interval
356// that excludes most non-members. A 'Key' must have ==, !=, and < defined.
357// Element 'NoKey' should be a key that marks an uninitialized key and
358// is otherwise unused. 'Find()' returns an STL const_iterator to the match
359// found, otherwise it equals 'End()'.
360template <class Key, Key NoKey>
361class CompactSet {
362public:
363  typedef typename set<Key>::const_iterator const_iterator;
364
365  CompactSet()
366    : min_key_(NoKey),
367      max_key_(NoKey) { }
368
369  CompactSet(const CompactSet<Key, NoKey> &compact_set)
370    : set_(compact_set.set_),
371      min_key_(compact_set.min_key_),
372      max_key_(compact_set.max_key_) { }
373
374  void Insert(Key key) {
375    set_.insert(key);
376    if (min_key_ == NoKey || key < min_key_)
377      min_key_ = key;
378    if (max_key_ == NoKey || max_key_ < key)
379        max_key_ = key;
380  }
381
382  void Clear() {
383    set_.clear();
384    min_key_ = max_key_ = NoKey;
385  }
386
387  const_iterator Find(Key key) const {
388    if (min_key_ == NoKey ||
389        key < min_key_ || max_key_ < key)
390      return set_.end();
391    else
392      return set_.find(key);
393  }
394
395  const_iterator Begin() const { return set_.begin(); }
396
397  const_iterator End() const { return set_.end(); }
398
399private:
400  set<Key> set_;
401  Key min_key_;
402  Key max_key_;
403
404  void operator=(const CompactSet<Key, NoKey> &);  //disallow
405};
406
407}  // namespace fst
408
409#endif  // FST_LIB_UTIL_H__
410