util.h revision 5b6dc79427b8f7eeb6a7ff68034ab8548ce670ea
1// util.h
2
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Copyright 2005-2010 Google, Inc.
16// Author: riley@google.com (Michael Riley)
17//
18// \file
19// FST utility inline definitions.
20
21#ifndef FST_LIB_UTIL_H__
22#define FST_LIB_UTIL_H__
23
24#include <tr1/unordered_map>
25using std::tr1::unordered_map;
26using std::tr1::unordered_multimap;
27#include <tr1/unordered_set>
28using std::tr1::unordered_set;
29using std::tr1::unordered_multiset;
30#include <list>
31#include <map>
32#include <set>
33#include <sstream>
34#include <string>
35#include <vector>
36using std::vector;
37
38
39#include <fst/compat.h>
40#include <fst/types.h>
41
42#include <iostream>
43#include <fstream>
44#include <sstream>
45
46//
47// UTILITY FOR ERROR HANDLING
48//
49
50DECLARE_bool(fst_error_fatal);
51
52#define FSTERROR() (FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
53
54namespace fst {
55
56//
57// UTILITIES FOR TYPE I/O
58//
59
60// Read some types from an input stream.
61
62// Generic case.
63template <typename T>
64inline istream &ReadType(istream &strm, T *t) {
65  return t->Read(strm);
66}
67
68// Fixed size, contiguous memory read.
69#define READ_POD_TYPE(T)                                    \
70inline istream &ReadType(istream &strm, T *t) {             \
71  return strm.read(reinterpret_cast<char *>(t), sizeof(T)); \
72}
73
74READ_POD_TYPE(bool);
75READ_POD_TYPE(char);
76READ_POD_TYPE(signed char);
77READ_POD_TYPE(unsigned char);
78READ_POD_TYPE(short);
79READ_POD_TYPE(unsigned short);
80READ_POD_TYPE(int);
81READ_POD_TYPE(unsigned int);
82READ_POD_TYPE(long);
83READ_POD_TYPE(unsigned long);
84READ_POD_TYPE(long long);
85READ_POD_TYPE(unsigned long long);
86READ_POD_TYPE(float);
87READ_POD_TYPE(double);
88
89// String case.
90inline istream &ReadType(istream &strm, string *s) {
91  s->clear();
92  int32 ns = 0;
93  strm.read(reinterpret_cast<char *>(&ns), sizeof(ns));
94  for (int i = 0; i < ns; ++i) {
95    char c;
96    strm.read(&c, 1);
97    *s += c;
98  }
99  return strm;
100}
101
102// Pair case.
103template <typename S, typename T>
104inline istream &ReadType(istream &strm, pair<S, T> *p) {
105  ReadType(strm, &p->first);
106  ReadType(strm, &p->second);
107  return strm;
108}
109
110template <typename S, typename T>
111inline istream &ReadType(istream &strm, pair<const S, T> *p) {
112  ReadType(strm, const_cast<S *>(&p->first));
113  ReadType(strm, &p->second);
114  return strm;
115}
116
117// General case - no-op.
118template <typename C>
119void StlReserve(C *c, int64 n) {}
120
121// Specialization for vectors.
122template <typename S, typename T>
123void StlReserve(vector<S, T> *c, int64 n) {
124  c->reserve(n);
125}
126
127// STL sequence container.
128#define READ_STL_SEQ_TYPE(C)                             \
129template <typename S, typename T>                        \
130inline istream &ReadType(istream &strm, C<S, T> *c) {    \
131  c->clear();                                            \
132  int64 n = 0;                                           \
133  strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
134  StlReserve(c, n);                                      \
135  for (ssize_t i = 0; i < n; ++i) {                      \
136    typename C<S, T>::value_type value;                  \
137    ReadType(strm, &value);                              \
138    c->insert(c->end(), value);                          \
139  }                                                      \
140  return strm;                                           \
141}
142
143READ_STL_SEQ_TYPE(vector);
144READ_STL_SEQ_TYPE(list);
145
146// STL associative container.
147#define READ_STL_ASSOC_TYPE(C)                           \
148template <typename S, typename T, typename U>            \
149inline istream &ReadType(istream &strm, C<S, T, U> *c) { \
150  c->clear();                                            \
151  int64 n = 0;                                           \
152  strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
153  for (ssize_t i = 0; i < n; ++i) {                      \
154    typename C<S, T, U>::value_type value;               \
155    ReadType(strm, &value);                              \
156    c->insert(value);                                    \
157  }                                                      \
158  return strm;                                           \
159}
160
161READ_STL_ASSOC_TYPE(set);
162READ_STL_ASSOC_TYPE(unordered_set);
163READ_STL_ASSOC_TYPE(map);
164READ_STL_ASSOC_TYPE(unordered_map);
165
166// Write some types to an output stream.
167
168// Generic case.
169template <typename T>
170inline ostream &WriteType(ostream &strm, const T t) {
171  t.Write(strm);
172  return strm;
173}
174
175// Fixed size, contiguous memory write.
176#define WRITE_POD_TYPE(T)                                           \
177inline ostream &WriteType(ostream &strm, const T t) {               \
178  return strm.write(reinterpret_cast<const char *>(&t), sizeof(T)); \
179}
180
181WRITE_POD_TYPE(bool);
182WRITE_POD_TYPE(char);
183WRITE_POD_TYPE(signed char);
184WRITE_POD_TYPE(unsigned char);
185WRITE_POD_TYPE(short);
186WRITE_POD_TYPE(unsigned short);
187WRITE_POD_TYPE(int);
188WRITE_POD_TYPE(unsigned int);
189WRITE_POD_TYPE(long);
190WRITE_POD_TYPE(unsigned long);
191WRITE_POD_TYPE(long long);
192WRITE_POD_TYPE(unsigned long long);
193WRITE_POD_TYPE(float);
194WRITE_POD_TYPE(double);
195
196// String case.
197inline ostream &WriteType(ostream &strm, const string &s) {
198  int32 ns = s.size();
199  strm.write(reinterpret_cast<const char *>(&ns), sizeof(ns));
200  return strm.write(s.data(), ns);
201}
202
203// Pair case.
204template <typename S, typename T>
205inline ostream &WriteType(ostream &strm, const pair<S, T> &p) {
206  WriteType(strm, p.first);
207  WriteType(strm, p.second);
208  return strm;
209}
210
211// STL sequence container.
212#define WRITE_STL_SEQ_TYPE(C)                                                \
213template <typename S, typename T>                                            \
214inline ostream &WriteType(ostream &strm, const C<S, T> &c) {                 \
215  int64 n = c.size();                                                        \
216  strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
217  for (typename C<S, T>::const_iterator it = c.begin();                      \
218       it != c.end(); ++it)                                                  \
219     WriteType(strm, *it);                                                   \
220  return strm;                                                               \
221}
222
223WRITE_STL_SEQ_TYPE(vector);
224WRITE_STL_SEQ_TYPE(list);
225
226// STL associative container.
227#define WRITE_STL_ASSOC_TYPE(C)                                              \
228template <typename S, typename T, typename U>                                \
229inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) {              \
230  int64 n = c.size();                                                        \
231  strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
232  for (typename C<S, T, U>::const_iterator it = c.begin();                   \
233       it != c.end(); ++it)                                                  \
234     WriteType(strm, *it);                                                   \
235  return strm;                                                               \
236}
237
238WRITE_STL_ASSOC_TYPE(set);
239WRITE_STL_ASSOC_TYPE(unordered_set);
240WRITE_STL_ASSOC_TYPE(map);
241WRITE_STL_ASSOC_TYPE(unordered_map);
242
243// Utilities for converting between int64 or Weight and string.
244
245int64 StrToInt64(const string &s, const string &src, size_t nline,
246                 bool allow_negative, bool *error = 0);
247
248template <typename Weight>
249Weight StrToWeight(const string &s, const string &src, size_t nline) {
250  Weight w;
251  istringstream strm(s);
252  strm >> w;
253  if (!strm) {
254    FSTERROR() << "StrToWeight: Bad weight = \"" << s
255               << "\", source = " << src << ", line = " << nline;
256    return Weight::NoWeight();
257  }
258  return w;
259}
260
261void Int64ToStr(int64 n, string *s);
262
263template <typename Weight>
264void WeightToStr(Weight w, string *s) {
265  ostringstream strm;
266  strm.precision(9);
267  strm << w;
268  s->append(strm.str().data(), strm.str().size());
269}
270
271// Utilities for reading/writing integer pairs (typically labels)
272
273// Returns true on success
274template <typename I>
275bool ReadIntPairs(const string& filename,
276                    vector<pair<I, I> >* pairs,
277                    bool allow_negative = false) {
278  ifstream strm(filename.c_str());
279
280  if (!strm) {
281    LOG(ERROR) << "ReadIntPairs: Can't open file: " << filename;
282    return false;
283  }
284
285  const int kLineLen = 8096;
286  char line[kLineLen];
287  size_t nline = 0;
288
289  pairs->clear();
290  while (strm.getline(line, kLineLen)) {
291    ++nline;
292    vector<char *> col;
293    SplitToVector(line, "\n\t ", &col, true);
294    // empty line or comment?
295    if (col.size() == 0 || col[0][0] == '\0' || col[0][0] == '#')
296      continue;
297    if (col.size() != 2) {
298      LOG(ERROR) << "ReadIntPairs: Bad number of columns, "
299                 << "file = " << filename << ", line = " << nline;
300      return false;
301    }
302
303    bool err;
304    I i1 = StrToInt64(col[0], filename, nline, allow_negative, &err);
305    if (err) return false;
306    I i2 = StrToInt64(col[1], filename, nline, allow_negative, &err);
307    if (err) return false;
308    pairs->push_back(make_pair(i1, i2));
309  }
310  return true;
311}
312
313// Returns true on success
314template <typename I>
315bool WriteIntPairs(const string& filename,
316                   const vector<pair<I, I> >& pairs) {
317  ostream *strm = &cout;
318  if (!filename.empty()) {
319    strm = new ofstream(filename.c_str());
320    if (!*strm) {
321      LOG(ERROR) << "WriteIntPairs: Can't open file: " << filename;
322      return false;
323    }
324  }
325
326  for (ssize_t n = 0; n < pairs.size(); ++n)
327    *strm << pairs[n].first << "\t" << pairs[n].second << "\n";
328
329  if (!*strm) {
330    LOG(ERROR) << "WriteIntPairs: Write failed: "
331               << (filename.empty() ? "standard output" : filename);
332    return false;
333  }
334  if (strm != &cout)
335    delete strm;
336  return true;
337}
338
339// Utilities for reading/writing label pairs
340
341template <typename Label>
342bool ReadLabelPairs(const string& filename,
343                    vector<pair<Label, Label> >* pairs,
344                    bool allow_negative = false) {
345  return ReadIntPairs(filename, pairs, allow_negative);
346}
347
348template <typename Label>
349bool WriteLabelPairs(const string& filename,
350                     vector<pair<Label, Label> >& pairs) {
351  return WriteIntPairs(filename, pairs);
352}
353
354// Utilities for converting a type name to a legal C symbol.
355
356void ConvertToLegalCSymbol(string *s);
357
358
359//
360// UTILITIES FOR STREAM I/O
361//
362
363bool AlignInput(istream &strm);
364bool AlignOutput(ostream &strm);
365
366//
367// UTILITIES FOR PROTOCOL BUFFER I/O
368//
369
370
371// An associative container for which testing membership is
372// faster than an STL set if members are restricted to an interval
373// that excludes most non-members. A 'Key' must have ==, !=, and < defined.
374// Element 'NoKey' should be a key that marks an uninitialized key and
375// is otherwise unused. 'Find()' returns an STL const_iterator to the match
376// found, otherwise it equals 'End()'.
377template <class Key, Key NoKey>
378class CompactSet {
379public:
380  typedef typename set<Key>::const_iterator const_iterator;
381
382  CompactSet()
383    : min_key_(NoKey),
384      max_key_(NoKey) { }
385
386  CompactSet(const CompactSet<Key, NoKey> &compact_set)
387    : set_(compact_set.set_),
388      min_key_(compact_set.min_key_),
389      max_key_(compact_set.max_key_) { }
390
391  void Insert(Key key) {
392    set_.insert(key);
393    if (min_key_ == NoKey || key < min_key_)
394      min_key_ = key;
395    if (max_key_ == NoKey || max_key_ < key)
396        max_key_ = key;
397  }
398
399  void Erase(Key key) {
400    set_.erase(key);
401    if (set_.empty()) {
402        min_key_ = max_key_ = NoKey;
403    } else if (key == min_key_) {
404      ++min_key_;
405    } else if (key == max_key_) {
406      --max_key_;
407    }
408  }
409
410  void Clear() {
411    set_.clear();
412    min_key_ = max_key_ = NoKey;
413  }
414
415  const_iterator Find(Key key) const {
416    if (min_key_ == NoKey ||
417        key < min_key_ || max_key_ < key)
418      return set_.end();
419    else
420      return set_.find(key);
421  }
422
423  bool Member(Key key) const {
424    if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
425      return false;   // out of range
426    } else if (min_key_ != NoKey && max_key_ + 1 == min_key_ + set_.size()) {
427      return true;    // dense range
428    } else {
429      return set_.find(key) != set_.end();
430    }
431  }
432
433  const_iterator Begin() const { return set_.begin(); }
434
435  const_iterator End() const { return set_.end(); }
436
437  // All stored keys are greater than or equal to this value.
438  Key LowerBound() const { return min_key_; }
439
440  // All stored keys are less than or equal to this value.
441  Key UpperBound() const { return max_key_; }
442
443private:
444  set<Key> set_;
445  Key min_key_;
446  Key max_key_;
447
448  void operator=(const CompactSet<Key, NoKey> &);  //disallow
449};
450
451}  // namespace fst
452
453#endif  // FST_LIB_UTIL_H__
454