1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
// string.h

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: allauzen@google.com (Cyril Allauzen)
//
// \file
// Utilities to convert strings into FSTs.
//
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_STRING_H_
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_STRING_H_
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/compact-fst.h>
27dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/icu.h>
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/mutable-fst.h>
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonDECLARE_string(fst_field_separator);
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Functor compiling a string in an FST
35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class A>
36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass StringCompiler {
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef A Arc;
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::Label Label;
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::Weight Weight;
41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  enum TokenType { SYMBOL = 1, BYTE = 2, UTF8 = 3 };
43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  StringCompiler(TokenType type, const SymbolTable *syms = 0,
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                 Label unknown_label = kNoLabel,
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                 bool allow_negative = false)
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : token_type_(type), syms_(syms), unknown_label_(unknown_label),
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        allow_negative_(allow_negative) {}
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Compile string 's' into FST 'fst'.
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  template <class F>
52dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  bool operator()(const string &s, F *fst) const {
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    vector<Label> labels;
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!ConvertStringToLabels(s, &labels))
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    Compile(labels, fst);
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
605b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  template <class F>
615b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  bool operator()(const string &s, F *fst, Weight w) const {
625b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    vector<Label> labels;
635b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    if (!ConvertStringToLabels(s, &labels))
645b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      return false;
655b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    Compile(labels, fst, w);
665b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    return true;
675b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  }
685b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool ConvertStringToLabels(const string &str, vector<Label> *labels) const {
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    labels->clear();
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (token_type_ == BYTE) {
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      for (size_t i = 0; i < str.size(); ++i)
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        labels->push_back(static_cast<unsigned char>(str[i]));
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else if (token_type_ == UTF8) {
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return UTF8StringToLabels(str, labels);
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      char *c_str = new char[str.size() + 1];
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      str.copy(c_str, str.size());
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      c_str[str.size()] = 0;
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      vector<char *> vec;
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      string separator = "\n" + FLAGS_fst_field_separator;
83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      SplitToVector(c_str, separator.c_str(), &vec, true);
84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      for (size_t i = 0; i < vec.size(); ++i) {
85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        Label label;
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        if (!ConvertSymbolToLabel(vec[i], &label))
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          return false;
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        labels->push_back(label);
89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      delete[] c_str;
91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
955b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  void Compile(const vector<Label> &labels, MutableFst<A> *fst,
965b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin               const Weight &weight = Weight::One()) const {
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    fst->DeleteStates();
98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    while (fst->NumStates() <= labels.size())
99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      fst->AddState();
100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (size_t i = 0; i < labels.size(); ++i)
101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      fst->AddArc(i, Arc(labels[i], labels[i], Weight::One(), i + 1));
102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    fst->SetStart(0);
1035b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    fst->SetFinal(labels.size(), weight);
104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  template <class Unsigned>
1075b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  void Compile(const vector<Label> &labels,
1085b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin               CompactFst<A, StringCompactor<A>, Unsigned> *fst) const {
109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    fst->SetCompactElements(labels.begin(), labels.end());
110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
1125b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  template <class Unsigned>
1135b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  void Compile(const vector<Label> &labels,
1145b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin               CompactFst<A, WeightedStringCompactor<A>, Unsigned> *fst,
1155b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin               const Weight &weight = Weight::One()) const {
1165b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    vector<pair<Label, Weight> > compacts;
1175b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    compacts.reserve(labels.size());
1185b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    for (size_t i = 0; i < labels.size(); ++i)
1195b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      compacts.push_back(make_pair(labels[i], Weight::One()));
1205b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    compacts.back().second = weight;
1215b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    fst->SetCompactElements(compacts.begin(), compacts.end());
1225b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  }
1235b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool ConvertSymbolToLabel(const char *s, Label* output) const {
125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    int64 n;
126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (syms_) {
127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      n = syms_->Find(s);
128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if ((n == -1) && (unknown_label_ != kNoLabel))
129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        n = unknown_label_;
130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (n == -1 || (!allow_negative_ && n < 0)) {
131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        VLOG(1) << "StringCompiler::ConvertSymbolToLabel: Symbol \"" << s
132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                << "\" is not mapped to any integer label, symbol table = "
133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                 << syms_->Name();
134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return false;
135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      char *p;
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      n = strtoll(s, &p, 10);
139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (p < s + strlen(s) || (!allow_negative_ && n < 0)) {
140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        VLOG(1) << "StringCompiler::ConvertSymbolToLabel: Bad label integer "
141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                << "= \"" << s << "\"";
142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return false;
143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    *output = n;
146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  TokenType token_type_;     // Token type: symbol, byte or utf8 encoded
150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTable *syms_;  // Symbol table used when token type is symbol
151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  Label unknown_label_;      // Label for token missing from symbol table
152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool allow_negative_;      // Negative labels allowed?
153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  DISALLOW_COPY_AND_ASSIGN(StringCompiler);
155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Functor to print a string FST as a string.
158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class A>
159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass StringPrinter {
160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef A Arc;
162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::Label Label;
163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::StateId StateId;
164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef typename A::Weight Weight;
165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  enum TokenType { SYMBOL = 1, BYTE = 2, UTF8 = 3 };
167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  StringPrinter(TokenType token_type,
169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                const SymbolTable *syms = 0)
170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      : token_type_(token_type), syms_(syms) {}
171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  // Convert the FST 'fst' into the string 'output'
173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool operator()(const Fst<A> &fst, string *output) {
174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    bool is_a_string = FstToLabels(fst);
175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (!is_a_string) {
176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      VLOG(1) << "StringPrinter::operator(): Fst is not a string.";
177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    output->clear();
181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (token_type_ == SYMBOL) {
183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      stringstream sstrm;
184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      for (size_t i = 0; i < labels_.size(); ++i) {
185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        if (i)
186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          sstrm << *(FLAGS_fst_field_separator.rbegin());
187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        if (!PrintLabel(labels_[i], sstrm))
188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson          return false;
189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      *output = sstrm.str();
191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else if (token_type_ == BYTE) {
1925b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      output->reserve(labels_.size());
193f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      for (size_t i = 0; i < labels_.size(); ++i) {
194f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        output->push_back(labels_[i]);
195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else if (token_type_ == UTF8) {
197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return LabelsToUTF8String(labels_, output);
198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      VLOG(1) << "StringPrinter::operator(): Unknown token type: "
200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson              << token_type_;
201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool FstToLabels(const Fst<A> &fst) {
208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    labels_.clear();
209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    StateId s = fst.Start();
211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (s == kNoStateId) {
212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      VLOG(2) << "StringPrinter::FstToLabels: Invalid starting state for "
213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson              << "string fst.";
214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return false;
215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
217f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    while (fst.Final(s) == Weight::Zero()) {
218f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ArcIterator<Fst<A> > aiter(fst, s);
219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (aiter.Done()) {
220f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        VLOG(2) << "StringPrinter::FstToLabels: String fst traversal does "
221f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                << "not reach final state.";
222f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return false;
223f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      const A& arc = aiter.Value();
226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      labels_.push_back(arc.olabel);
227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      s = arc.nextstate;
229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (s == kNoStateId) {
230f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        VLOG(2) << "StringPrinter::FstToLabels: Transition to invalid "
231f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                << "state.";
232f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return false;
233f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
234f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
235f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      aiter.Next();
236f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (!aiter.Done()) {
237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        VLOG(2) << "StringPrinter::FstToLabels: State with multiple "
238f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                << "outgoing arcs found.";
239f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return false;
240f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
241f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
242f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
243f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
244f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
245f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
246f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  bool PrintLabel(Label lab, ostream& ostrm) {
247f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    if (syms_) {
248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      string symbol = syms_->Find(lab);
249f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (symbol == "") {
250f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        VLOG(2) << "StringPrinter::PrintLabel: Integer " << lab << " is not "
251f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                << "mapped to any textual symbol, symbol table = "
252f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                 << syms_->Name();
253f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        return false;
254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
255f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ostrm << symbol;
256f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
257f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      ostrm << lab;
258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return true;
260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
261f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  TokenType token_type_;     // Token type: symbol, byte or utf8 encoded
263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  const SymbolTable *syms_;  // Symbol table used when token type is symbol
264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  vector<Label> labels_;     // Input FST labels.
265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  DISALLOW_COPY_AND_ASSIGN(StringPrinter);
267f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
269f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_LIB_STRING_H_
272