// string.h

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: allauzen@google.com (Cyril Allauzen)
//
// \file
// Utilities to convert strings into FSTs.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_LIB_STRING_H_ 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_LIB_STRING_H_ 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/compact-fst.h> 27dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin#include <fst/icu.h> 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/mutable-fst.h> 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian HodsonDECLARE_string(fst_field_separator); 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Functor compiling a string in an FST 35f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class A> 36f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass StringCompiler { 37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef A Arc; 39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::Label Label; 40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::Weight Weight; 41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson enum TokenType { SYMBOL = 1, BYTE = 2, UTF8 = 3 }; 43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StringCompiler(TokenType type, const SymbolTable *syms = 0, 45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label unknown_label = kNoLabel, 46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative = false) 47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : token_type_(type), syms_(syms), 
unknown_label_(unknown_label), 48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson allow_negative_(allow_negative) {} 49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Compile string 's' into FST 'fst'. 51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson template <class F> 52dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin bool operator()(const string &s, F *fst) const { 53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<Label> labels; 54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!ConvertStringToLabels(s, &labels)) 55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Compile(labels, fst); 57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 605b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin template <class F> 615b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin bool operator()(const string &s, F *fst, Weight w) const { 625b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin vector<Label> labels; 635b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (!ConvertStringToLabels(s, &labels)) 645b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return false; 655b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin Compile(labels, fst, w); 665b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return true; 675b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 685b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool ConvertStringToLabels(const string &str, vector<Label> *labels) const { 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson labels->clear(); 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (token_type_ == BYTE) { 
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < str.size(); ++i) 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson labels->push_back(static_cast<unsigned char>(str[i])); 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (token_type_ == UTF8) { 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return UTF8StringToLabels(str, labels); 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson char *c_str = new char[str.size() + 1]; 79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson str.copy(c_str, str.size()); 80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson c_str[str.size()] = 0; 81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<char *> vec; 82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string separator = "\n" + FLAGS_fst_field_separator; 83f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson SplitToVector(c_str, separator.c_str(), &vec, true); 84f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < vec.size(); ++i) { 85f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label label; 86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!ConvertSymbolToLabel(vec[i], &label)) 87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson labels->push_back(label); 89f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 90f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson delete[] c_str; 91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 955b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin void Compile(const vector<Label> &labels, MutableFst<A> *fst, 965b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Weight &weight = Weight::One()) const { 
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst->DeleteStates(); 98f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson while (fst->NumStates() <= labels.size()) 99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst->AddState(); 100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < labels.size(); ++i) 101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst->AddArc(i, Arc(labels[i], labels[i], Weight::One(), i + 1)); 102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst->SetStart(0); 1035b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin fst->SetFinal(labels.size(), weight); 104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 105f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 106f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson template <class Unsigned> 1075b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin void Compile(const vector<Label> &labels, 1085b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin CompactFst<A, StringCompactor<A>, Unsigned> *fst) const { 109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson fst->SetCompactElements(labels.begin(), labels.end()); 110f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 1125b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin template <class Unsigned> 1135b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin void Compile(const vector<Label> &labels, 1145b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin CompactFst<A, WeightedStringCompactor<A>, Unsigned> *fst, 1155b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Weight &weight = Weight::One()) const { 1165b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin vector<pair<Label, Weight> > compacts; 1175b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin compacts.reserve(labels.size()); 1185b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin for (size_t i = 0; i < labels.size(); ++i) 
1195b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin compacts.push_back(make_pair(labels[i], Weight::One())); 1205b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin compacts.back().second = weight; 1215b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin fst->SetCompactElements(compacts.begin(), compacts.end()); 1225b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 1235b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool ConvertSymbolToLabel(const char *s, Label* output) const { 125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int64 n; 126f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (syms_) { 127f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson n = syms_->Find(s); 128f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if ((n == -1) && (unknown_label_ != kNoLabel)) 129f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson n = unknown_label_; 130f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (n == -1 || (!allow_negative_ && n < 0)) { 131f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(1) << "StringCompiler::ConvertSymbolToLabel: Symbol \"" << s 132f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "\" is not mapped to any integer label, symbol table = " 133f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << syms_->Name(); 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson char *p; 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson n = strtoll(s, &p, 10); 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (p < s + strlen(s) || (!allow_negative_ && n < 0)) { 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(1) << "StringCompiler::ConvertSymbolToLabel: Bad label integer " 141f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "= \"" << s << "\""; 
142f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 143f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 144f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 145f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson *output = n; 146f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 147f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 148f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 149f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson TokenType token_type_; // Token type: symbol, byte or utf8 encoded 150f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable *syms_; // Symbol table used when token type is symbol 151f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson Label unknown_label_; // Label for token missing from symbol table 152f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool allow_negative_; // Negative labels allowed? 153f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 154f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson DISALLOW_COPY_AND_ASSIGN(StringCompiler); 155f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 156f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 157f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Functor to print a string FST as a string. 
158f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class A> 159f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass StringPrinter { 160f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public: 161f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef A Arc; 162f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::Label Label; 163f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::StateId StateId; 164f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson typedef typename A::Weight Weight; 165f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 166f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson enum TokenType { SYMBOL = 1, BYTE = 2, UTF8 = 3 }; 167f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 168f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StringPrinter(TokenType token_type, 169f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable *syms = 0) 170f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson : token_type_(token_type), syms_(syms) {} 171f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 172f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson // Convert the FST 'fst' into the string 'output' 173f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool operator()(const Fst<A> &fst, string *output) { 174f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool is_a_string = FstToLabels(fst); 175f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!is_a_string) { 176f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(1) << "StringPrinter::operator(): Fst is not a string."; 177f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 178f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 179f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 180f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson output->clear(); 181f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 182f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (token_type_ == SYMBOL) { 
183f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson stringstream sstrm; 184f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < labels_.size(); ++i) { 185f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (i) 186f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson sstrm << *(FLAGS_fst_field_separator.rbegin()); 187f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!PrintLabel(labels_[i], sstrm)) 188f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 189f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 190f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson *output = sstrm.str(); 191f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (token_type_ == BYTE) { 1925b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin output->reserve(labels_.size()); 193f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson for (size_t i = 0; i < labels_.size(); ++i) { 194f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson output->push_back(labels_[i]); 195f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 196f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else if (token_type_ == UTF8) { 197f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return LabelsToUTF8String(labels_, output); 198f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 199f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(1) << "StringPrinter::operator(): Unknown token type: " 200f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << token_type_; 201f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 202f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 203f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 204f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 205f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 206f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private: 207f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool FstToLabels(const Fst<A> &fst) { 208f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 
labels_.clear(); 209f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 210f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson StateId s = fst.Start(); 211f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (s == kNoStateId) { 212f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(2) << "StringPrinter::FstToLabels: Invalid starting state for " 213f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "string fst."; 214f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 215f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 216f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 217f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson while (fst.Final(s) == Weight::Zero()) { 218f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ArcIterator<Fst<A> > aiter(fst, s); 219f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (aiter.Done()) { 220f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(2) << "StringPrinter::FstToLabels: String fst traversal does " 221f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "not reach final state."; 222f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 223f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 224f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 225f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const A& arc = aiter.Value(); 226f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson labels_.push_back(arc.olabel); 227f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 228f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson s = arc.nextstate; 229f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (s == kNoStateId) { 230f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(2) << "StringPrinter::FstToLabels: Transition to invalid " 231f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "state."; 232f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 233f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 234f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 
235f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson aiter.Next(); 236f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!aiter.Done()) { 237f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(2) << "StringPrinter::FstToLabels: State with multiple " 238f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "outgoing arcs found."; 239f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 240f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 241f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 242f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 243f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 244f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 245f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 246f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson bool PrintLabel(Label lab, ostream& ostrm) { 247f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (syms_) { 248f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string symbol = syms_->Find(lab); 249f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (symbol == "") { 250f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson VLOG(2) << "StringPrinter::PrintLabel: Integer " << lab << " is not " 251f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << "mapped to any textual symbol, symbol table = " 252f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson << syms_->Name(); 253f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return false; 254f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 255f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ostrm << symbol; 256f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } else { 257f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson ostrm << lab; 258f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 259f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return true; 260f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 261f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 262f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 
TokenType token_type_; // Token type: symbol, byte or utf8 encoded 263f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const SymbolTable *syms_; // Symbol table used when token type is symbol 264f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson vector<Label> labels_; // Input FST labels. 265f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 266f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson DISALLOW_COPY_AND_ASSIGN(StringPrinter); 267f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}; 268f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 269f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 270f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 271f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_LIB_STRING_H_ 272