// extract-main.h

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: riley@google.com (Michael Riley)
// Modified: jpr@google.com (Jake Ratkiewicz) to use the new arc-dispatch

// \file
// Extracts component FSTs from a finite-state archive.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// 22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_FAR_EXTRACT_H__ 24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_FAR_EXTRACT_H__ 25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string> 27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector> 28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector; 29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/extensions/far/far.h> 31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst { 33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate<class Arc> 355b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkininline void FarWriteFst(const Fst<Arc>* fst, string key, 365b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string* okey, int* nrep, 375b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const int32 &generate_filenames, int i, 385b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const string &filename_prefix, 395b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const string &filename_suffix) { 405b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (key == *okey) 415b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ++*nrep; 425b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin else 435b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin *nrep = 0; 445b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 455b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin *okey = key; 465b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 475b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string ofilename; 
485b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (generate_filenames) { 495b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ostringstream tmp; 505b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin tmp.width(generate_filenames); 515b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin tmp.fill('0'); 525b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin tmp << i; 535b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ofilename = tmp.str(); 545b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } else { 555b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (*nrep > 0) { 565b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ostringstream tmp; 575b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin tmp << '.' << nrep; 585b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin key.append(tmp.str().data(), tmp.str().size()); 595b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 605b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin ofilename = key; 615b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 625b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin fst->Write(filename_prefix + ofilename + filename_suffix); 635b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin} 645b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 655b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkintemplate<class Arc> 66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid FarExtract(const vector<string> &ifilenames, 67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const int32 &generate_filenames, 685b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const string &keys, 695b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const string &key_separator, 705b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const string &range_delimiter, 71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string &filename_prefix, 72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson const string 
&filename_suffix) { 73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson FarReader<Arc> *far_reader = FarReader<Arc>::Open(ifilenames); 74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson if (!far_reader) return; 75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson string okey; 77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson int nrep = 0; 78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 795b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin vector<char *> key_vector; 805b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin // User has specified a set of fsts to extract, where some of the "fsts" could 815b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin // be ranges. 825b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (!keys.empty()) { 835b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin char *keys_cstr = new char[keys.size()+1]; 845b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin strcpy(keys_cstr, keys.c_str()); 855b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin SplitToVector(keys_cstr, key_separator.c_str(), &key_vector, true); 865b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin int i = 0; 875b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin for (int k = 0; k < key_vector.size(); ++k, ++i) { 885b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string key = string(key_vector[k]); 895b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin char *key_cstr = new char[key.size()+1]; 905b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin strcpy(key_cstr, key.c_str()); 915b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin vector<char *> range_vector; 925b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin SplitToVector(key_cstr, range_delimiter.c_str(), &range_vector, false); 935b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (range_vector.size() == 1) { // Not a range 945b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin 
if (!far_reader->Find(key)) { 955b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin LOG(ERROR) << "FarExtract: Cannot find key: " << key; 965b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return; 975b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 985b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Fst<Arc> &fst = far_reader->GetFst(); 995b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin FarWriteFst(&fst, key, &okey, &nrep, generate_filenames, i, 1005b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin filename_prefix, filename_suffix); 1015b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } else if (range_vector.size() == 2) { // A legal range 1025b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string begin_key = string(range_vector[0]); 1035b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string end_key = string(range_vector[1]); 1045b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (begin_key.empty() || end_key.empty()) { 1055b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin LOG(ERROR) << "FarExtract: Illegal range specification: " << key; 1065b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return; 1075b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 1085b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (!far_reader->Find(begin_key)) { 1095b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin LOG(ERROR) << "FarExtract: Cannot find key: " << begin_key; 1105b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return; 1115b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 1125b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin for ( ; !far_reader->Done(); far_reader->Next(), ++i) { 1135b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string ikey = far_reader->GetKey(); 1145b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin if (end_key < ikey) break; 1155b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Fst<Arc> &fst = 
far_reader->GetFst(); 1165b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin FarWriteFst(&fst, ikey, &okey, &nrep, generate_filenames, i, 1175b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin filename_prefix, filename_suffix); 1185b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 1195b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } else { 1205b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin LOG(ERROR) << "FarExtract: Illegal range specification: " << key; 1215b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return; 122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 1235b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin delete key_cstr; 124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 1255b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin delete keys_cstr; 1265b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin return; 1275b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin } 1285b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin // Nothing specified: extract everything. 
1295b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) { 1305b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin string key = far_reader->GetKey(); 1315b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin const Fst<Arc> &fst = far_reader->GetFst(); 1325b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin FarWriteFst(&fst, key, &okey, &nrep, generate_filenames, i, 1335b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin filename_prefix, filename_suffix); 134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson } 135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson return; 136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} 137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson} // namespace fst 139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson 140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif // FST_EXTENSIONS_FAR_EXTRACT_H__ 141