1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// extract-main.h
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author: riley@google.com (Michael Riley)
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Modified: jpr@google.com (Jake Ratkiewicz) to use the new arc-dispatch
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
19f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \file
// Extracts component FSTs from a finite-state archive.
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_FAR_EXTRACT_H__
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_FAR_EXTRACT_H__
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <string>
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/extensions/far/far.h>
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
32f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
33f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
34f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate<class Arc>
355b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkininline void FarWriteFst(const Fst<Arc>* fst, string key,
365b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                        string* okey, int* nrep,
375b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                        const int32 &generate_filenames, int i,
385b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                        const string &filename_prefix,
395b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                        const string &filename_suffix) {
405b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  if (key == *okey)
415b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    ++*nrep;
425b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  else
435b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    *nrep = 0;
445b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin
455b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  *okey = key;
465b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin
475b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  string ofilename;
485b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  if (generate_filenames) {
495b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    ostringstream tmp;
505b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    tmp.width(generate_filenames);
515b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    tmp.fill('0');
525b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    tmp << i;
535b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    ofilename = tmp.str();
545b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  } else {
555b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    if (*nrep > 0) {
565b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      ostringstream tmp;
575b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      tmp << '.' << nrep;
585b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      key.append(tmp.str().data(), tmp.str().size());
595b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    }
605b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    ofilename = key;
615b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  }
625b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  fst->Write(filename_prefix + ofilename + filename_suffix);
635b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin}
645b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin
655b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkintemplate<class Arc>
66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonvoid FarExtract(const vector<string> &ifilenames,
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                const int32 &generate_filenames,
685b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                const string &keys,
695b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                const string &key_separator,
705b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                const string &range_delimiter,
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                const string &filename_prefix,
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson                const string &filename_suffix) {
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  FarReader<Arc> *far_reader = FarReader<Arc>::Open(ifilenames);
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  if (!far_reader) return;
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  string okey;
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  int nrep = 0;
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
795b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  vector<char *> key_vector;
805b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  // User has specified a set of fsts to extract, where some of the "fsts" could
815b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  // be ranges.
825b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  if (!keys.empty()) {
835b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    char *keys_cstr = new char[keys.size()+1];
845b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    strcpy(keys_cstr, keys.c_str());
855b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    SplitToVector(keys_cstr, key_separator.c_str(), &key_vector, true);
865b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    int i = 0;
875b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    for (int k = 0; k < key_vector.size(); ++k, ++i) {
885b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      string key = string(key_vector[k]);
895b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      char *key_cstr = new char[key.size()+1];
905b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      strcpy(key_cstr, key.c_str());
915b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      vector<char *> range_vector;
925b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      SplitToVector(key_cstr, range_delimiter.c_str(), &range_vector, false);
935b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      if (range_vector.size() == 1) {  // Not a range
945b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        if (!far_reader->Find(key)) {
955b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          LOG(ERROR) << "FarExtract: Cannot find key: " << key;
965b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          return;
975b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        }
985b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        const Fst<Arc> &fst = far_reader->GetFst();
995b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        FarWriteFst(&fst, key, &okey, &nrep, generate_filenames, i,
1005b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                    filename_prefix, filename_suffix);
1015b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      } else if (range_vector.size() == 2) {  // A legal range
1025b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        string begin_key = string(range_vector[0]);
1035b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        string end_key = string(range_vector[1]);
1045b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        if (begin_key.empty() || end_key.empty()) {
1055b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          LOG(ERROR) << "FarExtract: Illegal range specification: " << key;
1065b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          return;
1075b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        }
1085b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        if (!far_reader->Find(begin_key)) {
1095b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          LOG(ERROR) << "FarExtract: Cannot find key: " << begin_key;
1105b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          return;
1115b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        }
1125b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        for ( ; !far_reader->Done(); far_reader->Next(), ++i) {
1135b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          string ikey = far_reader->GetKey();
1145b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          if (end_key < ikey) break;
1155b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          const Fst<Arc> &fst = far_reader->GetFst();
1165b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin          FarWriteFst(&fst, ikey, &okey, &nrep, generate_filenames, i,
1175b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                      filename_prefix, filename_suffix);
1185b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        }
1195b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      } else {
1205b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        LOG(ERROR) << "FarExtract: Illegal range specification: " << key;
1215b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin        return;
122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      }
1235b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin      delete key_cstr;
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
1255b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    delete keys_cstr;
1265b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    return;
1275b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  }
1285b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  // Nothing specified: extract everything.
1295b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin  for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) {
1305b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    string key = far_reader->GetKey();
1315b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    const Fst<Arc> &fst = far_reader->GetFst();
1325b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin    FarWriteFst(&fst, key, &okey, &nrep, generate_filenames, i,
1335b6dc79427b8f7eeb6a7ff68034ab8548ce670eaAlexander Gutkin                filename_prefix, filename_suffix);
134f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
135f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  return;
136f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}
137f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
138f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
139f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
140f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_EXTENSIONS_FAR_EXTRACT_H__
141