1f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// collection.h
2f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
3f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Licensed under the Apache License, Version 2.0 (the "License");
4f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// you may not use this file except in compliance with the License.
5f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// You may obtain a copy of the License at
6f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
7f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//     http://www.apache.org/licenses/LICENSE-2.0
8f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
9f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Unless required by applicable law or agreed to in writing, software
10f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// distributed under the License is distributed on an "AS IS" BASIS,
11f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// See the License for the specific language governing permissions and
13f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// limitations under the License.
14f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
15f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Copyright 2005-2010 Google, Inc.
16f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// Author: riley@google.com (Michael Riley)
17f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson//
18f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson// \file
19dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Class to store a collection of ordered (multi-)sets with elements of type T.
20f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
21f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#ifndef FST_EXTENSIONS_PDT_COLLECTION_H__
22f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#define FST_EXTENSIONS_PDT_COLLECTION_H__
23f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
24f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <algorithm>
25f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <vector>
26f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonusing std::vector;
27f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
28f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#include <fst/bi-table.h>
29f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
30f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonnamespace fst {
31f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
32dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// Stores a collection of non-empty, ordered (multi-)sets with elements
33dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// of type T. A default constructor, equality ==, and an STL-style
34dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// hash class must be defined on the elements. Provides signed integer
35dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// ID (of type I) of each unique set. The IDs are allocated starting
36dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin// from 0 in order.
37f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class I, class T>
38f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsonclass Collection {
39f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson public:
40f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  struct Node {  // Trie node
41f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    I node_id;   // Root is kNoNodeId;
42f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    T element;
43f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
44f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    Node() : node_id(kNoNodeId), element(T()) {}
45f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    Node(I i, const T &t) : node_id(i), element(t) {}
46f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
47f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    bool operator==(const Node& n) const {
48f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return n.node_id == node_id && n.element == element;
49f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
50f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  };
51f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
52f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  struct NodeHash {
53f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    size_t operator()(const Node &n) const {
54f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return n.node_id + hash_(n.element) * kPrime;
55f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
56f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  };
57f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
58f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  typedef CompactHashBiTable<I, Node, NodeHash> NodeTable;
59f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
60f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  class SetIterator {
61f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson   public:
62f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    SetIterator(I id, Node node, NodeTable *node_table)
63f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        :id_(id), node_(node), node_table_(node_table) {}
64f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
65f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    bool Done() const { return id_ == kNoNodeId; }
66f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
67f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    const T &Element() const { return node_.element; }
68f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
69f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    void Next() {
70f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      id_ = node_.node_id;
71f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      if (id_ != kNoNodeId)
72f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson        node_ = node_table_->FindEntry(id_);
73f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
74f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
75f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson   private:
76f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    I id_;                     // Iterator set node id
77f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    Node node_;                // Iterator set node
78f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    NodeTable *node_table_;
79f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  };
80f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
81f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  Collection() {}
82f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
83dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // Lookups integer ID from ordered multi-set. If it doesn't exist
84dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // and 'insert' is true, then adds it. Otherwise returns -1.
85dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  I FindId(const vector<T> &set, bool insert = true) {
86f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    I node_id = kNoNodeId;
87f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    for (ssize_t i = set.size() - 1; i >= 0; --i) {
88f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      Node node(node_id, set[i]);
89dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      node_id = node_table_.FindId(node, insert);
90dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin      if (node_id == -1) break;
91f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
92f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    return node_id;
93f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
94f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
95dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // Finds ordered (multi-)set given integer ID. Returns set iterator
96dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  // to traverse result.
97f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  SetIterator FindSet(I id) {
98dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin    if (id < 0 || id >= node_table_.Size()) {
99f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return SetIterator(kNoNodeId, Node(kNoNodeId, T()), &node_table_);
100f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    } else {
101f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson      return SetIterator(id, node_table_.FindEntry(id), &node_table_);
102f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson    }
103f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  }
104f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
105dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin  I Size() const { return node_table_.Size(); }
106dfd8b8327b93660601d016cdc6f29f433b45a8d8Alexander Gutkin
107f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson private:
108f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static const I kNoNodeId;
109f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  static const size_t kPrime;
1103da1eb108d36da35333b2d655202791af854996bPrzemyslaw Szczepaniak  static std::tr1::hash<T> hash_;
111f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
112f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  NodeTable node_table_;
113f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
114f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson  DISALLOW_COPY_AND_ASSIGN(Collection);
115f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson};
116f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
117f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate<class I, class T> const I Collection<I, T>::kNoNodeId = -1;
118f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
119f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodsontemplate <class I, class T> const size_t Collection<I, T>::kPrime = 7853;
120f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
1213da1eb108d36da35333b2d655202791af854996bPrzemyslaw Szczepaniaktemplate <class I, class T> std::tr1::hash<T> Collection<I, T>::hash_;
122f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
123f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson}  // namespace fst
124f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson
125f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2Ian Hodson#endif  // FST_EXTENSIONS_PDT_COLLECTION_H__
126