1//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Defines facilities for reading and writing on-disk hash tables.
12///
13//===----------------------------------------------------------------------===//
14#ifndef LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
15#define LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
16
17#include "llvm/Support/Allocator.h"
18#include "llvm/Support/AlignOf.h"
19#include "llvm/Support/DataTypes.h"
20#include "llvm/Support/EndianStream.h"
21#include "llvm/Support/Host.h"
22#include "llvm/Support/MathExtras.h"
23#include "llvm/Support/raw_ostream.h"
24#include <cassert>
25#include <cstdlib>
26
27namespace llvm {
28
29/// \brief Generates an on disk hash table.
30///
31/// This needs an \c Info that handles storing values into the hash table's
32/// payload and computes the hash for a given key. This should provide the
33/// following interface:
34///
35/// \code
36/// class ExampleInfo {
37/// public:
38///   typedef ExampleKey key_type;   // Must be copy constructible
39///   typedef ExampleKey &key_type_ref;
40///   typedef ExampleData data_type; // Must be copy constructible
41///   typedef ExampleData &data_type_ref;
42///   typedef uint32_t hash_value_type; // The type the hash function returns.
43///   typedef uint32_t offset_type; // The type for offsets into the table.
44///
45///   /// Calculate the hash for Key
46///   static hash_value_type ComputeHash(key_type_ref Key);
47///   /// Return the lengths, in bytes, of the given Key/Data pair.
48///   static std::pair<offset_type, offset_type>
49///   EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data);
50///   /// Write Key to Out.  KeyLen is the length from EmitKeyDataLength.
51///   static void EmitKey(raw_ostream &Out, key_type_ref Key,
52///                       offset_type KeyLen);
53///   /// Write Data to Out.  DataLen is the length from EmitKeyDataLength.
54///   static void EmitData(raw_ostream &Out, key_type_ref Key,
55///                        data_type_ref Data, offset_type DataLen);
56/// };
57/// \endcode
58template <typename Info> class OnDiskChainedHashTableGenerator {
59  /// \brief A single item in the hash table.
60  class Item {
61  public:
62    typename Info::key_type Key;
63    typename Info::data_type Data;
64    Item *Next;
65    const typename Info::hash_value_type Hash;
66
67    Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data,
68         Info &InfoObj)
69        : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {}
70  };
71
72  typedef typename Info::offset_type offset_type;
73  offset_type NumBuckets;
74  offset_type NumEntries;
75  llvm::SpecificBumpPtrAllocator<Item> BA;
76
77  /// \brief A linked list of values in a particular hash bucket.
78  class Bucket {
79  public:
80    offset_type Off;
81    Item *Head;
82    unsigned Length;
83
84    Bucket() {}
85  };
86
87  Bucket *Buckets;
88
89private:
90  /// \brief Insert an item into the appropriate hash bucket.
91  void insert(Bucket *Buckets, size_t Size, Item *E) {
92    Bucket &B = Buckets[E->Hash & (Size - 1)];
93    E->Next = B.Head;
94    ++B.Length;
95    B.Head = E;
96  }
97
98  /// \brief Resize the hash table, moving the old entries into the new buckets.
99  void resize(size_t NewSize) {
100    Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
101    // Populate NewBuckets with the old entries.
102    for (size_t I = 0; I < NumBuckets; ++I)
103      for (Item *E = Buckets[I].Head; E;) {
104        Item *N = E->Next;
105        E->Next = nullptr;
106        insert(NewBuckets, NewSize, E);
107        E = N;
108      }
109
110    free(Buckets);
111    NumBuckets = NewSize;
112    Buckets = NewBuckets;
113  }
114
115public:
116  /// \brief Insert an entry into the table.
117  void insert(typename Info::key_type_ref Key,
118              typename Info::data_type_ref Data) {
119    Info InfoObj;
120    insert(Key, Data, InfoObj);
121  }
122
123  /// \brief Insert an entry into the table.
124  ///
125  /// Uses the provided Info instead of a stack allocated one.
126  void insert(typename Info::key_type_ref Key,
127              typename Info::data_type_ref Data, Info &InfoObj) {
128
129    ++NumEntries;
130    if (4 * NumEntries >= 3 * NumBuckets)
131      resize(NumBuckets * 2);
132    insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj));
133  }
134
135  /// \brief Emit the table to Out, which must not be at offset 0.
136  offset_type Emit(raw_ostream &Out) {
137    Info InfoObj;
138    return Emit(Out, InfoObj);
139  }
140
141  /// \brief Emit the table to Out, which must not be at offset 0.
142  ///
143  /// Uses the provided Info instead of a stack allocated one.
144  offset_type Emit(raw_ostream &Out, Info &InfoObj) {
145    using namespace llvm::support;
146    endian::Writer<little> LE(Out);
147
148    // Emit the payload of the table.
149    for (offset_type I = 0; I < NumBuckets; ++I) {
150      Bucket &B = Buckets[I];
151      if (!B.Head)
152        continue;
153
154      // Store the offset for the data of this bucket.
155      B.Off = Out.tell();
156      assert(B.Off && "Cannot write a bucket at offset 0. Please add padding.");
157
158      // Write out the number of items in the bucket.
159      LE.write<uint16_t>(B.Length);
160      assert(B.Length != 0 && "Bucket has a head but zero length?");
161
162      // Write out the entries in the bucket.
163      for (Item *I = B.Head; I; I = I->Next) {
164        LE.write<typename Info::hash_value_type>(I->Hash);
165        const std::pair<offset_type, offset_type> &Len =
166            InfoObj.EmitKeyDataLength(Out, I->Key, I->Data);
167        InfoObj.EmitKey(Out, I->Key, Len.first);
168        InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
169      }
170    }
171
172    // Pad with zeros so that we can start the hashtable at an aligned address.
173    offset_type TableOff = Out.tell();
174    uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf<offset_type>());
175    TableOff += N;
176    while (N--)
177      LE.write<uint8_t>(0);
178
179    // Emit the hashtable itself.
180    LE.write<offset_type>(NumBuckets);
181    LE.write<offset_type>(NumEntries);
182    for (offset_type I = 0; I < NumBuckets; ++I)
183      LE.write<offset_type>(Buckets[I].Off);
184
185    return TableOff;
186  }
187
188  OnDiskChainedHashTableGenerator() {
189    NumEntries = 0;
190    NumBuckets = 64;
191    // Note that we do not need to run the constructors of the individual
192    // Bucket objects since 'calloc' returns bytes that are all 0.
193    Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
194  }
195
196  ~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
197};
198
199/// \brief Provides lookup on an on disk hash table.
200///
201/// This needs an \c Info that handles reading values from the hash table's
202/// payload and computes the hash for a given key. This should provide the
203/// following interface:
204///
205/// \code
206/// class ExampleLookupInfo {
207/// public:
208///   typedef ExampleData data_type;
209///   typedef ExampleInternalKey internal_key_type; // The stored key type.
210///   typedef ExampleKey external_key_type; // The type to pass to find().
211///   typedef uint32_t hash_value_type; // The type the hash function returns.
212///   typedef uint32_t offset_type; // The type for offsets into the table.
213///
214///   /// Compare two keys for equality.
215///   static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2);
216///   /// Calculate the hash for the given key.
217///   static hash_value_type ComputeHash(internal_key_type &IKey);
218///   /// Translate from the semantic type of a key in the hash table to the
219///   /// type that is actually stored and used for hashing and comparisons.
220///   /// The internal and external types are often the same, in which case this
221///   /// can simply return the passed in value.
222///   static const internal_key_type &GetInternalKey(external_key_type &EKey);
223///   /// Read the key and data length from Buffer, leaving it pointing at the
224///   /// following byte.
225///   static std::pair<offset_type, offset_type>
226///   ReadKeyDataLength(const unsigned char *&Buffer);
227///   /// Read the key from Buffer, given the KeyLen as reported from
228///   /// ReadKeyDataLength.
229///   const internal_key_type &ReadKey(const unsigned char *Buffer,
230///                                    offset_type KeyLen);
231///   /// Read the data for Key from Buffer, given the DataLen as reported from
232///   /// ReadKeyDataLength.
233///   data_type ReadData(StringRef Key, const unsigned char *Buffer,
234///                      offset_type DataLen);
235/// };
236/// \endcode
237template <typename Info> class OnDiskChainedHashTable {
238  const typename Info::offset_type NumBuckets;
239  const typename Info::offset_type NumEntries;
240  const unsigned char *const Buckets;
241  const unsigned char *const Base;
242  Info InfoObj;
243
244public:
245  typedef typename Info::internal_key_type internal_key_type;
246  typedef typename Info::external_key_type external_key_type;
247  typedef typename Info::data_type         data_type;
248  typedef typename Info::hash_value_type   hash_value_type;
249  typedef typename Info::offset_type       offset_type;
250
251  OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
252                         const unsigned char *Buckets,
253                         const unsigned char *Base,
254                         const Info &InfoObj = Info())
255      : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets),
256        Base(Base), InfoObj(InfoObj) {
257    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
258           "'buckets' must have a 4-byte alignment");
259  }
260
261  offset_type getNumBuckets() const { return NumBuckets; }
262  offset_type getNumEntries() const { return NumEntries; }
263  const unsigned char *getBase() const { return Base; }
264  const unsigned char *getBuckets() const { return Buckets; }
265
266  bool isEmpty() const { return NumEntries == 0; }
267
268  class iterator {
269    internal_key_type Key;
270    const unsigned char *const Data;
271    const offset_type Len;
272    Info *InfoObj;
273
274  public:
275    iterator() : Data(nullptr), Len(0) {}
276    iterator(const internal_key_type K, const unsigned char *D, offset_type L,
277             Info *InfoObj)
278        : Key(K), Data(D), Len(L), InfoObj(InfoObj) {}
279
280    data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); }
281    bool operator==(const iterator &X) const { return X.Data == Data; }
282    bool operator!=(const iterator &X) const { return X.Data != Data; }
283  };
284
285  /// \brief Look up the stored data for a particular key.
286  iterator find(const external_key_type &EKey, Info *InfoPtr = 0) {
287    if (!InfoPtr)
288      InfoPtr = &InfoObj;
289
290    using namespace llvm::support;
291    const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
292    hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
293
294    // Each bucket is just an offset into the hash table file.
295    offset_type Idx = KeyHash & (NumBuckets - 1);
296    const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx;
297
298    offset_type Offset = endian::readNext<offset_type, little, aligned>(Bucket);
299    if (Offset == 0)
300      return iterator(); // Empty bucket.
301    const unsigned char *Items = Base + Offset;
302
303    // 'Items' starts with a 16-bit unsigned integer representing the
304    // number of items in this bucket.
305    unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items);
306
307    for (unsigned i = 0; i < Len; ++i) {
308      // Read the hash.
309      hash_value_type ItemHash =
310          endian::readNext<hash_value_type, little, unaligned>(Items);
311
312      // Determine the length of the key and the data.
313      const std::pair<offset_type, offset_type> &L =
314          Info::ReadKeyDataLength(Items);
315      offset_type ItemLen = L.first + L.second;
316
317      // Compare the hashes.  If they are not the same, skip the entry entirely.
318      if (ItemHash != KeyHash) {
319        Items += ItemLen;
320        continue;
321      }
322
323      // Read the key.
324      const internal_key_type &X =
325          InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
326
327      // If the key doesn't match just skip reading the value.
328      if (!InfoPtr->EqualKey(X, IKey)) {
329        Items += ItemLen;
330        continue;
331      }
332
333      // The key matches!
334      return iterator(X, Items + L.first, L.second, InfoPtr);
335    }
336
337    return iterator();
338  }
339
340  iterator end() const { return iterator(); }
341
342  Info &getInfoObj() { return InfoObj; }
343
344  /// \brief Create the hash table.
345  ///
346  /// \param Buckets is the beginning of the hash table itself, which follows
347  /// the payload of entire structure. This is the value returned by
348  /// OnDiskHashTableGenerator::Emit.
349  ///
350  /// \param Base is the point from which all offsets into the structure are
351  /// based. This is offset 0 in the stream that was used when Emitting the
352  /// table.
353  static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
354                                        const unsigned char *const Base,
355                                        const Info &InfoObj = Info()) {
356    using namespace llvm::support;
357    assert(Buckets > Base);
358    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
359           "buckets should be 4-byte aligned.");
360
361    offset_type NumBuckets =
362        endian::readNext<offset_type, little, aligned>(Buckets);
363    offset_type NumEntries =
364        endian::readNext<offset_type, little, aligned>(Buckets);
365    return new OnDiskChainedHashTable<Info>(NumBuckets, NumEntries, Buckets,
366                                            Base, InfoObj);
367  }
368};
369
370/// \brief Provides lookup and iteration over an on disk hash table.
371///
372/// \copydetails llvm::OnDiskChainedHashTable
373template <typename Info>
374class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
375  const unsigned char *Payload;
376
377public:
378  typedef OnDiskChainedHashTable<Info>          base_type;
379  typedef typename base_type::internal_key_type internal_key_type;
380  typedef typename base_type::external_key_type external_key_type;
381  typedef typename base_type::data_type         data_type;
382  typedef typename base_type::hash_value_type   hash_value_type;
383  typedef typename base_type::offset_type       offset_type;
384
385  OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
386                                 const unsigned char *Buckets,
387                                 const unsigned char *Payload,
388                                 const unsigned char *Base,
389                                 const Info &InfoObj = Info())
390      : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
391        Payload(Payload) {}
392
393  /// \brief Iterates over all of the keys in the table.
394  class key_iterator {
395    const unsigned char *Ptr;
396    offset_type NumItemsInBucketLeft;
397    offset_type NumEntriesLeft;
398    Info *InfoObj;
399
400  public:
401    typedef external_key_type value_type;
402
403    key_iterator(const unsigned char *const Ptr, offset_type NumEntries,
404                 Info *InfoObj)
405        : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
406          InfoObj(InfoObj) {}
407    key_iterator()
408        : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
409          InfoObj(0) {}
410
411    friend bool operator==(const key_iterator &X, const key_iterator &Y) {
412      return X.NumEntriesLeft == Y.NumEntriesLeft;
413    }
414    friend bool operator!=(const key_iterator &X, const key_iterator &Y) {
415      return X.NumEntriesLeft != Y.NumEntriesLeft;
416    }
417
418    key_iterator &operator++() { // Preincrement
419      using namespace llvm::support;
420      if (!NumItemsInBucketLeft) {
421        // 'Items' starts with a 16-bit unsigned integer representing the
422        // number of items in this bucket.
423        NumItemsInBucketLeft =
424            endian::readNext<uint16_t, little, unaligned>(Ptr);
425      }
426      Ptr += sizeof(hash_value_type); // Skip the hash.
427      // Determine the length of the key and the data.
428      const std::pair<offset_type, offset_type> &L =
429          Info::ReadKeyDataLength(Ptr);
430      Ptr += L.first + L.second;
431      assert(NumItemsInBucketLeft);
432      --NumItemsInBucketLeft;
433      assert(NumEntriesLeft);
434      --NumEntriesLeft;
435      return *this;
436    }
437    key_iterator operator++(int) { // Postincrement
438      key_iterator tmp = *this; ++*this; return tmp;
439    }
440
441    value_type operator*() const {
442      const unsigned char *LocalPtr = Ptr;
443      if (!NumItemsInBucketLeft)
444        LocalPtr += 2; // number of items in bucket
445      LocalPtr += sizeof(hash_value_type); // Skip the hash.
446
447      // Determine the length of the key and the data.
448      const std::pair<offset_type, offset_type> &L =
449          Info::ReadKeyDataLength(LocalPtr);
450
451      // Read the key.
452      const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
453      return InfoObj->GetExternalKey(Key);
454    }
455  };
456
457  key_iterator key_begin() {
458    return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
459  }
460  key_iterator key_end() { return key_iterator(); }
461
462  iterator_range<key_iterator> keys() {
463    return make_range(key_begin(), key_end());
464  }
465
466  /// \brief Iterates over all the entries in the table, returning the data.
467  class data_iterator {
468    const unsigned char *Ptr;
469    offset_type NumItemsInBucketLeft;
470    offset_type NumEntriesLeft;
471    Info *InfoObj;
472
473  public:
474    typedef data_type value_type;
475
476    data_iterator(const unsigned char *const Ptr, offset_type NumEntries,
477                  Info *InfoObj)
478        : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
479          InfoObj(InfoObj) {}
480    data_iterator()
481        : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
482          InfoObj(nullptr) {}
483
484    bool operator==(const data_iterator &X) const {
485      return X.NumEntriesLeft == NumEntriesLeft;
486    }
487    bool operator!=(const data_iterator &X) const {
488      return X.NumEntriesLeft != NumEntriesLeft;
489    }
490
491    data_iterator &operator++() { // Preincrement
492      using namespace llvm::support;
493      if (!NumItemsInBucketLeft) {
494        // 'Items' starts with a 16-bit unsigned integer representing the
495        // number of items in this bucket.
496        NumItemsInBucketLeft =
497            endian::readNext<uint16_t, little, unaligned>(Ptr);
498      }
499      Ptr += sizeof(hash_value_type); // Skip the hash.
500      // Determine the length of the key and the data.
501      const std::pair<offset_type, offset_type> &L =
502          Info::ReadKeyDataLength(Ptr);
503      Ptr += L.first + L.second;
504      assert(NumItemsInBucketLeft);
505      --NumItemsInBucketLeft;
506      assert(NumEntriesLeft);
507      --NumEntriesLeft;
508      return *this;
509    }
510    data_iterator operator++(int) { // Postincrement
511      data_iterator tmp = *this; ++*this; return tmp;
512    }
513
514    value_type operator*() const {
515      const unsigned char *LocalPtr = Ptr;
516      if (!NumItemsInBucketLeft)
517        LocalPtr += 2; // number of items in bucket
518      LocalPtr += sizeof(hash_value_type); // Skip the hash.
519
520      // Determine the length of the key and the data.
521      const std::pair<offset_type, offset_type> &L =
522          Info::ReadKeyDataLength(LocalPtr);
523
524      // Read the key.
525      const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
526      return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
527    }
528  };
529
530  data_iterator data_begin() {
531    return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
532  }
533  data_iterator data_end() { return data_iterator(); }
534
535  iterator_range<data_iterator> data() {
536    return make_range(data_begin(), data_end());
537  }
538
539  /// \brief Create the hash table.
540  ///
541  /// \param Buckets is the beginning of the hash table itself, which follows
542  /// the payload of entire structure. This is the value returned by
543  /// OnDiskHashTableGenerator::Emit.
544  ///
545  /// \param Payload is the beginning of the data contained in the table.  This
546  /// is Base plus any padding or header data that was stored, ie, the offset
547  /// that the stream was at when calling Emit.
548  ///
549  /// \param Base is the point from which all offsets into the structure are
550  /// based. This is offset 0 in the stream that was used when Emitting the
551  /// table.
552  static OnDiskIterableChainedHashTable *
553  Create(const unsigned char *Buckets, const unsigned char *const Payload,
554         const unsigned char *const Base, const Info &InfoObj = Info()) {
555    using namespace llvm::support;
556    assert(Buckets > Base);
557    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
558           "buckets should be 4-byte aligned.");
559
560    offset_type NumBuckets =
561        endian::readNext<offset_type, little, aligned>(Buckets);
562    offset_type NumEntries =
563        endian::readNext<offset_type, little, aligned>(Buckets);
564    return new OnDiskIterableChainedHashTable<Info>(
565        NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj);
566  }
567};
568
569} // end namespace llvm
570
571#endif // LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
572