1//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Defines facilities for reading and writing on-disk hash tables.
12///
13//===----------------------------------------------------------------------===//
14#ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
15#define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
16
17#include "llvm/Support/Allocator.h"
18#include "llvm/Support/DataTypes.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/raw_ostream.h"
21#include "llvm/Support/Host.h"
22#include <cassert>
23#include <cstdlib>
24
25namespace clang {
26
27namespace io {
28
29typedef uint32_t Offset;
30
31inline void Emit8(raw_ostream& Out, uint32_t V) {
32  Out << (unsigned char)(V);
33}
34
35inline void Emit16(raw_ostream& Out, uint32_t V) {
36  Out << (unsigned char)(V);
37  Out << (unsigned char)(V >>  8);
38  assert((V >> 16) == 0);
39}
40
41inline void Emit24(raw_ostream& Out, uint32_t V) {
42  Out << (unsigned char)(V);
43  Out << (unsigned char)(V >>  8);
44  Out << (unsigned char)(V >> 16);
45  assert((V >> 24) == 0);
46}
47
48inline void Emit32(raw_ostream& Out, uint32_t V) {
49  Out << (unsigned char)(V);
50  Out << (unsigned char)(V >>  8);
51  Out << (unsigned char)(V >> 16);
52  Out << (unsigned char)(V >> 24);
53}
54
55inline void Emit64(raw_ostream& Out, uint64_t V) {
56  Out << (unsigned char)(V);
57  Out << (unsigned char)(V >>  8);
58  Out << (unsigned char)(V >> 16);
59  Out << (unsigned char)(V >> 24);
60  Out << (unsigned char)(V >> 32);
61  Out << (unsigned char)(V >> 40);
62  Out << (unsigned char)(V >> 48);
63  Out << (unsigned char)(V >> 56);
64}
65
66inline void Pad(raw_ostream& Out, unsigned A) {
67  Offset off = (Offset) Out.tell();
68  for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n)
69    Emit8(Out, 0);
70}
71
72inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
73  uint16_t V = ((uint16_t)Data[0]) |
74               ((uint16_t)Data[1] <<  8);
75  Data += 2;
76  return V;
77}
78
79inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
80  uint32_t V = ((uint32_t)Data[0])  |
81               ((uint32_t)Data[1] << 8)  |
82               ((uint32_t)Data[2] << 16) |
83               ((uint32_t)Data[3] << 24);
84  Data += 4;
85  return V;
86}
87
88inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
89  uint64_t V = ((uint64_t)Data[0])  |
90    ((uint64_t)Data[1] << 8)  |
91    ((uint64_t)Data[2] << 16) |
92    ((uint64_t)Data[3] << 24) |
93    ((uint64_t)Data[4] << 32) |
94    ((uint64_t)Data[5] << 40) |
95    ((uint64_t)Data[6] << 48) |
96    ((uint64_t)Data[7] << 56);
97  Data += 8;
98  return V;
99}
100
101inline uint32_t ReadLE32(const unsigned char *&Data) {
102  // Hosts that directly support little-endian 32-bit loads can just
103  // use them.  Big-endian hosts need a bswap.
104  uint32_t V = *((const uint32_t*)Data);
105  if (llvm::sys::isBigEndianHost())
106    V = llvm::ByteSwap_32(V);
107  Data += 4;
108  return V;
109}
110
111} // end namespace io
112
113template<typename Info>
114class OnDiskChainedHashTableGenerator {
115  unsigned NumBuckets;
116  unsigned NumEntries;
117  llvm::BumpPtrAllocator BA;
118
119  class Item {
120  public:
121    typename Info::key_type key;
122    typename Info::data_type data;
123    Item *next;
124    const uint32_t hash;
125
126    Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
127         Info &InfoObj)
128    : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
129  };
130
131  class Bucket {
132  public:
133    io::Offset off;
134    Item* head;
135    unsigned length;
136
137    Bucket() {}
138  };
139
140  Bucket* Buckets;
141
142private:
143  void insert(Bucket* b, size_t size, Item* E) {
144    unsigned idx = E->hash & (size - 1);
145    Bucket& B = b[idx];
146    E->next = B.head;
147    ++B.length;
148    B.head = E;
149  }
150
151  void resize(size_t newsize) {
152    Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
153    // Populate newBuckets with the old entries.
154    for (unsigned i = 0; i < NumBuckets; ++i)
155      for (Item* E = Buckets[i].head; E ; ) {
156        Item* N = E->next;
157        E->next = 0;
158        insert(newBuckets, newsize, E);
159        E = N;
160      }
161
162    free(Buckets);
163    NumBuckets = newsize;
164    Buckets = newBuckets;
165  }
166
167public:
168
169  void insert(typename Info::key_type_ref key,
170              typename Info::data_type_ref data) {
171    Info InfoObj;
172    insert(key, data, InfoObj);
173  }
174
175  void insert(typename Info::key_type_ref key,
176              typename Info::data_type_ref data, Info &InfoObj) {
177
178    ++NumEntries;
179    if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
180    insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
181                                                               InfoObj));
182  }
183
184  io::Offset Emit(raw_ostream &out) {
185    Info InfoObj;
186    return Emit(out, InfoObj);
187  }
188
189  io::Offset Emit(raw_ostream &out, Info &InfoObj) {
190    using namespace clang::io;
191
192    // Emit the payload of the table.
193    for (unsigned i = 0; i < NumBuckets; ++i) {
194      Bucket& B = Buckets[i];
195      if (!B.head) continue;
196
197      // Store the offset for the data of this bucket.
198      B.off = out.tell();
199      assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
200
201      // Write out the number of items in the bucket.
202      Emit16(out, B.length);
203      assert(B.length != 0  && "Bucket has a head but zero length?");
204
205      // Write out the entries in the bucket.
206      for (Item *I = B.head; I ; I = I->next) {
207        Emit32(out, I->hash);
208        const std::pair<unsigned, unsigned>& Len =
209          InfoObj.EmitKeyDataLength(out, I->key, I->data);
210        InfoObj.EmitKey(out, I->key, Len.first);
211        InfoObj.EmitData(out, I->key, I->data, Len.second);
212      }
213    }
214
215    // Emit the hashtable itself.
216    Pad(out, 4);
217    io::Offset TableOff = out.tell();
218    Emit32(out, NumBuckets);
219    Emit32(out, NumEntries);
220    for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
221
222    return TableOff;
223  }
224
225  OnDiskChainedHashTableGenerator() {
226    NumEntries = 0;
227    NumBuckets = 64;
228    // Note that we do not need to run the constructors of the individual
229    // Bucket objects since 'calloc' returns bytes that are all 0.
230    Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
231  }
232
233  ~OnDiskChainedHashTableGenerator() {
234    std::free(Buckets);
235  }
236};
237
238template<typename Info>
239class OnDiskChainedHashTable {
240  const unsigned NumBuckets;
241  const unsigned NumEntries;
242  const unsigned char* const Buckets;
243  const unsigned char* const Base;
244  Info InfoObj;
245
246public:
247  typedef typename Info::internal_key_type internal_key_type;
248  typedef typename Info::external_key_type external_key_type;
249  typedef typename Info::data_type         data_type;
250
251  OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
252                         const unsigned char* buckets,
253                         const unsigned char* base,
254                         const Info &InfoObj = Info())
255    : NumBuckets(numBuckets), NumEntries(numEntries),
256      Buckets(buckets), Base(base), InfoObj(InfoObj) {
257        assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
258               "'buckets' must have a 4-byte alignment");
259      }
260
261  unsigned getNumBuckets() const { return NumBuckets; }
262  unsigned getNumEntries() const { return NumEntries; }
263  const unsigned char* getBase() const { return Base; }
264  const unsigned char* getBuckets() const { return Buckets; }
265
266  bool isEmpty() const { return NumEntries == 0; }
267
268  class iterator {
269    internal_key_type key;
270    const unsigned char* const data;
271    const unsigned len;
272    Info *InfoObj;
273  public:
274    iterator() : data(0), len(0) {}
275    iterator(const internal_key_type k, const unsigned char* d, unsigned l,
276             Info *InfoObj)
277      : key(k), data(d), len(l), InfoObj(InfoObj) {}
278
279    data_type operator*() const { return InfoObj->ReadData(key, data, len); }
280    bool operator==(const iterator& X) const { return X.data == data; }
281    bool operator!=(const iterator& X) const { return X.data != data; }
282  };
283
284  iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
285    if (!InfoPtr)
286      InfoPtr = &InfoObj;
287
288    using namespace io;
289    const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
290    unsigned key_hash = InfoObj.ComputeHash(iKey);
291
292    // Each bucket is just a 32-bit offset into the hash table file.
293    unsigned idx = key_hash & (NumBuckets - 1);
294    const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
295
296    unsigned offset = ReadLE32(Bucket);
297    if (offset == 0) return iterator(); // Empty bucket.
298    const unsigned char* Items = Base + offset;
299
300    // 'Items' starts with a 16-bit unsigned integer representing the
301    // number of items in this bucket.
302    unsigned len = ReadUnalignedLE16(Items);
303
304    for (unsigned i = 0; i < len; ++i) {
305      // Read the hash.
306      uint32_t item_hash = ReadUnalignedLE32(Items);
307
308      // Determine the length of the key and the data.
309      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
310      unsigned item_len = L.first + L.second;
311
312      // Compare the hashes.  If they are not the same, skip the entry entirely.
313      if (item_hash != key_hash) {
314        Items += item_len;
315        continue;
316      }
317
318      // Read the key.
319      const internal_key_type& X =
320        InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
321
322      // If the key doesn't match just skip reading the value.
323      if (!InfoPtr->EqualKey(X, iKey)) {
324        Items += item_len;
325        continue;
326      }
327
328      // The key matches!
329      return iterator(X, Items + L.first, L.second, InfoPtr);
330    }
331
332    return iterator();
333  }
334
335  iterator end() const { return iterator(); }
336
337  /// \brief Iterates over all of the keys in the table.
338  class key_iterator {
339    const unsigned char* Ptr;
340    unsigned NumItemsInBucketLeft;
341    unsigned NumEntriesLeft;
342    Info *InfoObj;
343  public:
344    typedef external_key_type value_type;
345
346    key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
347                  Info *InfoObj)
348      : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
349        InfoObj(InfoObj) { }
350    key_iterator()
351      : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
352
353    friend bool operator==(const key_iterator &X, const key_iterator &Y) {
354      return X.NumEntriesLeft == Y.NumEntriesLeft;
355    }
356    friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
357      return X.NumEntriesLeft != Y.NumEntriesLeft;
358    }
359
360    key_iterator& operator++() {  // Preincrement
361      if (!NumItemsInBucketLeft) {
362        // 'Items' starts with a 16-bit unsigned integer representing the
363        // number of items in this bucket.
364        NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
365      }
366      Ptr += 4; // Skip the hash.
367      // Determine the length of the key and the data.
368      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
369      Ptr += L.first + L.second;
370      assert(NumItemsInBucketLeft);
371      --NumItemsInBucketLeft;
372      assert(NumEntriesLeft);
373      --NumEntriesLeft;
374      return *this;
375    }
376    key_iterator operator++(int) {  // Postincrement
377      key_iterator tmp = *this; ++*this; return tmp;
378    }
379
380    value_type operator*() const {
381      const unsigned char* LocalPtr = Ptr;
382      if (!NumItemsInBucketLeft)
383        LocalPtr += 2; // number of items in bucket
384      LocalPtr += 4; // Skip the hash.
385
386      // Determine the length of the key and the data.
387      const std::pair<unsigned, unsigned>& L
388        = Info::ReadKeyDataLength(LocalPtr);
389
390      // Read the key.
391      const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
392      return InfoObj->GetExternalKey(Key);
393    }
394  };
395
396  key_iterator key_begin() {
397    return key_iterator(Base + 4, getNumEntries(), &InfoObj);
398  }
399  key_iterator key_end() { return key_iterator(); }
400
401  /// \brief Iterates over all the entries in the table, returning the data.
402  class data_iterator {
403    const unsigned char* Ptr;
404    unsigned NumItemsInBucketLeft;
405    unsigned NumEntriesLeft;
406    Info *InfoObj;
407  public:
408    typedef data_type value_type;
409
410    data_iterator(const unsigned char* const Ptr, unsigned NumEntries,
411                  Info *InfoObj)
412      : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
413        InfoObj(InfoObj) { }
414    data_iterator()
415      : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
416
417    bool operator==(const data_iterator& X) const {
418      return X.NumEntriesLeft == NumEntriesLeft;
419    }
420    bool operator!=(const data_iterator& X) const {
421      return X.NumEntriesLeft != NumEntriesLeft;
422    }
423
424    data_iterator& operator++() {  // Preincrement
425      if (!NumItemsInBucketLeft) {
426        // 'Items' starts with a 16-bit unsigned integer representing the
427        // number of items in this bucket.
428        NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
429      }
430      Ptr += 4; // Skip the hash.
431      // Determine the length of the key and the data.
432      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
433      Ptr += L.first + L.second;
434      assert(NumItemsInBucketLeft);
435      --NumItemsInBucketLeft;
436      assert(NumEntriesLeft);
437      --NumEntriesLeft;
438      return *this;
439    }
440    data_iterator operator++(int) {  // Postincrement
441      data_iterator tmp = *this; ++*this; return tmp;
442    }
443
444    value_type operator*() const {
445      const unsigned char* LocalPtr = Ptr;
446      if (!NumItemsInBucketLeft)
447        LocalPtr += 2; // number of items in bucket
448      LocalPtr += 4; // Skip the hash.
449
450      // Determine the length of the key and the data.
451      const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr);
452
453      // Read the key.
454      const internal_key_type& Key =
455        InfoObj->ReadKey(LocalPtr, L.first);
456      return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
457    }
458  };
459
460  data_iterator data_begin() {
461    return data_iterator(Base + 4, getNumEntries(), &InfoObj);
462  }
463  data_iterator data_end() { return data_iterator(); }
464
465  Info &getInfoObj() { return InfoObj; }
466
467  static OnDiskChainedHashTable* Create(const unsigned char* buckets,
468                                        const unsigned char* const base,
469                                        const Info &InfoObj = Info()) {
470    using namespace io;
471    assert(buckets > base);
472    assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
473           "buckets should be 4-byte aligned.");
474
475    unsigned numBuckets = ReadLE32(buckets);
476    unsigned numEntries = ReadLE32(buckets);
477    return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
478                                            base, InfoObj);
479  }
480};
481
482} // end namespace clang
483
484#endif
485