1//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Defines facilities for reading and writing on-disk hash tables.
12///
13//===----------------------------------------------------------------------===//
14#ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
15#define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
16
17#include "clang/Basic/LLVM.h"
18#include "llvm/Support/Allocator.h"
19#include "llvm/Support/DataTypes.h"
20#include "llvm/Support/Host.h"
21#include "llvm/Support/MathExtras.h"
22#include "llvm/Support/raw_ostream.h"
23#include <cassert>
24#include <cstdlib>
25
26namespace clang {
27
28namespace io {
29
30typedef uint32_t Offset;
31
32inline void Emit8(raw_ostream& Out, uint32_t V) {
33  Out << (unsigned char)(V);
34}
35
36inline void Emit16(raw_ostream& Out, uint32_t V) {
37  Out << (unsigned char)(V);
38  Out << (unsigned char)(V >>  8);
39  assert((V >> 16) == 0);
40}
41
42inline void Emit24(raw_ostream& Out, uint32_t V) {
43  Out << (unsigned char)(V);
44  Out << (unsigned char)(V >>  8);
45  Out << (unsigned char)(V >> 16);
46  assert((V >> 24) == 0);
47}
48
49inline void Emit32(raw_ostream& Out, uint32_t V) {
50  Out << (unsigned char)(V);
51  Out << (unsigned char)(V >>  8);
52  Out << (unsigned char)(V >> 16);
53  Out << (unsigned char)(V >> 24);
54}
55
56inline void Emit64(raw_ostream& Out, uint64_t V) {
57  Out << (unsigned char)(V);
58  Out << (unsigned char)(V >>  8);
59  Out << (unsigned char)(V >> 16);
60  Out << (unsigned char)(V >> 24);
61  Out << (unsigned char)(V >> 32);
62  Out << (unsigned char)(V >> 40);
63  Out << (unsigned char)(V >> 48);
64  Out << (unsigned char)(V >> 56);
65}
66
67inline void Pad(raw_ostream& Out, unsigned A) {
68  Offset off = (Offset) Out.tell();
69  for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n)
70    Emit8(Out, 0);
71}
72
73inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
74  uint16_t V = ((uint16_t)Data[0]) |
75               ((uint16_t)Data[1] <<  8);
76  Data += 2;
77  return V;
78}
79
80inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
81  uint32_t V = ((uint32_t)Data[0])  |
82               ((uint32_t)Data[1] << 8)  |
83               ((uint32_t)Data[2] << 16) |
84               ((uint32_t)Data[3] << 24);
85  Data += 4;
86  return V;
87}
88
89inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
90  uint64_t V = ((uint64_t)Data[0])  |
91    ((uint64_t)Data[1] << 8)  |
92    ((uint64_t)Data[2] << 16) |
93    ((uint64_t)Data[3] << 24) |
94    ((uint64_t)Data[4] << 32) |
95    ((uint64_t)Data[5] << 40) |
96    ((uint64_t)Data[6] << 48) |
97    ((uint64_t)Data[7] << 56);
98  Data += 8;
99  return V;
100}
101
102inline uint32_t ReadLE32(const unsigned char *&Data) {
103  // Hosts that directly support little-endian 32-bit loads can just
104  // use them.  Big-endian hosts need a bswap.
105  uint32_t V = *((const uint32_t*)Data);
106  if (llvm::sys::IsBigEndianHost)
107    V = llvm::ByteSwap_32(V);
108  Data += 4;
109  return V;
110}
111
112} // end namespace io
113
114template<typename Info>
115class OnDiskChainedHashTableGenerator {
116  unsigned NumBuckets;
117  unsigned NumEntries;
118  llvm::BumpPtrAllocator BA;
119
120  class Item {
121  public:
122    typename Info::key_type key;
123    typename Info::data_type data;
124    Item *next;
125    const uint32_t hash;
126
127    Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
128         Info &InfoObj)
129    : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
130  };
131
132  class Bucket {
133  public:
134    io::Offset off;
135    Item* head;
136    unsigned length;
137
138    Bucket() {}
139  };
140
141  Bucket* Buckets;
142
143private:
144  void insert(Bucket* b, size_t size, Item* E) {
145    unsigned idx = E->hash & (size - 1);
146    Bucket& B = b[idx];
147    E->next = B.head;
148    ++B.length;
149    B.head = E;
150  }
151
152  void resize(size_t newsize) {
153    Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
154    // Populate newBuckets with the old entries.
155    for (unsigned i = 0; i < NumBuckets; ++i)
156      for (Item* E = Buckets[i].head; E ; ) {
157        Item* N = E->next;
158        E->next = 0;
159        insert(newBuckets, newsize, E);
160        E = N;
161      }
162
163    free(Buckets);
164    NumBuckets = newsize;
165    Buckets = newBuckets;
166  }
167
168public:
169
170  void insert(typename Info::key_type_ref key,
171              typename Info::data_type_ref data) {
172    Info InfoObj;
173    insert(key, data, InfoObj);
174  }
175
176  void insert(typename Info::key_type_ref key,
177              typename Info::data_type_ref data, Info &InfoObj) {
178
179    ++NumEntries;
180    if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
181    insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
182                                                               InfoObj));
183  }
184
185  io::Offset Emit(raw_ostream &out) {
186    Info InfoObj;
187    return Emit(out, InfoObj);
188  }
189
190  io::Offset Emit(raw_ostream &out, Info &InfoObj) {
191    using namespace clang::io;
192
193    // Emit the payload of the table.
194    for (unsigned i = 0; i < NumBuckets; ++i) {
195      Bucket& B = Buckets[i];
196      if (!B.head) continue;
197
198      // Store the offset for the data of this bucket.
199      B.off = out.tell();
200      assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
201
202      // Write out the number of items in the bucket.
203      Emit16(out, B.length);
204      assert(B.length != 0  && "Bucket has a head but zero length?");
205
206      // Write out the entries in the bucket.
207      for (Item *I = B.head; I ; I = I->next) {
208        Emit32(out, I->hash);
209        const std::pair<unsigned, unsigned>& Len =
210          InfoObj.EmitKeyDataLength(out, I->key, I->data);
211        InfoObj.EmitKey(out, I->key, Len.first);
212        InfoObj.EmitData(out, I->key, I->data, Len.second);
213      }
214    }
215
216    // Emit the hashtable itself.
217    Pad(out, 4);
218    io::Offset TableOff = out.tell();
219    Emit32(out, NumBuckets);
220    Emit32(out, NumEntries);
221    for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
222
223    return TableOff;
224  }
225
226  OnDiskChainedHashTableGenerator() {
227    NumEntries = 0;
228    NumBuckets = 64;
229    // Note that we do not need to run the constructors of the individual
230    // Bucket objects since 'calloc' returns bytes that are all 0.
231    Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
232  }
233
234  ~OnDiskChainedHashTableGenerator() {
235    std::free(Buckets);
236  }
237};
238
239template<typename Info>
240class OnDiskChainedHashTable {
241  const unsigned NumBuckets;
242  const unsigned NumEntries;
243  const unsigned char* const Buckets;
244  const unsigned char* const Base;
245  Info InfoObj;
246
247public:
248  typedef typename Info::internal_key_type internal_key_type;
249  typedef typename Info::external_key_type external_key_type;
250  typedef typename Info::data_type         data_type;
251
252  OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
253                         const unsigned char* buckets,
254                         const unsigned char* base,
255                         const Info &InfoObj = Info())
256    : NumBuckets(numBuckets), NumEntries(numEntries),
257      Buckets(buckets), Base(base), InfoObj(InfoObj) {
258        assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
259               "'buckets' must have a 4-byte alignment");
260      }
261
262  unsigned getNumBuckets() const { return NumBuckets; }
263  unsigned getNumEntries() const { return NumEntries; }
264  const unsigned char* getBase() const { return Base; }
265  const unsigned char* getBuckets() const { return Buckets; }
266
267  bool isEmpty() const { return NumEntries == 0; }
268
269  class iterator {
270    internal_key_type key;
271    const unsigned char* const data;
272    const unsigned len;
273    Info *InfoObj;
274  public:
275    iterator() : data(0), len(0) {}
276    iterator(const internal_key_type k, const unsigned char* d, unsigned l,
277             Info *InfoObj)
278      : key(k), data(d), len(l), InfoObj(InfoObj) {}
279
280    data_type operator*() const { return InfoObj->ReadData(key, data, len); }
281    bool operator==(const iterator& X) const { return X.data == data; }
282    bool operator!=(const iterator& X) const { return X.data != data; }
283  };
284
285  iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
286    if (!InfoPtr)
287      InfoPtr = &InfoObj;
288
289    using namespace io;
290    const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
291    unsigned key_hash = InfoObj.ComputeHash(iKey);
292
293    // Each bucket is just a 32-bit offset into the hash table file.
294    unsigned idx = key_hash & (NumBuckets - 1);
295    const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
296
297    unsigned offset = ReadLE32(Bucket);
298    if (offset == 0) return iterator(); // Empty bucket.
299    const unsigned char* Items = Base + offset;
300
301    // 'Items' starts with a 16-bit unsigned integer representing the
302    // number of items in this bucket.
303    unsigned len = ReadUnalignedLE16(Items);
304
305    for (unsigned i = 0; i < len; ++i) {
306      // Read the hash.
307      uint32_t item_hash = ReadUnalignedLE32(Items);
308
309      // Determine the length of the key and the data.
310      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
311      unsigned item_len = L.first + L.second;
312
313      // Compare the hashes.  If they are not the same, skip the entry entirely.
314      if (item_hash != key_hash) {
315        Items += item_len;
316        continue;
317      }
318
319      // Read the key.
320      const internal_key_type& X =
321        InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
322
323      // If the key doesn't match just skip reading the value.
324      if (!InfoPtr->EqualKey(X, iKey)) {
325        Items += item_len;
326        continue;
327      }
328
329      // The key matches!
330      return iterator(X, Items + L.first, L.second, InfoPtr);
331    }
332
333    return iterator();
334  }
335
336  iterator end() const { return iterator(); }
337
338  /// \brief Iterates over all of the keys in the table.
339  class key_iterator {
340    const unsigned char* Ptr;
341    unsigned NumItemsInBucketLeft;
342    unsigned NumEntriesLeft;
343    Info *InfoObj;
344  public:
345    typedef external_key_type value_type;
346
347    key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
348                  Info *InfoObj)
349      : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
350        InfoObj(InfoObj) { }
351    key_iterator()
352      : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
353
354    friend bool operator==(const key_iterator &X, const key_iterator &Y) {
355      return X.NumEntriesLeft == Y.NumEntriesLeft;
356    }
357    friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
358      return X.NumEntriesLeft != Y.NumEntriesLeft;
359    }
360
361    key_iterator& operator++() {  // Preincrement
362      if (!NumItemsInBucketLeft) {
363        // 'Items' starts with a 16-bit unsigned integer representing the
364        // number of items in this bucket.
365        NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
366      }
367      Ptr += 4; // Skip the hash.
368      // Determine the length of the key and the data.
369      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
370      Ptr += L.first + L.second;
371      assert(NumItemsInBucketLeft);
372      --NumItemsInBucketLeft;
373      assert(NumEntriesLeft);
374      --NumEntriesLeft;
375      return *this;
376    }
377    key_iterator operator++(int) {  // Postincrement
378      key_iterator tmp = *this; ++*this; return tmp;
379    }
380
381    value_type operator*() const {
382      const unsigned char* LocalPtr = Ptr;
383      if (!NumItemsInBucketLeft)
384        LocalPtr += 2; // number of items in bucket
385      LocalPtr += 4; // Skip the hash.
386
387      // Determine the length of the key and the data.
388      const std::pair<unsigned, unsigned>& L
389        = Info::ReadKeyDataLength(LocalPtr);
390
391      // Read the key.
392      const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
393      return InfoObj->GetExternalKey(Key);
394    }
395  };
396
397  key_iterator key_begin() {
398    return key_iterator(Base + 4, getNumEntries(), &InfoObj);
399  }
400  key_iterator key_end() { return key_iterator(); }
401
402  /// \brief Iterates over all the entries in the table, returning the data.
403  class data_iterator {
404    const unsigned char* Ptr;
405    unsigned NumItemsInBucketLeft;
406    unsigned NumEntriesLeft;
407    Info *InfoObj;
408  public:
409    typedef data_type value_type;
410
411    data_iterator(const unsigned char* const Ptr, unsigned NumEntries,
412                  Info *InfoObj)
413      : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
414        InfoObj(InfoObj) { }
415    data_iterator()
416      : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
417
418    bool operator==(const data_iterator& X) const {
419      return X.NumEntriesLeft == NumEntriesLeft;
420    }
421    bool operator!=(const data_iterator& X) const {
422      return X.NumEntriesLeft != NumEntriesLeft;
423    }
424
425    data_iterator& operator++() {  // Preincrement
426      if (!NumItemsInBucketLeft) {
427        // 'Items' starts with a 16-bit unsigned integer representing the
428        // number of items in this bucket.
429        NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
430      }
431      Ptr += 4; // Skip the hash.
432      // Determine the length of the key and the data.
433      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
434      Ptr += L.first + L.second;
435      assert(NumItemsInBucketLeft);
436      --NumItemsInBucketLeft;
437      assert(NumEntriesLeft);
438      --NumEntriesLeft;
439      return *this;
440    }
441    data_iterator operator++(int) {  // Postincrement
442      data_iterator tmp = *this; ++*this; return tmp;
443    }
444
445    value_type operator*() const {
446      const unsigned char* LocalPtr = Ptr;
447      if (!NumItemsInBucketLeft)
448        LocalPtr += 2; // number of items in bucket
449      LocalPtr += 4; // Skip the hash.
450
451      // Determine the length of the key and the data.
452      const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr);
453
454      // Read the key.
455      const internal_key_type& Key =
456        InfoObj->ReadKey(LocalPtr, L.first);
457      return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
458    }
459  };
460
461  data_iterator data_begin() {
462    return data_iterator(Base + 4, getNumEntries(), &InfoObj);
463  }
464  data_iterator data_end() { return data_iterator(); }
465
466  Info &getInfoObj() { return InfoObj; }
467
468  static OnDiskChainedHashTable* Create(const unsigned char* buckets,
469                                        const unsigned char* const base,
470                                        const Info &InfoObj = Info()) {
471    using namespace io;
472    assert(buckets > base);
473    assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
474           "buckets should be 4-byte aligned.");
475
476    unsigned numBuckets = ReadLE32(buckets);
477    unsigned numEntries = ReadLE32(buckets);
478    return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
479                                            base, InfoObj);
480  }
481};
482
483} // end namespace clang
484
485#endif
486