1//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// 10/// \file 11/// \brief Defines facilities for reading and writing on-disk hash tables. 12/// 13//===----------------------------------------------------------------------===// 14#ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 15#define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 16 17#include "llvm/Support/Allocator.h" 18#include "llvm/Support/DataTypes.h" 19#include "llvm/Support/MathExtras.h" 20#include "llvm/Support/raw_ostream.h" 21#include "llvm/Support/Host.h" 22#include <cassert> 23#include <cstdlib> 24 25namespace clang { 26 27namespace io { 28 29typedef uint32_t Offset; 30 31inline void Emit8(raw_ostream& Out, uint32_t V) { 32 Out << (unsigned char)(V); 33} 34 35inline void Emit16(raw_ostream& Out, uint32_t V) { 36 Out << (unsigned char)(V); 37 Out << (unsigned char)(V >> 8); 38 assert((V >> 16) == 0); 39} 40 41inline void Emit24(raw_ostream& Out, uint32_t V) { 42 Out << (unsigned char)(V); 43 Out << (unsigned char)(V >> 8); 44 Out << (unsigned char)(V >> 16); 45 assert((V >> 24) == 0); 46} 47 48inline void Emit32(raw_ostream& Out, uint32_t V) { 49 Out << (unsigned char)(V); 50 Out << (unsigned char)(V >> 8); 51 Out << (unsigned char)(V >> 16); 52 Out << (unsigned char)(V >> 24); 53} 54 55inline void Emit64(raw_ostream& Out, uint64_t V) { 56 Out << (unsigned char)(V); 57 Out << (unsigned char)(V >> 8); 58 Out << (unsigned char)(V >> 16); 59 Out << (unsigned char)(V >> 24); 60 Out << (unsigned char)(V >> 32); 61 Out << (unsigned char)(V >> 40); 62 Out << (unsigned char)(V >> 48); 63 Out << (unsigned char)(V >> 56); 64} 65 66inline void Pad(raw_ostream& Out, unsigned A) { 67 Offset off = (Offset) Out.tell(); 68 for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n) 69 Emit8(Out, 0); 70} 71 72inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { 73 uint16_t V = ((uint16_t)Data[0]) | 74 ((uint16_t)Data[1] << 8); 75 Data += 2; 76 return V; 77} 78 79inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) { 80 uint32_t V = ((uint32_t)Data[0]) | 81 ((uint32_t)Data[1] << 8) | 82 ((uint32_t)Data[2] << 16) | 83 ((uint32_t)Data[3] << 24); 84 Data += 4; 85 return V; 86} 87 88inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) { 89 uint64_t V = ((uint64_t)Data[0]) | 90 ((uint64_t)Data[1] << 8) | 91 ((uint64_t)Data[2] << 16) | 92 ((uint64_t)Data[3] << 24) | 93 ((uint64_t)Data[4] << 32) | 94 ((uint64_t)Data[5] << 40) | 95 ((uint64_t)Data[6] << 48) | 96 ((uint64_t)Data[7] << 56); 97 Data += 8; 98 return V; 99} 100 101inline uint32_t ReadLE32(const unsigned char *&Data) { 102 // Hosts that directly support little-endian 32-bit loads can just 103 // use them. Big-endian hosts need a bswap. 104 uint32_t V = *((const uint32_t*)Data); 105 if (llvm::sys::isBigEndianHost()) 106 V = llvm::ByteSwap_32(V); 107 Data += 4; 108 return V; 109} 110 111} // end namespace io 112 113template<typename Info> 114class OnDiskChainedHashTableGenerator { 115 unsigned NumBuckets; 116 unsigned NumEntries; 117 llvm::BumpPtrAllocator BA; 118 119 class Item { 120 public: 121 typename Info::key_type key; 122 typename Info::data_type data; 123 Item *next; 124 const uint32_t hash; 125 126 Item(typename Info::key_type_ref k, typename Info::data_type_ref d, 127 Info &InfoObj) 128 : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {} 129 }; 130 131 class Bucket { 132 public: 133 io::Offset off; 134 Item* head; 135 unsigned length; 136 137 Bucket() {} 138 }; 139 140 Bucket* Buckets; 141 142private: 143 void insert(Bucket* b, size_t size, Item* E) { 144 unsigned idx = E->hash & (size - 1); 145 Bucket& B = b[idx]; 146 E->next = B.head; 147 ++B.length; 148 B.head = E; 149 } 150 151 void resize(size_t newsize) { 152 Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket)); 153 // Populate newBuckets with the old entries. 154 for (unsigned i = 0; i < NumBuckets; ++i) 155 for (Item* E = Buckets[i].head; E ; ) { 156 Item* N = E->next; 157 E->next = 0; 158 insert(newBuckets, newsize, E); 159 E = N; 160 } 161 162 free(Buckets); 163 NumBuckets = newsize; 164 Buckets = newBuckets; 165 } 166 167public: 168 169 void insert(typename Info::key_type_ref key, 170 typename Info::data_type_ref data) { 171 Info InfoObj; 172 insert(key, data, InfoObj); 173 } 174 175 void insert(typename Info::key_type_ref key, 176 typename Info::data_type_ref data, Info &InfoObj) { 177 178 ++NumEntries; 179 if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2); 180 insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data, 181 InfoObj)); 182 } 183 184 io::Offset Emit(raw_ostream &out) { 185 Info InfoObj; 186 return Emit(out, InfoObj); 187 } 188 189 io::Offset Emit(raw_ostream &out, Info &InfoObj) { 190 using namespace clang::io; 191 192 // Emit the payload of the table. 193 for (unsigned i = 0; i < NumBuckets; ++i) { 194 Bucket& B = Buckets[i]; 195 if (!B.head) continue; 196 197 // Store the offset for the data of this bucket. 198 B.off = out.tell(); 199 assert(B.off && "Cannot write a bucket at offset 0. Please add padding."); 200 201 // Write out the number of items in the bucket. 202 Emit16(out, B.length); 203 assert(B.length != 0 && "Bucket has a head but zero length?"); 204 205 // Write out the entries in the bucket. 206 for (Item *I = B.head; I ; I = I->next) { 207 Emit32(out, I->hash); 208 const std::pair<unsigned, unsigned>& Len = 209 InfoObj.EmitKeyDataLength(out, I->key, I->data); 210 InfoObj.EmitKey(out, I->key, Len.first); 211 InfoObj.EmitData(out, I->key, I->data, Len.second); 212 } 213 } 214 215 // Emit the hashtable itself. 216 Pad(out, 4); 217 io::Offset TableOff = out.tell(); 218 Emit32(out, NumBuckets); 219 Emit32(out, NumEntries); 220 for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off); 221 222 return TableOff; 223 } 224 225 OnDiskChainedHashTableGenerator() { 226 NumEntries = 0; 227 NumBuckets = 64; 228 // Note that we do not need to run the constructors of the individual 229 // Bucket objects since 'calloc' returns bytes that are all 0. 230 Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket)); 231 } 232 233 ~OnDiskChainedHashTableGenerator() { 234 std::free(Buckets); 235 } 236}; 237 238template<typename Info> 239class OnDiskChainedHashTable { 240 const unsigned NumBuckets; 241 const unsigned NumEntries; 242 const unsigned char* const Buckets; 243 const unsigned char* const Base; 244 Info InfoObj; 245 246public: 247 typedef typename Info::internal_key_type internal_key_type; 248 typedef typename Info::external_key_type external_key_type; 249 typedef typename Info::data_type data_type; 250 251 OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries, 252 const unsigned char* buckets, 253 const unsigned char* base, 254 const Info &InfoObj = Info()) 255 : NumBuckets(numBuckets), NumEntries(numEntries), 256 Buckets(buckets), Base(base), InfoObj(InfoObj) { 257 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 258 "'buckets' must have a 4-byte alignment"); 259 } 260 261 unsigned getNumBuckets() const { return NumBuckets; } 262 unsigned getNumEntries() const { return NumEntries; } 263 const unsigned char* getBase() const { return Base; } 264 const unsigned char* getBuckets() const { return Buckets; } 265 266 bool isEmpty() const { return NumEntries == 0; } 267 268 class iterator { 269 internal_key_type key; 270 const unsigned char* const data; 271 const unsigned len; 272 Info *InfoObj; 273 public: 274 iterator() : data(0), len(0) {} 275 iterator(const internal_key_type k, const unsigned char* d, unsigned l, 276 Info *InfoObj) 277 : key(k), data(d), len(l), InfoObj(InfoObj) {} 278 279 data_type operator*() const { return InfoObj->ReadData(key, data, len); } 280 bool operator==(const iterator& X) const { return X.data == data; } 281 bool operator!=(const iterator& X) const { return X.data != data; } 282 }; 283 284 iterator find(const external_key_type& eKey, Info *InfoPtr = 0) { 285 if (!InfoPtr) 286 InfoPtr = &InfoObj; 287 288 using namespace io; 289 const internal_key_type& iKey = InfoObj.GetInternalKey(eKey); 290 unsigned key_hash = InfoObj.ComputeHash(iKey); 291 292 // Each bucket is just a 32-bit offset into the hash table file. 293 unsigned idx = key_hash & (NumBuckets - 1); 294 const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx; 295 296 unsigned offset = ReadLE32(Bucket); 297 if (offset == 0) return iterator(); // Empty bucket. 298 const unsigned char* Items = Base + offset; 299 300 // 'Items' starts with a 16-bit unsigned integer representing the 301 // number of items in this bucket. 302 unsigned len = ReadUnalignedLE16(Items); 303 304 for (unsigned i = 0; i < len; ++i) { 305 // Read the hash. 306 uint32_t item_hash = ReadUnalignedLE32(Items); 307 308 // Determine the length of the key and the data. 309 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items); 310 unsigned item_len = L.first + L.second; 311 312 // Compare the hashes. If they are not the same, skip the entry entirely. 313 if (item_hash != key_hash) { 314 Items += item_len; 315 continue; 316 } 317 318 // Read the key. 319 const internal_key_type& X = 320 InfoPtr->ReadKey((const unsigned char* const) Items, L.first); 321 322 // If the key doesn't match just skip reading the value. 323 if (!InfoPtr->EqualKey(X, iKey)) { 324 Items += item_len; 325 continue; 326 } 327 328 // The key matches! 329 return iterator(X, Items + L.first, L.second, InfoPtr); 330 } 331 332 return iterator(); 333 } 334 335 iterator end() const { return iterator(); } 336 337 /// \brief Iterates over all of the keys in the table. 338 class key_iterator { 339 const unsigned char* Ptr; 340 unsigned NumItemsInBucketLeft; 341 unsigned NumEntriesLeft; 342 Info *InfoObj; 343 public: 344 typedef external_key_type value_type; 345 346 key_iterator(const unsigned char* const Ptr, unsigned NumEntries, 347 Info *InfoObj) 348 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 349 InfoObj(InfoObj) { } 350 key_iterator() 351 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 352 353 friend bool operator==(const key_iterator &X, const key_iterator &Y) { 354 return X.NumEntriesLeft == Y.NumEntriesLeft; 355 } 356 friend bool operator!=(const key_iterator& X, const key_iterator &Y) { 357 return X.NumEntriesLeft != Y.NumEntriesLeft; 358 } 359 360 key_iterator& operator++() { // Preincrement 361 if (!NumItemsInBucketLeft) { 362 // 'Items' starts with a 16-bit unsigned integer representing the 363 // number of items in this bucket. 364 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 365 } 366 Ptr += 4; // Skip the hash. 367 // Determine the length of the key and the data. 368 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 369 Ptr += L.first + L.second; 370 assert(NumItemsInBucketLeft); 371 --NumItemsInBucketLeft; 372 assert(NumEntriesLeft); 373 --NumEntriesLeft; 374 return *this; 375 } 376 key_iterator operator++(int) { // Postincrement 377 key_iterator tmp = *this; ++*this; return tmp; 378 } 379 380 value_type operator*() const { 381 const unsigned char* LocalPtr = Ptr; 382 if (!NumItemsInBucketLeft) 383 LocalPtr += 2; // number of items in bucket 384 LocalPtr += 4; // Skip the hash. 385 386 // Determine the length of the key and the data. 387 const std::pair<unsigned, unsigned>& L 388 = Info::ReadKeyDataLength(LocalPtr); 389 390 // Read the key. 391 const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first); 392 return InfoObj->GetExternalKey(Key); 393 } 394 }; 395 396 key_iterator key_begin() { 397 return key_iterator(Base + 4, getNumEntries(), &InfoObj); 398 } 399 key_iterator key_end() { return key_iterator(); } 400 401 /// \brief Iterates over all the entries in the table, returning the data. 402 class data_iterator { 403 const unsigned char* Ptr; 404 unsigned NumItemsInBucketLeft; 405 unsigned NumEntriesLeft; 406 Info *InfoObj; 407 public: 408 typedef data_type value_type; 409 410 data_iterator(const unsigned char* const Ptr, unsigned NumEntries, 411 Info *InfoObj) 412 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 413 InfoObj(InfoObj) { } 414 data_iterator() 415 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 416 417 bool operator==(const data_iterator& X) const { 418 return X.NumEntriesLeft == NumEntriesLeft; 419 } 420 bool operator!=(const data_iterator& X) const { 421 return X.NumEntriesLeft != NumEntriesLeft; 422 } 423 424 data_iterator& operator++() { // Preincrement 425 if (!NumItemsInBucketLeft) { 426 // 'Items' starts with a 16-bit unsigned integer representing the 427 // number of items in this bucket. 428 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 429 } 430 Ptr += 4; // Skip the hash. 431 // Determine the length of the key and the data. 432 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 433 Ptr += L.first + L.second; 434 assert(NumItemsInBucketLeft); 435 --NumItemsInBucketLeft; 436 assert(NumEntriesLeft); 437 --NumEntriesLeft; 438 return *this; 439 } 440 data_iterator operator++(int) { // Postincrement 441 data_iterator tmp = *this; ++*this; return tmp; 442 } 443 444 value_type operator*() const { 445 const unsigned char* LocalPtr = Ptr; 446 if (!NumItemsInBucketLeft) 447 LocalPtr += 2; // number of items in bucket 448 LocalPtr += 4; // Skip the hash. 449 450 // Determine the length of the key and the data. 451 const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr); 452 453 // Read the key. 454 const internal_key_type& Key = 455 InfoObj->ReadKey(LocalPtr, L.first); 456 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second); 457 } 458 }; 459 460 data_iterator data_begin() { 461 return data_iterator(Base + 4, getNumEntries(), &InfoObj); 462 } 463 data_iterator data_end() { return data_iterator(); } 464 465 Info &getInfoObj() { return InfoObj; } 466 467 static OnDiskChainedHashTable* Create(const unsigned char* buckets, 468 const unsigned char* const base, 469 const Info &InfoObj = Info()) { 470 using namespace io; 471 assert(buckets > base); 472 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 473 "buckets should be 4-byte aligned."); 474 475 unsigned numBuckets = ReadLE32(buckets); 476 unsigned numEntries = ReadLE32(buckets); 477 return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets, 478 base, InfoObj); 479 } 480}; 481 482} // end namespace clang 483 484#endif 485