1#pragma once
2
3#include "Types.h"
4
5#include <math.h>
6#include <vector>
7#include <map>
8#include <algorithm>   // for std::sort
9#include <string.h>    // for memset
10#include <stdio.h>     // for printf
11
12double calcScore ( const int * bins, const int bincount, const int ballcount );
13
14void plot ( double n );
15
16inline double ExpectedCollisions ( double balls, double bins )
17{
18  return balls - bins + bins * pow(1 - 1/bins,balls);
19}
20
21double chooseK ( int b, int k );
22double chooseUpToK ( int n, int k );
23
24//-----------------------------------------------------------------------------
25
26inline uint32_t f3mix ( uint32_t k )
27{
28  k ^= k >> 16;
29  k *= 0x85ebca6b;
30  k ^= k >> 13;
31  k *= 0xc2b2ae35;
32  k ^= k >> 16;
33
34  return k;
35}
36
37//-----------------------------------------------------------------------------
38// Sort the hash list, count the total number of collisions and return
39// the first N collisions for further processing
40
41template< typename hashtype >
42int FindCollisions ( std::vector<hashtype> & hashes,
43                     HashSet<hashtype> & collisions,
44                     int maxCollisions )
45{
46  int collcount = 0;
47
48  std::sort(hashes.begin(),hashes.end());
49
50  for(size_t i = 1; i < hashes.size(); i++)
51  {
52    if(hashes[i] == hashes[i-1])
53    {
54      collcount++;
55
56      if((int)collisions.size() < maxCollisions)
57      {
58        collisions.insert(hashes[i]);
59      }
60    }
61  }
62
63  return collcount;
64}
65
66//-----------------------------------------------------------------------------
67
68template < class keytype, typename hashtype >
69int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
70{
71  int collcount = 0;
72
73  typedef std::map<hashtype,keytype> htab;
74  htab tab;
75
76  for(size_t i = 1; i < keys.size(); i++)
77  {
78    keytype & k1 = keys[i];
79
80    hashtype h = hash(&k1,sizeof(keytype),0);
81
82    typename htab::iterator it = tab.find(h);
83
84    if(it != tab.end())
85    {
86      keytype & k2 = (*it).second;
87
88      printf("A: ");
89      printbits(&k1,sizeof(keytype));
90      printf("B: ");
91      printbits(&k2,sizeof(keytype));
92    }
93    else
94    {
95      tab.insert( std::make_pair(h,k1) );
96    }
97  }
98
99  return collcount;
100}
101
102//----------------------------------------------------------------------------
103// Measure the distribution "score" for each possible N-bit span up to 20 bits
104
105template< typename hashtype >
106double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
107{
108  printf("Testing distribution - ");
109
110  if(drawDiagram) printf("\n");
111
112  const int hashbits = sizeof(hashtype) * 8;
113
114  int maxwidth = 20;
115
116  // We need at least 5 keys per bin to reliably test distribution biases
117  // down to 1%, so don't bother to test sparser distributions than that
118
119  while(double(hashes.size()) / double(1 << maxwidth) < 5.0)
120  {
121    maxwidth--;
122  }
123
124  std::vector<int> bins;
125  bins.resize(1 << maxwidth);
126
127  double worst = 0;
128  int worstStart = -1;
129  int worstWidth = -1;
130
131  for(int start = 0; start < hashbits; start++)
132  {
133    int width = maxwidth;
134    int bincount = (1 << width);
135
136    memset(&bins[0],0,sizeof(int)*bincount);
137
138    for(size_t j = 0; j < hashes.size(); j++)
139    {
140      hashtype & hash = hashes[j];
141
142      uint32_t index = window(&hash,sizeof(hash),start,width);
143
144      bins[index]++;
145    }
146
147    // Test the distribution, then fold the bins in half,
148    // repeat until we're down to 256 bins
149
150    if(drawDiagram) printf("[");
151
152    while(bincount >= 256)
153    {
154      double n = calcScore(&bins[0],bincount,(int)hashes.size());
155
156      if(drawDiagram) plot(n);
157
158      if(n > worst)
159      {
160        worst = n;
161        worstStart = start;
162        worstWidth = width;
163      }
164
165      width--;
166      bincount /= 2;
167
168      if(width < 8) break;
169
170      for(int i = 0; i < bincount; i++)
171      {
172        bins[i] += bins[i+bincount];
173      }
174    }
175
176    if(drawDiagram) printf("]\n");
177  }
178
179  double pct = worst * 100.0;
180
181  printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct);
182  if(pct >= 1.0) printf(" !!!!! ");
183  printf("\n");
184
185  return worst;
186}
187
188//----------------------------------------------------------------------------
189
190template < typename hashtype >
191bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram )
192{
193  bool result = true;
194
195  {
196    size_t count = hashes.size();
197
198    double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
199
200    printf("Testing collisions   - Expected %8.2f, ",expected);
201
202    double collcount = 0;
203
204    HashSet<hashtype> collisions;
205
206    collcount = FindCollisions(hashes,collisions,1000);
207
208    printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
209
210    if(sizeof(hashtype) == sizeof(uint32_t))
211    {
212    // 2x expected collisions = fail
213
214    // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
215    // of a scale factor, otherwise we fail erroneously if there are a small expected number
216    // of collisions
217
218    if(double(collcount) / double(expected) > 2.0)
219    {
220      printf(" !!!!! ");
221      result = false;
222    }
223    }
224    else
225    {
226      // For all hashes larger than 32 bits, _any_ collisions are a failure.
227
228      if(collcount > 0)
229      {
230        printf(" !!!!! ");
231        result = false;
232      }
233    }
234
235    printf("\n");
236  }
237
238  //----------
239
240  if(testDist)
241  {
242    TestDistribution(hashes,drawDiagram);
243  }
244
245  return result;
246}
247
248//----------
249
250template < typename hashtype >
251bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram )
252{
253  std::vector<hashtype> collisions;
254
255  return TestHashList(hashes,collisions,testDist,drawDiagram);
256}
257
258//-----------------------------------------------------------------------------
259
260template < class keytype, typename hashtype >
261bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
262{
263  int keycount = (int)keys.size();
264
265  std::vector<hashtype> hashes;
266
267  hashes.resize(keycount);
268
269  printf("Hashing");
270
271  for(int i = 0; i < keycount; i++)
272  {
273    if(i % (keycount / 10) == 0) printf(".");
274
275    keytype & k = keys[i];
276
277    hash(&k,sizeof(k),0,&hashes[i]);
278  }
279
280  printf("\n");
281
282  bool result = TestHashList(hashes,testColl,testDist,drawDiagram);
283
284  printf("\n");
285
286  return result;
287}
288
289//-----------------------------------------------------------------------------
290// Bytepair test - generate 16-bit indices from all possible non-overlapping
291// 8-bit sections of the hash value, check distribution on all of them.
292
293// This is a very good test for catching weak intercorrelations between bits -
294// much harder to pass than the normal distribution test. However, it doesn't
295// really model the normal usage of hash functions in hash table lookup, so
296// I'm not sure it's that useful (and hash functions that fail this test but
297// pass the normal distribution test still work well in practice)
298
299template < typename hashtype >
300double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
301{
302  const int nbytes = sizeof(hashtype);
303  const int hashbits = nbytes * 8;
304
305  const int nbins = 65536;
306
307  std::vector<int> bins(nbins,0);
308
309  double worst = 0;
310
311  for(int a = 0; a < hashbits; a++)
312  {
313    if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
314
315    if(drawDiagram) printf("[");
316
317    for(int b = 0; b < hashbits; b++)
318    {
319      if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
320
321      bins.clear();
322      bins.resize(nbins,0);
323
324      for(size_t i = 0; i < hashes.size(); i++)
325      {
326        hashtype & hash = hashes[i];
327
328        uint32_t pa = window(&hash,sizeof(hash),a,8);
329        uint32_t pb = window(&hash,sizeof(hash),b,8);
330
331        bins[pa | (pb << 8)]++;
332      }
333
334      double s = calcScore(bins,bins.size(),hashes.size());
335
336      if(drawDiagram) plot(s);
337
338      if(s > worst)
339      {
340        worst = s;
341      }
342    }
343
344    if(drawDiagram) printf("]\n");
345  }
346
347  return worst;
348}
349
350//-----------------------------------------------------------------------------
351// Simplified test - only check 64k distributions, and only on byte boundaries
352
353template < typename hashtype >
354void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
355{
356  const int hashbits = sizeof(hashtype) * 8;
357  const int nbins = 65536;
358
359  std::vector<int> bins(nbins,0);
360
361  dworst = -1.0e90;
362  davg = 0;
363
364  for(int start = 0; start < hashbits; start += 8)
365  {
366    bins.clear();
367    bins.resize(nbins,0);
368
369    for(size_t j = 0; j < hashes.size(); j++)
370    {
371      hashtype & hash = hashes[j];
372
373      uint32_t index = window(&hash,sizeof(hash),start,16);
374
375      bins[index]++;
376    }
377
378    double n = calcScore(&bins.front(),(int)bins.size(),(int)hashes.size());
379
380    davg += n;
381
382    if(n > dworst) dworst = n;
383  }
384
385  davg /= double(hashbits/8);
386}
387
388//-----------------------------------------------------------------------------
389