genzipf.c revision 444256eaf4f0ffdc4cacef37873d58f2a65bf8e6
1/* 2 * Generate/analyze pareto/zipf distributions to better understand 3 * what an access pattern would look like. 4 * 5 * For instance, the following would generate a zipf distribution 6 * with theta 1.2, using 100,000 values and split the reporting into 7 * 20 buckets: 8 * 9 * t/genzipf zipf 1.2 100000 20 10 * 11 * Only the distribution type (zipf or pareto) and spread input need 12 * to be given, if not given defaults are used. 13 * 14 */ 15#include <stdio.h> 16#include <stdlib.h> 17#include <fcntl.h> 18#include <string.h> 19#include <unistd.h> 20 21#include "../lib/zipf.h" 22#include "../flist.h" 23#include "../hash.h" 24#include "../rbtree.h" 25 26#define DEF_NR 1000000 27#define DEF_NR_OUTPUT 23 28 29struct node { 30 struct flist_head list; 31 struct rb_node rb; 32 unsigned long long val; 33 unsigned long hits; 34}; 35 36static struct flist_head *hash; 37static unsigned long hash_bits = 24; 38static unsigned long hash_size = 1 << 24; 39static struct rb_root rb; 40 41enum { 42 TYPE_NONE = 0, 43 TYPE_ZIPF, 44 TYPE_PARETO, 45}; 46static const char *dist_types[] = { "None", "Zipf", "Pareto" }; 47 48static int dist_type = TYPE_ZIPF; 49static unsigned long gb_size = 500; 50static unsigned long block_size = 4096; 51static unsigned long output_nranges = DEF_NR_OUTPUT; 52static double percentage; 53static double dist_val; 54 55#define DEF_ZIPF_VAL 1.2 56#define DEF_PARETO_VAL 0.3 57 58static struct node *hash_lookup(unsigned long long val) 59{ 60 struct flist_head *l = &hash[hash_long(val, hash_bits)]; 61 struct flist_head *entry; 62 struct node *n; 63 64 flist_for_each(entry, l) { 65 n = flist_entry(entry, struct node, list); 66 if (n->val == val) 67 return n; 68 } 69 70 return NULL; 71} 72 73static void hash_insert(unsigned long long val) 74{ 75 struct flist_head *l = &hash[hash_long(val, hash_bits)]; 76 struct node *n = malloc(sizeof(*n)); 77 78 n->val = val; 79 n->hits = 1; 80 flist_add_tail(&n->list, l); 81} 82 83static void rb_insert(struct node *n) 84{ 85 struct rb_node **p, *parent; 86 87 memset(&n->rb, 0, sizeof(n->rb)); 88 p = &rb.rb_node; 89 parent = NULL; 90 while (*p) { 91 struct node *__n; 92 93 parent = *p; 94 __n = rb_entry(parent, struct node, rb); 95 if (n->hits > __n->hits) 96 p = &(*p)->rb_left; 97 else 98 p = &(*p)->rb_right; 99 } 100 101 rb_link_node(&n->rb, parent, p); 102 rb_insert_color(&n->rb, &rb); 103} 104 105static unsigned long rb_add(struct flist_head *list) 106{ 107 struct flist_head *entry; 108 unsigned long ret = 0; 109 struct node *n; 110 111 flist_for_each(entry, list) { 112 n = flist_entry(entry, struct node, list); 113 114 rb_insert(n); 115 ret++; 116 } 117 118 return ret; 119} 120 121static unsigned long rb_gen(void) 122{ 123 unsigned long ret = 0; 124 unsigned int i; 125 126 for (i = 0; i < hash_size; i++) 127 ret += rb_add(&hash[i]); 128 129 return ret; 130} 131 132static int parse_options(int argc, char *argv[]) 133{ 134 const char *optstring = "t:g:i:o:b:p:"; 135 int c, dist_val_set = 0; 136 137 while ((c = getopt(argc, argv, optstring)) != -1) { 138 switch (c) { 139 case 'p': 140 percentage = atof(optarg); 141 break; 142 case 'b': 143 block_size = strtoul(optarg, NULL, 10); 144 break; 145 case 't': 146 if (!strncmp(optarg, "zipf", 4)) 147 dist_type = TYPE_ZIPF; 148 else if (!strncmp(optarg, "pareto", 6)) 149 dist_type = TYPE_PARETO; 150 else { 151 printf("wrong dist type: %s\n", optarg); 152 return 1; 153 } 154 break; 155 case 'g': 156 gb_size = strtoul(optarg, NULL, 10); 157 break; 158 case 'i': 159 dist_val = atof(optarg); 160 dist_val_set = 1; 161 break; 162 case 'o': 163 output_nranges = strtoul(optarg, NULL, 10); 164 break; 165 default: 166 printf("bad option %c\n", c); 167 return 1; 168 } 169 } 170 171 if (dist_type == TYPE_PARETO) { 172 if ((dist_val >= 1.00 || dist_val < 0.00)) { 173 printf("pareto input must be > 0.00 and < 1.00\n"); 174 return 1; 175 } 176 if (!dist_val_set) 177 dist_val = DEF_PARETO_VAL; 178 } else if (dist_type == TYPE_ZIPF) { 179 if (dist_val == 1.0) { 180 printf("zipf input must be different than 1.0\n"); 181 return 1; 182 } 183 if (!dist_val_set) 184 dist_val = DEF_ZIPF_VAL; 185 } 186 187 return 0; 188} 189 190struct output_sum { 191 double output; 192 unsigned int nranges; 193}; 194 195int main(int argc, char *argv[]) 196{ 197 unsigned long offset; 198 unsigned long i, j, nr_vals, cur_vals, interval, total_vals; 199 unsigned long long nranges; 200 struct output_sum *output_sums; 201 double perc, perc_i; 202 struct zipf_state zs; 203 struct rb_node *n; 204 205 if (parse_options(argc, argv)) 206 return 1; 207 208 printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size); 209 210 nranges = gb_size * 1024 * 1024 * 1024ULL; 211 nranges /= block_size; 212 213 if (dist_type == TYPE_ZIPF) 214 zipf_init(&zs, nranges, dist_val, 1); 215 else 216 pareto_init(&zs, nranges, dist_val, 1); 217 218 hash_bits = 0; 219 hash_size = nranges; 220 while ((hash_size >>= 1) != 0) 221 hash_bits++; 222 223 hash_size = 1 << hash_bits; 224 225 hash = malloc(hash_size * sizeof(struct flist_head)); 226 for (i = 0; i < hash_size; i++) 227 INIT_FLIST_HEAD(&hash[i]); 228 229 for (nr_vals = i = 0; i < nranges; i++) { 230 struct node *n; 231 232 if (dist_type == TYPE_ZIPF) 233 offset = zipf_next(&zs); 234 else 235 offset = pareto_next(&zs); 236 237 n = hash_lookup(offset); 238 if (n) 239 n->hits++; 240 else 241 hash_insert(offset); 242 243 nr_vals++; 244 } 245 246 nr_vals = rb_gen(); 247 248 interval = (nr_vals + output_nranges - 1) / output_nranges; 249 250 output_sums = malloc(output_nranges * sizeof(struct output_sum)); 251 for (i = 0; i < output_nranges; i++) { 252 output_sums[i].output = 0.0; 253 output_sums[i].nranges = 1; 254 } 255 256 total_vals = i = j = cur_vals = 0; 257 258 n = rb_first(&rb); 259 while (n) { 260 struct node *node = rb_entry(n, struct node, rb); 261 struct output_sum *os = &output_sums[j]; 262 263 if (i >= interval) { 264 os->output = (double) (cur_vals + 1) / (double) nranges; 265 os->output *= 100.0; 266 j++; 267 cur_vals = node->hits; 268 interval += (nr_vals + output_nranges - 1) / output_nranges; 269 } else { 270 cur_vals += node->hits; 271 os->nranges += node->hits; 272 } 273 274 i++; 275 total_vals += node->hits; 276 277 if (percentage) { 278 unsigned long blocks = percentage * nranges / 100; 279 280 if (total_vals >= blocks) { 281 double cs = i * block_size / (1024 * 1024); 282 char p = 'M'; 283 284 if (cs > 1024.0) { 285 cs /= 1024.0; 286 p = 'G'; 287 } 288 if (cs > 1024.0) { 289 cs /= 1024.0; 290 p = 'T'; 291 } 292 293 printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p); 294 percentage = 0.0; 295 } 296 } 297 298 n = rb_next(n); 299 } 300 301 perc_i = 100.0 / (double) output_nranges; 302 perc = 0.0; 303 304 printf("\n Rows Hits No Hits Size\n"); 305 printf("--------------------------------------------------------\n"); 306 for (i = 0; i < j; i++) { 307 struct output_sum *os = &output_sums[i]; 308 double gb = (double) os->nranges * block_size / 1024.0; 309 char p = 'K'; 310 311 if (gb > 1024.0) { 312 p = 'M'; 313 gb /= 1024.0; 314 } 315 if (gb > 1024.0) { 316 p = 'G'; 317 gb /= 1024.0; 318 } 319 320 perc += perc_i; 321 printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n", i ? "|->" : "Top", perc, os->output, os->nranges, gb, p); 322 } 323 324 free(output_sums); 325 free(hash); 326 return 0; 327} 328