prof.h revision 122449b073bcbaa504c4f592ea2d733503c272d2
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define PROF_PREFIX_DEFAULT      "jeprof"
#define LG_PROF_SAMPLE_DEFAULT   0
#define LG_PROF_INTERVAL_DEFAULT -1

/*
 * Hard limit on stack backtrace depth. The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define PROF_BT_MAX              128

/* Maximum number of backtraces to store in each per-thread LRU cache. */
#define PROF_TCMAX               1024

/* Initial hash table size. */
#define PROF_CKH_MINITEMS        64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUFSIZE        65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define PROF_PRINTF_BUFSIZE      128

/*
 * Number of mutexes shared among all ctx's. No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define PROF_NCTX_LOCKS          1024

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
    /* Backtrace, stored as len program counters. */
    void        **vec;
    unsigned    len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
    prof_bt_t   *bt;
    unsigned    nignore;
    unsigned    max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
    /*
     * Profiling counters. An allocation/deallocation pair can operate on
     * different prof_thr_cnt_t objects that are linked into the same
     * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
     * negative. In principle it is possible for the *bytes counters to
     * overflow/underflow, but a general solution would require something
     * like 128-bit counters; this implementation doesn't bother to solve
     * that problem.
     */
    int64_t     curobjs;
    int64_t     curbytes;
    uint64_t    accumobjs;
    uint64_t    accumbytes;
};

struct prof_thr_cnt_s {
    /* Linkage into prof_ctx_t's cnts_ql. */
    ql_elm(prof_thr_cnt_t) cnts_link;

    /* Linkage into thread's LRU. */
    ql_elm(prof_thr_cnt_t) lru_link;

    /*
     * Associated context. If a thread frees an object that it did not
     * allocate, it is possible that the context is not cached in the
     * thread's hash table, in which case it must be able to look up the
     * context, insert a new prof_thr_cnt_t into the thread's hash table,
     * and link it into the prof_ctx_t's cnts_ql.
     */
    prof_ctx_t  *ctx;

    /*
     * Threads use memory barriers to update the counters. Since there is
     * only ever one writer, the only challenge is for the reader to get a
     * consistent read of the counters.
     *
     * The writer uses this series of operations:
     *
     * 1) Increment epoch to an odd number.
     * 2) Update counters.
     * 3) Increment epoch to an even number.
     *
     * The reader must assure 1) that the epoch is even while it reads the
     * counters, and 2) that the epoch doesn't change between the time it
     * starts and finishes reading the counters.
     */
    unsigned    epoch;

    /* Profiling counters. */
    prof_cnt_t  cnts;
};
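
/*
 * Illustrative reader sketch (not part of this header; variable names are
 * hypothetical): taking a consistent snapshot of a prof_thr_cnt_t's counters
 * under the epoch protocol described above. Retry while the epoch is odd
 * (the writer is mid-update) or changes during the copy. On weakly ordered
 * CPUs, read barriers would also be needed between the epoch loads and the
 * copy.
 *
 *     prof_cnt_t snapshot;
 *     unsigned epoch0;
 *
 *     do {
 *         epoch0 = cnt->epoch;
 *         memcpy(&snapshot, &cnt->cnts, sizeof(prof_cnt_t));
 *     } while ((epoch0 & 1U) != 0 || cnt->epoch != epoch0);
 */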

struct prof_ctx_s {
    /* Associated backtrace. */
    prof_bt_t   *bt;

    /* Protects cnt_merged and cnts_ql. */
    malloc_mutex_t *lock;

    /* Temporary storage for summation during dump. */
    prof_cnt_t  cnt_summed;

    /* When threads exit, they merge their stats into cnt_merged. */
    prof_cnt_t  cnt_merged;

    /*
     * List of profile counters, one for each thread that has allocated in
     * this context.
     */
    ql_head(prof_thr_cnt_t) cnts_ql;
};

struct prof_tdata_s {
    /*
     * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
     * cache of backtraces, with associated thread-specific prof_thr_cnt_t
     * objects. Other threads may read the prof_thr_cnt_t contents, but no
     * others will ever write them.
     *
     * Upon thread exit, the thread must merge all the prof_thr_cnt_t
     * counter data into the associated prof_ctx_t objects, and unlink/free
     * the prof_thr_cnt_t objects.
     */
    ckh_t       bt2cnt;

    /* LRU for contents of bt2cnt. */
    ql_head(prof_thr_cnt_t) lru_ql;

    /* Backtrace vector, used for calls to prof_backtrace(). */
    void        **vec;

    /* Sampling state. */
    uint64_t    prng_state;
    uint64_t    threshold;
    uint64_t    accum;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false. No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool opt_prof_active;
extern size_t opt_lg_prof_sample;    /* lg(mean bytes between samples). */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump;          /* High-water memory dumping. */
extern bool opt_prof_leak;           /* Dump leak summary at exit. */
extern bool opt_prof_accum;          /* Report cumulative bytes. */
extern char opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated. Each arena triggers a
 * profile dump when it reaches this threshold. The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t prof_interval;
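
/*
 * Worked example (hypothetical values, not defaults): with
 * opt_lg_prof_interval == 30, prof_interval is 2^30 bytes (1 GiB). If all
 * allocation flows through one arena, dumps occur roughly every 1 GiB
 * allocated; if allocation is spread evenly across narenas == 4 arenas, each
 * arena accumulates bytes at a quarter of the total rate, so up to ~4 GiB
 * may be allocated between consecutive dumps.
 */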

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool prof_promote;

void    bt_init(prof_bt_t *bt, void **vec);
void    prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
void    prof_idump(void);
bool    prof_mdump(const char *filename);
void    prof_gdump(void);
prof_tdata_t *prof_tdata_init(void);
void    prof_tdata_cleanup(void *arg);
void    prof_boot0(void);
void    prof_boot1(void);
bool    prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define PROF_ALLOC_PREP(nignore, size, ret) do { \
    prof_tdata_t *prof_tdata; \
    prof_bt_t bt; \
 \
    assert(size == s2u(size)); \
 \
    prof_tdata = *prof_tdata_tsd_get(); \
    if (prof_tdata == NULL) { \
        prof_tdata = prof_tdata_init(); \
        if (prof_tdata == NULL) { \
            ret = NULL; \
            break; \
        } \
    } \
 \
    if (opt_prof_active == false) { \
        /* Sampling is currently inactive, so avoid sampling. */ \
        ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
    } else if (opt_lg_prof_sample == 0) { \
        /* Don't bother with sampling logic, since sampling */ \
        /* interval is 1. */ \
        bt_init(&bt, prof_tdata->vec); \
        prof_backtrace(&bt, nignore); \
        ret = prof_lookup(&bt); \
    } else { \
        if (prof_tdata->threshold == 0) { \
            /* Initialize. Seed the prng differently for */ \
            /* each thread. */ \
            prof_tdata->prng_state = \
                (uint64_t)(uintptr_t)&size; \
            prof_sample_threshold_update(prof_tdata); \
        } \
 \
        /* Determine whether to capture a backtrace based on */ \
        /* whether size is enough for prof_accum to reach */ \
        /* prof_tdata->threshold. However, delay updating */ \
        /* these variables until prof_{m,re}alloc(), because */ \
        /* we don't know for sure that the allocation will */ \
        /* succeed. */ \
        /* */ \
        /* Use subtraction rather than addition to avoid */ \
        /* potential integer overflow. */ \
        if (size >= prof_tdata->threshold - \
            prof_tdata->accum) { \
            bt_init(&bt, prof_tdata->vec); \
            prof_backtrace(&bt, nignore); \
            ret = prof_lookup(&bt); \
        } else \
            ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
    } \
} while (0)
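
/*
 * Illustrative usage sketch (hypothetical caller, not part of this header):
 * an allocation path obtains a counter via PROF_ALLOC_PREP() before
 * allocating, then reports the result via prof_malloc(). ret comes back as
 * NULL on OOM, (prof_thr_cnt_t *)1U when this allocation is not sampled, or
 * a real counter when a backtrace was captured. The nignore argument (1
 * here) is the number of stack frames to skip when capturing the backtrace.
 *
 *     size_t usize = s2u(size);
 *     prof_thr_cnt_t *cnt;
 *     void *ret;
 *
 *     PROF_ALLOC_PREP(1, usize, cnt);
 *     if (cnt == NULL)
 *         return (NULL);
 *     ret = imalloc(usize);
 *     if (ret == NULL)
 *         return (NULL);
 *     prof_malloc(ret, usize, cnt);
 *     return (ret);
 */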

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

void    prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t *prof_ctx_get(const void *ptr);
void    prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool    prof_sample_accum_update(size_t size);
void    prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void    prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void    prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
    uint64_t r;
    double u;

    cassert(config_prof);

    /*
     * Compute sample threshold as a geometrically distributed random
     * variable with mean (2^opt_lg_prof_sample).
     *
     *                         __        __
     *                         |  log(u)  |                     1
     * prof_tdata->threshold = | -------- |, where p = -------------------
     *                         | log(1-p) |             opt_lg_prof_sample
     *                                                 2
     *
     * For more information on the math, see:
     *
     *   Non-Uniform Random Variate Generation
     *   Luc Devroye
     *   Springer-Verlag, New York, 1986
     *   pp 500
     *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
     */
    prng64(r, 53, prof_tdata->prng_state,
        UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
    u = (double)r * (1.0/9007199254740992.0L);
    prof_tdata->threshold = (uint64_t)(log(u) /
        log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
        + (uint64_t)1U;
}
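
/*
 * Worked example (approximate, for illustration only): with
 * opt_lg_prof_sample == 19, p = 2^-19 and the geometric distribution has
 * mean 1/p = 2^19 bytes (512 KiB). A draw of u == 0.5 yields
 * threshold = ceil(log(0.5) / log(1 - 2^-19)) ~= 363409 bytes; draws of u
 * near 0 yield proportionally larger thresholds, and u near 1 yields
 * thresholds close to 1.
 */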

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
    prof_ctx_t *ret;
    arena_chunk_t *chunk;

    cassert(config_prof);
    assert(ptr != NULL);

    chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
    if (chunk != ptr) {
        /* Region. */
        ret = arena_prof_ctx_get(ptr);
    } else
        ret = huge_prof_ctx_get(ptr);

    return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
    arena_chunk_t *chunk;

    cassert(config_prof);
    assert(ptr != NULL);

    chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
    if (chunk != ptr) {
        /* Region. */
        arena_prof_ctx_set(ptr, ctx);
    } else
        huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
    prof_tdata_t *prof_tdata;

    cassert(config_prof);
    /* Sampling logic is unnecessary if the interval is 1. */
    assert(opt_lg_prof_sample != 0);

    prof_tdata = *prof_tdata_tsd_get();
    assert(prof_tdata != NULL);

    /* Take care to avoid integer overflow. */
    if (size >= prof_tdata->threshold - prof_tdata->accum) {
        prof_tdata->accum -= (prof_tdata->threshold - size);
        /* Compute new sample threshold. */
        prof_sample_threshold_update(prof_tdata);
        while (prof_tdata->accum >= prof_tdata->threshold) {
            prof_tdata->accum -= prof_tdata->threshold;
            prof_sample_threshold_update(prof_tdata);
        }
        return (false);
    } else {
        prof_tdata->accum += size;
        return (true);
    }
}
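
/*
 * Worked example for the overflow-avoiding comparison above (illustrative
 * numbers): with threshold == 2^20 and accum == 2^19, a huge request of
 * size == 2^64 - 2^18 would make the naive test
 * (accum + size >= threshold) wrap around to 2^18 and fail, even though the
 * threshold is clearly crossed. The test actually used,
 * (size >= threshold - accum), compares size against the remaining distance
 * to the threshold (2^19 here) and cannot overflow, because
 * accum < threshold holds between calls.
 */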

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

    cassert(config_prof);
    assert(ptr != NULL);
    assert(size == isalloc(ptr, true));

    if (opt_lg_prof_sample != 0) {
        if (prof_sample_accum_update(size)) {
            /*
             * Don't sample. For malloc()-like allocation, it is
             * always possible to tell in advance how large an
             * object's usable size will be, so there should never
             * be a difference between the size passed to
             * PROF_ALLOC_PREP() and prof_malloc().
             */
            assert((uintptr_t)cnt == (uintptr_t)1U);
        }
    }

    if ((uintptr_t)cnt > (uintptr_t)1U) {
        prof_ctx_set(ptr, cnt->ctx);

        cnt->epoch++;
        /*********/
        mb_write();
        /*********/
        cnt->cnts.curobjs++;
        cnt->cnts.curbytes += size;
        if (opt_prof_accum) {
            cnt->cnts.accumobjs++;
            cnt->cnts.accumbytes += size;
        }
        /*********/
        mb_write();
        /*********/
        cnt->epoch++;
        /*********/
        mb_write();
        /*********/
    } else
        prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
    prof_thr_cnt_t *told_cnt;

    cassert(config_prof);
    assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

    if (ptr != NULL) {
        assert(size == isalloc(ptr, true));
        if (opt_lg_prof_sample != 0) {
            if (prof_sample_accum_update(size)) {
                /*
                 * Don't sample. The size passed to
                 * PROF_ALLOC_PREP() was larger than what
                 * actually got allocated, so a backtrace was
                 * captured for this allocation, even though
                 * its actual size was insufficient to cross
                 * the sample threshold.
                 */
                cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
            }
        }
    }

    if ((uintptr_t)old_ctx > (uintptr_t)1U) {
        told_cnt = prof_lookup(old_ctx->bt);
        if (told_cnt == NULL) {
            /*
             * It's too late to propagate OOM for this realloc(),
             * so operate directly on old_ctx->cnt_merged.
             */
            malloc_mutex_lock(old_ctx->lock);
            old_ctx->cnt_merged.curobjs--;
            old_ctx->cnt_merged.curbytes -= old_size;
            malloc_mutex_unlock(old_ctx->lock);
            told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
        }
    } else
        told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

    if ((uintptr_t)told_cnt > (uintptr_t)1U)
        told_cnt->epoch++;
    if ((uintptr_t)cnt > (uintptr_t)1U) {
        prof_ctx_set(ptr, cnt->ctx);
        cnt->epoch++;
    } else
        prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
    /*********/
    mb_write();
    /*********/
    if ((uintptr_t)told_cnt > (uintptr_t)1U) {
        told_cnt->cnts.curobjs--;
        told_cnt->cnts.curbytes -= old_size;
    }
    if ((uintptr_t)cnt > (uintptr_t)1U) {
        cnt->cnts.curobjs++;
        cnt->cnts.curbytes += size;
        if (opt_prof_accum) {
            cnt->cnts.accumobjs++;
            cnt->cnts.accumbytes += size;
        }
    }
    /*********/
    mb_write();
    /*********/
    if ((uintptr_t)told_cnt > (uintptr_t)1U)
        told_cnt->epoch++;
    if ((uintptr_t)cnt > (uintptr_t)1U)
        cnt->epoch++;
    /*********/
    mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
    prof_ctx_t *ctx = prof_ctx_get(ptr);

    cassert(config_prof);

    if ((uintptr_t)ctx > (uintptr_t)1) {
        assert(size == isalloc(ptr, true));
        prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

        if (tcnt != NULL) {
            tcnt->epoch++;
            /*********/
            mb_write();
            /*********/
            tcnt->cnts.curobjs--;
            tcnt->cnts.curbytes -= size;
            /*********/
            mb_write();
            /*********/
            tcnt->epoch++;
            /*********/
            mb_write();
            /*********/
        } else {
            /*
             * OOM during free() cannot be propagated, so operate
             * directly on ctx->cnt_merged.
             */
            malloc_mutex_lock(ctx->lock);
            ctx->cnt_merged.curobjs--;
            ctx->cnt_merged.curbytes -= size;
            malloc_mutex_unlock(ctx->lock);
        }
    }
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/