prof.h revision 5f60afa01eb2cf7d44024d162a1ecc6cceedcca1
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#ifdef JEMALLOC_PROF
# define PROF_PREFIX_DEFAULT "jeprof"
#else
# define PROF_PREFIX_DEFAULT ""
#endif
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must assure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
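
/*
 * Editorial sketch (not part of the original header): one way a reader could
 * honor the epoch protocol described above and obtain a consistent snapshot
 * of another thread's counters.  prof_thr_cnt_read_sketch() and mb_read()
 * are hypothetical names; this header only shows the writer side, which uses
 * mb_write().  Treat this as an illustration of the protocol, not jemalloc
 * code.
 */
#if 0
JEMALLOC_INLINE prof_cnt_t
prof_thr_cnt_read_sketch(const prof_thr_cnt_t *cnt)
{
	prof_cnt_t snap;
	unsigned epoch0;

	do {
		epoch0 = cnt->epoch;
		mb_read();	/* Hypothetical read barrier. */
		snap = cnt->cnts;
		mb_read();
		/*
		 * Retry if a write was in progress (odd epoch), or if a write
		 * started or finished while the counters were being read.
		 */
	} while ((epoch0 & 1U) != 0 || epoch0 != cnt->epoch);

	return (snap);
}
#endif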

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;	/* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval;	/* lg(prof_interval). */
extern bool	opt_prof_gdump;		/* High-water memory dumping. */
extern bool	opt_prof_final;		/* Final profile dumping. */
extern bool	opt_prof_leak;		/* Dump leak summary at exit. */
extern bool	opt_prof_accum;		/* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get(true);				\
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
		if (prof_tdata != NULL)					\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
		else							\
			ret = NULL;					\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling */\
		/* interval is 1. */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread. */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on */\
		/* whether size is enough for prof_accum to reach */\
		/* prof_tdata->threshold.  However, delay updating */\
		/* these variables until prof_{m,re}alloc(), because */\
		/* we don't know for sure that the allocation will */\
		/* succeed. */\
		/* */\
		/* Use subtraction rather than addition to avoid */\
		/* potential integer overflow. */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
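
/*
 * Editorial sketch (not part of the original header): the expected shape of
 * an allocation site that combines PROF_ALLOC_PREP() and prof_malloc().
 * Only s2u(), PROF_ALLOC_PREP(), and prof_malloc() come from this header;
 * je_malloc_sketch() and imalloc() are assumed placeholders, and the
 * prof_promote small-object promotion mentioned above is omitted.
 */
#if 0
JEMALLOC_INLINE void *
je_malloc_sketch(size_t size)
{
	size_t usize = s2u(size);	/* PROF_ALLOC_PREP() asserts size == s2u(size). */
	prof_thr_cnt_t *cnt;
	void *p;

	/* Decide whether to sample, capturing a backtrace if so. */
	PROF_ALLOC_PREP(1, usize, cnt);
	if (cnt == NULL)
		return (NULL);	/* prof_tdata bootstrap failed (OOM). */

	p = imalloc(usize);	/* Assumed internal allocation routine. */
	if (p == NULL)
		return (NULL);

	/* Attribute the allocation; cnt == (uintptr_t)1 means "not sampled". */
	prof_malloc(p, usize, cnt);
	return (p);
}
#endif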

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	/*
	 * The body of this function is compiled out unless heap profiling is
	 * enabled, so that it is possible to compile jemalloc with floating
	 * point support completely disabled.  Avoiding floating point code is
	 * important on memory-constrained systems, but it also enables a
	 * workaround for versions of glibc that don't properly save/restore
	 * floating point registers during dynamic lazy symbol loading (which
	 * internally calls into whatever malloc implementation happens to be
	 * integrated into the application).  Note that some compilers (e.g.
	 * gcc 4.8) may use floating point registers for fast memory moves, so
	 * jemalloc must be compiled with such optimizations disabled (e.g.
	 * -mno-sse) in order for the workaround to be complete.
	 */
#ifdef JEMALLOC_PROF
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                          __        __
	 *                         |  log(u)    |                     1
	 * prof_tdata->threshold = | ---------- |, where p = -------------------
	 *                         |  log(1-p)  |             opt_lg_prof_sample
	 *                                                   2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://luc.devroye.org/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
#endif
}
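
/*
 * Editorial note (not part of the original header): a worked example of the
 * sampling math above.  With the default opt_lg_prof_sample of 19
 * (LG_PROF_SAMPLE_DEFAULT), p = 2^-19, and the threshold is geometrically
 * distributed with mean 1/p = 2^19 = 524288; on average one backtrace is
 * captured per 512 KiB of allocated bytes.  Setting opt_lg_prof_sample to 0
 * bypasses this logic entirely (every allocation is sampled), as checked in
 * PROF_ALLOC_PREP() and prof_malloc().
 */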

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, usize, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = prof_tdata_get(false);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		return (true);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(usize)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the usize passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(usize)) {
				/*
				 * Don't sample.  The usize passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual usize was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write();	/* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
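
/*
 * Editorial sketch (not part of the original header): the expected shape of a
 * deallocation site that feeds prof_free().  je_free_sketch() and idalloc()
 * are assumed placeholders; isalloc(), prof_free(), and opt_prof come from
 * this header.
 */
#if 0
JEMALLOC_INLINE void
je_free_sketch(void *ptr)
{

	if (ptr == NULL)
		return;
	if (config_prof && opt_prof) {
		/* prof_free() asserts size == isalloc(ptr, true). */
		prof_free(ptr, isalloc(ptr, true));
	}
	idalloc(ptr);	/* Assumed internal deallocation routine. */
}
#endif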