/* prof.h revision 73b37a9697acd53496bbef06ed25696e0c897341 */
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#ifdef JEMALLOC_PROF
#  define PROF_PREFIX_DEFAULT "jeprof"
#else
#  define PROF_PREFIX_DEFAULT ""
#endif
#define LG_PROF_SAMPLE_DEFAULT 19
/*
 * Parenthesized so that the negative value always expands as a single
 * expression, regardless of the context in which the macro is used.
 */
#define LG_PROF_INTERVAL_DEFAULT (-1)

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define PROF_BT_MAX 128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define PROF_TCMAX 1024

/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUFSIZE 65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define PROF_PRINTF_BUFSIZE 128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define PROF_NCTX_LOCKS 1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define PROF_TDATA_STATE_REINCARNATED ((prof_tdata_t *)(uintptr_t)1)
#define PROF_TDATA_STATE_PURGATORY ((prof_tdata_t *)(uintptr_t)2)
#define PROF_TDATA_STATE_MAX PROF_TDATA_STATE_PURGATORY

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must assure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		bytes_until_sample;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;	/* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval;	/* lg(prof_interval). */
extern bool	opt_prof_gdump;		/* High-water memory dumping. */
extern bool	opt_prof_final;		/* Final profile dumping. */
extern bool	opt_prof_leak;		/* Dump leak summary at exit. */
extern bool	opt_prof_accum;		/* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

/*
 * Prepare for a possibly-sampled allocation of size bytes (size must already
 * be a usable-size class; see the s2u() assertion).  On completion, ret is
 * either (prof_thr_cnt_t *)1U, meaning "do not sample this allocation", or
 * the thread-specific counter object to attribute the allocation to.
 */
#define PROF_ALLOC_PREP(size, ret) do {					\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	if (!opt_prof_active ||						\
	    prof_sample_accum_update(size, false, &prof_tdata)) {	\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else {							\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt);					\
		ret = prof_lookup(&bt);					\
	}								\
} while (0)

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
bool	prof_sample_accum_update(size_t size, bool commit,
    prof_tdata_t **prof_tdata_out);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
void	prof_malloc_record_object(const void *ptr, size_t usize,
    prof_thr_cnt_t *cnt);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

/*
 * Return the calling thread's prof_tdata, lazily initializing it via
 * prof_tdata_init() if create is true and it does not exist yet.  May return
 * NULL, or one of the PROF_TDATA_STATE_* sentinels during thread shutdown.
 */
JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

/*
 * Look up the prof_ctx_t associated with ptr, dispatching on whether ptr is a
 * region within an arena chunk or a huge (chunk-aligned) allocation.
 */
JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

/*
 * Associate ctx with ptr, using the same arena-region vs. huge-allocation
 * dispatch as prof_ctx_get().
 */
JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

/*
 * Advance the per-thread sampling state for an allocation of size bytes
 * (decrementing the countdown only if commit is true).  Return true if the
 * allocation should NOT be sampled (no usable prof_tdata, or the sample
 * threshold has not yet been reached); return false if it should be sampled,
 * in which case the threshold is recomputed when commit is true.  If
 * prof_tdata_out is non-NULL, *prof_tdata_out is set to the thread's
 * prof_tdata (NULL if unusable).
 */
JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size, bool commit,
    prof_tdata_t **prof_tdata_out)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = prof_tdata_get(true);
	/* Sentinel values <= PROF_TDATA_STATE_MAX mean "no usable tdata". */
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		prof_tdata = NULL;

	if (prof_tdata_out != NULL)
		*prof_tdata_out = prof_tdata;

	if (prof_tdata == NULL)
		return (true);

	if (prof_tdata->bytes_until_sample >= size) {
		if (commit)
			prof_tdata->bytes_until_sample -= size;
		return (true);
	} else {
		/* Compute new sample threshold. */
		if (commit)
			prof_sample_threshold_update(prof_tdata);
		return (false);
	}
}

/*
 * Attribute a sampled allocation of usize bytes at ptr to cnt, using the
 * odd/even epoch protocol documented in prof_thr_cnt_s so that concurrent
 * readers can obtain a consistent snapshot of the counters.
 */
JEMALLOC_INLINE void
prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{
	prof_ctx_set(ptr, cnt->ctx);

	cnt->epoch++;
	/*********/
	mb_write();
	/*********/
	cnt->cnts.curobjs++;
	cnt->cnts.curbytes += usize;
	if (opt_prof_accum) {
		cnt->cnts.accumobjs++;
		cnt->cnts.accumbytes += usize;
	}
	/*********/
	mb_write();
	/*********/
	cnt->epoch++;
	/*********/
	mb_write();
	/*********/
}

/*
 * Record an allocation of usize bytes at ptr.  cnt is the result of a prior
 * PROF_ALLOC_PREP() call: (uintptr_t)1U means "not sampled", in which case
 * only the (uintptr_t)1U ctx sentinel is stored for ptr.
 */
JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (prof_sample_accum_update(usize, true, NULL)) {
		/*
		 * Don't sample.  For malloc()-like allocation, it is
		 * always possible to tell in advance how large an
		 * object's usable size will be, so there should never
		 * be a difference between the usize passed to
		 * PROF_ALLOC_PREP() and prof_malloc().
		 */
		assert((uintptr_t)cnt == (uintptr_t)1U);
	}

	if ((uintptr_t)cnt > (uintptr_t)1U)
		prof_malloc_record_object(ptr, usize, cnt);
	else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

/*
 * Record a reallocation: credit usize bytes at ptr to cnt (if sampled) and
 * debit old_usize bytes from the counters associated with old_ctx (if the old
 * allocation was sampled).  Both updates follow the epoch protocol; OOM while
 * looking up the old context's counters falls back to updating
 * old_ctx->cnt_merged under old_ctx->lock.
 */
JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (prof_sample_accum_update(usize, true, NULL)) {
			/*
			 * Don't sample.  The usize passed to
			 * PROF_ALLOC_PREP() was larger than what
			 * actually got allocated, so a backtrace was
			 * captured for this allocation, even though
			 * its actual usize was insufficient to cross
			 * the sample threshold.
			 */
			cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_cnt->ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

/*
 * Record the deallocation of size bytes at ptr, debiting the counters of the
 * context the allocation was attributed to.  OOM while looking up this
 * thread's counter object falls back to updating ctx->cnt_merged under
 * ctx->lock.
 */
JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1U) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on cnt->ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/