prof.h revision 7427525c28d58c423a68930160e3b0fe577fe953
#ifdef JEMALLOC_PROF
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define PROF_PREFIX_DEFAULT "jeprof"
#define LG_PROF_BT_MAX_DEFAULT 7
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1
#define LG_PROF_TCMAX_DEFAULT -1

/*
 * Hard limit on stack backtrace depth.  Note that the version of
 * prof_backtrace() that is based on __builtin_return_address() necessarily has
 * a hard-coded number of backtrace frame handlers.
 */
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
#  define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
#  define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define PROF_BT_MAX (1U << LG_PROF_BT_MAX)

/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUF_SIZE 65536

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
    /* Backtrace, stored as len program counters. */
    void **vec;
    unsigned len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
    prof_bt_t *bt;
    unsigned nignore;
    unsigned max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
    /*
     * Profiling counters.  An allocation/deallocation pair can operate on
     * different prof_thr_cnt_t objects that are linked into the same
     * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
     * negative.  In principle it is possible for the *bytes counters to
     * overflow/underflow, but a general solution would require something
     * like 128-bit counters; this implementation doesn't bother to solve
     * that problem.
     */
    int64_t curobjs;
    int64_t curbytes;
    uint64_t accumobjs;
    uint64_t accumbytes;
};

struct prof_thr_cnt_s {
    /* Linkage into prof_ctx_t's cnts_ql. */
    ql_elm(prof_thr_cnt_t) cnts_link;

    /* Linkage into thread's LRU. */
    ql_elm(prof_thr_cnt_t) lru_link;

    /*
     * Associated context.  If a thread frees an object that it did not
     * allocate, it is possible that the context is not cached in the
     * thread's hash table, in which case it must be able to look up the
     * context, insert a new prof_thr_cnt_t into the thread's hash table,
     * and link it into the prof_ctx_t's cnts_ql.
     */
    prof_ctx_t *ctx;

    /*
     * Threads use memory barriers to update the counters.  Since there is
     * only ever one writer, the only challenge is for the reader to get a
     * consistent read of the counters.
     *
     * The writer uses this series of operations:
     *
     * 1) Increment epoch to an odd number.
     * 2) Update counters.
     * 3) Increment epoch to an even number.
     *
     * The reader must ensure 1) that the epoch is even while it reads the
     * counters, and 2) that the epoch doesn't change between the time it
     * starts and finishes reading the counters.
     */
    unsigned epoch;

    /* Profiling counters. */
    prof_cnt_t cnts;
};
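
/*
 * Illustrative reader for the epoch protocol described above (a sketch, not
 * part of this header's API; prof.c's dump-time summation uses a loop along
 * these lines).  thr_cnt names an arbitrary prof_thr_cnt_t being read; on
 * weakly ordered CPUs a read barrier would also belong between the epoch
 * loads and the counter copy:
 *
 *    prof_cnt_t tcnt;
 *    volatile unsigned *epoch = &thr_cnt->epoch;
 *    unsigned epoch0;
 *
 *    for (;;) {
 *        epoch0 = *epoch;
 *        if (epoch0 & 1U)
 *            continue;    <- Odd epoch: writer is mid-update; retry.
 *        memcpy(&tcnt, &thr_cnt->cnts, sizeof(prof_cnt_t));
 *        if (*epoch == epoch0)
 *            break;       <- Epoch unchanged: tcnt is consistent.
 *    }
 */
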
struct prof_ctx_s {
    /* Associated backtrace. */
    prof_bt_t *bt;

    /* Protects cnt_merged and cnts_ql. */
    malloc_mutex_t lock;

    /* Temporary storage for summation during dump. */
    prof_cnt_t cnt_summed;

    /* When threads exit, they merge their stats into cnt_merged. */
    prof_cnt_t cnt_merged;

    /*
     * List of profile counters, one for each thread that has allocated in
     * this context.
     */
    ql_head(prof_thr_cnt_t) cnts_ql;
};

struct prof_tdata_s {
    /*
     * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
     * cache of backtraces, with associated thread-specific prof_thr_cnt_t
     * objects.  Other threads may read the prof_thr_cnt_t contents, but no
     * others will ever write them.
     *
     * Upon thread exit, the thread must merge all the prof_thr_cnt_t
     * counter data into the associated prof_ctx_t objects, and unlink/free
     * the prof_thr_cnt_t objects.
     */
    ckh_t bt2cnt;

    /* LRU for contents of bt2cnt. */
    ql_head(prof_thr_cnt_t) lru_ql;

    /* Backtrace vector, used for calls to prof_backtrace(). */
    void **vec;

    /* Sampling state. */
    uint64_t prn_state;
    uint64_t threshold;
    uint64_t accum;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max;    /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample;    /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump;          /* High-water memory dumping. */
extern bool opt_prof_leak;           /* Dump leak summary at exit. */
extern bool opt_prof_accum;          /* Report cumulative bytes. */
extern ssize_t opt_lg_prof_tcmax;    /* lg(max per-thread backtrace cache). */
extern char opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool prof_promote;

/* (1U << opt_lg_prof_bt_max). */
extern unsigned prof_bt_max;

/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t *prof_tdata_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
#  define PROF_TCACHE_GET() prof_tdata_tls
#  define PROF_TCACHE_SET(v) do {                                      \
    prof_tdata_tls = (v);                                              \
    pthread_setspecific(prof_tdata_tsd, (void *)(v));                  \
} while (0)
#else
#  define PROF_TCACHE_GET()                                            \
    ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
#  define PROF_TCACHE_SET(v) do {                                      \
    pthread_setspecific(prof_tdata_tsd, (void *)(v));                  \
} while (0)
#endif
/*
 * Same contents as prof_tdata_tls, but initialized such that the TSD
 * destructor is called when a thread exits, so that prof_tdata_tls contents
 * can be merged, unlinked, and deallocated.
 */
extern pthread_key_t prof_tdata_tsd;

void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
void prof_idump(void);
bool prof_mdump(const char *filename);
void prof_gdump(void);
prof_tdata_t *prof_tdata_init(void);
void prof_boot0(void);
void prof_boot1(void);
bool prof_boot2(void);
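
/*
 * How prof_lookup() ties the structures above together (a pseudocode sketch
 * only; the real implementation in prof.c adds locking, OOM handling, and
 * LRU eviction bounded by opt_lg_prof_tcmax):
 *
 *    tdata = PROF_TCACHE_GET();
 *    if (bt is already in tdata->bt2cnt)
 *        move its prof_thr_cnt_t to the head of tdata->lru_ql;
 *    else {
 *        find or create the global prof_ctx_t for bt;
 *        allocate a prof_thr_cnt_t and link it into ctx->cnts_ql;
 *        insert it into tdata->bt2cnt, evicting the tail of
 *            tdata->lru_ql if the cache is full;
 *    }
 *    return the prof_thr_cnt_t;
 */
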
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#ifndef JEMALLOC_ENABLE_INLINE
void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool prof_sample_accum_update(size_t size);
void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
    uint64_t r;
    double u;

    /*
     * Compute sample threshold as a geometrically distributed random
     * variable with mean (2^opt_lg_prof_sample).
     *
     *                          __        __
     *                         |  log(u)    |                     1
     * prof_tdata->threshold = | ---------- |, where p = -------------------
     *                         | log(1-p)   |             opt_lg_prof_sample
     *                                                   2
     *
     * For more information on the math, see:
     *
     *   Non-Uniform Random Variate Generation
     *   Luc Devroye
     *   Springer-Verlag, New York, 1986
     *   pp 500
     *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
     */
    prn64(r, 53, prof_tdata->prn_state,
        (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
    u = (double)r * (1.0/9007199254740992.0L);
    prof_tdata->threshold = (uint64_t)(log(u) /
        log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
        + (uint64_t)1U;
}
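
/*
 * Worked instance of the formula above (the numbers are illustrative, not
 * from the source): with opt_lg_prof_sample == 19, p == 2^-19 ~= 1.907e-6.
 * A draw of u == 0.5 yields
 *
 *    threshold = ceil(log(0.5) / log(1 - 2^-19)) ~= 363409 bytes,
 *
 * and averaging over u uniform in (0, 1) gives E[threshold] == 1/p ==
 * 524288 == 2^19 bytes, i.e. the intended mean sample interval.
 */
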
JEMALLOC_INLINE prof_thr_cnt_t *
prof_alloc_prep(size_t size)
{
#ifdef JEMALLOC_ENABLE_INLINE
    /*
     * This function does not have its own stack frame, because it is
     * inlined.
     */
#  define NIGNORE 1
#else
#  define NIGNORE 2
#endif
    prof_thr_cnt_t *ret;
    prof_tdata_t *prof_tdata;
    prof_bt_t bt;

    assert(size == s2u(size));

    prof_tdata = PROF_TCACHE_GET();
    if (prof_tdata == NULL) {
        prof_tdata = prof_tdata_init();
        if (prof_tdata == NULL)
            return (NULL);
    }

    if (opt_prof_active == false) {
        /* Sampling is currently inactive, so avoid sampling. */
        ret = (prof_thr_cnt_t *)(uintptr_t)1U;
    } else if (opt_lg_prof_sample == 0) {
        /*
         * Don't bother with sampling logic, since sampling interval is
         * 1.
         */
        bt_init(&bt, prof_tdata->vec);
        prof_backtrace(&bt, NIGNORE, prof_bt_max);
        ret = prof_lookup(&bt);
    } else {
        if (prof_tdata->threshold == 0) {
            /*
             * Initialize.  Seed the prng differently for each
             * thread.
             */
            prof_tdata->prn_state = (uint64_t)(uintptr_t)&size;
            prof_sample_threshold_update(prof_tdata);
        }

        /*
         * Determine whether to capture a backtrace based on whether
         * size is enough for prof_tdata->accum to reach
         * prof_tdata->threshold.  However, delay updating these
         * variables until prof_{m,re}alloc(), because we don't know
         * for sure that the allocation will succeed.
         *
         * Use subtraction rather than addition to avoid potential
         * integer overflow.
         */
        if (size >= prof_tdata->threshold - prof_tdata->accum) {
            bt_init(&bt, prof_tdata->vec);
            prof_backtrace(&bt, NIGNORE, prof_bt_max);
            ret = prof_lookup(&bt);
        } else
            ret = (prof_thr_cnt_t *)(uintptr_t)1U;
    }

    return (ret);
#undef NIGNORE
}

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
    prof_ctx_t *ret;
    arena_chunk_t *chunk;

    assert(ptr != NULL);

    chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
    if (chunk != ptr) {
        /* Region. */
        dassert(chunk->arena->magic == ARENA_MAGIC);

        ret = arena_prof_ctx_get(ptr);
    } else
        ret = huge_prof_ctx_get(ptr);

    return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
    arena_chunk_t *chunk;

    assert(ptr != NULL);

    chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
    if (chunk != ptr) {
        /* Region. */
        dassert(chunk->arena->magic == ARENA_MAGIC);

        arena_prof_ctx_set(ptr, ctx);
    } else
        huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
    prof_tdata_t *prof_tdata;

    /* Sampling logic is unnecessary if the interval is 1. */
    assert(opt_lg_prof_sample != 0);

    prof_tdata = PROF_TCACHE_GET();
    assert(prof_tdata != NULL);

    /* Take care to avoid integer overflow. */
    if (size >= prof_tdata->threshold - prof_tdata->accum) {
        prof_tdata->accum -= (prof_tdata->threshold - size);
        /* Compute new sample threshold. */
        prof_sample_threshold_update(prof_tdata);
        while (prof_tdata->accum >= prof_tdata->threshold) {
            prof_tdata->accum -= prof_tdata->threshold;
            prof_sample_threshold_update(prof_tdata);
        }
        return (false);
    } else {
        prof_tdata->accum += size;
        return (true);
    }
}
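
/*
 * Expected caller sequence for prof_malloc() (a sketch; the actual
 * allocation paths in jemalloc.c also handle failure cleanup and small
 * object promotion via prof_promote):
 *
 *    usize = s2u(size);
 *    cnt = prof_alloc_prep(usize);
 *    if (cnt == NULL)
 *        return (NULL);
 *    p = <allocate usize bytes>;
 *    if (p == NULL)
 *        return (NULL);
 *    prof_malloc(p, usize, cnt);
 *    return (p);
 */
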
JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

    assert(ptr != NULL);
    assert(size == isalloc(ptr));

    if (opt_lg_prof_sample != 0) {
        if (prof_sample_accum_update(size)) {
            /*
             * Don't sample.  For malloc()-like allocation, it is
             * always possible to tell in advance how large an
             * object's usable size will be, so there should never
             * be a difference between the size passed to
             * prof_alloc_prep() and prof_malloc().
             */
            assert((uintptr_t)cnt == (uintptr_t)1U);
        }
    }

    if ((uintptr_t)cnt > (uintptr_t)1U) {
        prof_ctx_set(ptr, cnt->ctx);

        cnt->epoch++;
        /*********/
        mb_write();
        /*********/
        cnt->cnts.curobjs++;
        cnt->cnts.curbytes += size;
        if (opt_prof_accum) {
            cnt->cnts.accumobjs++;
            cnt->cnts.accumbytes += size;
        }
        /*********/
        mb_write();
        /*********/
        cnt->epoch++;
        /*********/
        mb_write();
        /*********/
    } else
        prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
    prof_thr_cnt_t *told_cnt;

    assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

    if (ptr != NULL) {
        assert(size == isalloc(ptr));
        if (opt_lg_prof_sample != 0) {
            if (prof_sample_accum_update(size)) {
                /*
                 * Don't sample.  The size passed to
                 * prof_alloc_prep() was larger than what
                 * actually got allocated, so a backtrace was
                 * captured for this allocation, even though
                 * its actual size was insufficient to cross
                 * the sample threshold.
                 */
                cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
            }
        }
    }

    if ((uintptr_t)old_ctx > (uintptr_t)1U) {
        told_cnt = prof_lookup(old_ctx->bt);
        if (told_cnt == NULL) {
            /*
             * It's too late to propagate OOM for this realloc(),
             * so operate directly on old_ctx->cnt_merged.
             */
            malloc_mutex_lock(&old_ctx->lock);
            old_ctx->cnt_merged.curobjs--;
            old_ctx->cnt_merged.curbytes -= old_size;
            malloc_mutex_unlock(&old_ctx->lock);
            told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
        }
    } else
        told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

    if ((uintptr_t)told_cnt > (uintptr_t)1U)
        told_cnt->epoch++;
    if ((uintptr_t)cnt > (uintptr_t)1U) {
        prof_ctx_set(ptr, cnt->ctx);
        cnt->epoch++;
    } else
        prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
    /*********/
    mb_write();
    /*********/
    if ((uintptr_t)told_cnt > (uintptr_t)1U) {
        told_cnt->cnts.curobjs--;
        told_cnt->cnts.curbytes -= old_size;
    }
    if ((uintptr_t)cnt > (uintptr_t)1U) {
        cnt->cnts.curobjs++;
        cnt->cnts.curbytes += size;
        if (opt_prof_accum) {
            cnt->cnts.accumobjs++;
            cnt->cnts.accumbytes += size;
        }
    }
    /*********/
    mb_write();
    /*********/
    if ((uintptr_t)told_cnt > (uintptr_t)1U)
        told_cnt->epoch++;
    if ((uintptr_t)cnt > (uintptr_t)1U)
        cnt->epoch++;
    /*********/
    mb_write(); /* Not strictly necessary. */
}
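
/*
 * Expected caller sequence for prof_realloc(), mirroring the sketch before
 * prof_malloc() (again illustrative; the real path in jemalloc.c captures
 * old_size and old_ctx before reallocating and handles failure separately):
 *
 *    old_size = isalloc(p);
 *    old_ctx = prof_ctx_get(p);
 *    usize = s2u(size);
 *    cnt = prof_alloc_prep(usize);
 *    q = <reallocate to usize bytes>;
 *    if (q != NULL)
 *        prof_realloc(q, usize, cnt, old_size, old_ctx);
 */
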
549 */ 550 malloc_mutex_lock(&ctx->lock); 551 ctx->cnt_merged.curobjs--; 552 ctx->cnt_merged.curbytes -= size; 553 malloc_mutex_unlock(&ctx->lock); 554 } 555 } 556} 557#endif 558 559#endif /* JEMALLOC_H_INLINES */ 560/******************************************************************************/ 561#endif /* JEMALLOC_PROF */ 562