prof.h revision 7372b15a31c63ac5cb9ed8aeabc2a0a3c005e8bf
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_BT_MAX_DEFAULT		7
#define	LG_PROF_SAMPLE_DEFAULT		0
#define	LG_PROF_INTERVAL_DEFAULT	-1
#define	LG_PROF_TCMAX_DEFAULT		-1

/*
 * Hard limit on stack backtrace depth.  Note that the version of
 * prof_backtrace() that is based on __builtin_return_address() necessarily has
 * a hard-coded number of backtrace frame handlers.
 */
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
# define LG_PROF_BT_MAX	((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
# define LG_PROF_BT_MAX	7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define	PROF_BT_MAX	(1U << LG_PROF_BT_MAX)

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS	64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUF_SIZE	65536

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must assure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
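
#if 0
/*
 * Illustrative sketch, not part of jemalloc: a reader that follows the epoch
 * protocol documented in prof_thr_cnt_s above.  The function name is
 * hypothetical, and a real implementation would also need read barriers
 * pairing with the writer's mb_write() calls (the actual reader lives in
 * prof.c's dump code).
 */
JEMALLOC_INLINE prof_cnt_t
prof_thr_cnt_read_sketch(const prof_thr_cnt_t *cnt)
{
	prof_cnt_t snap;
	unsigned epoch0;

	do {
		/* Spin until the epoch is even (no write in progress). */
		while ((epoch0 = cnt->epoch) & 1U)
			;
		snap = cnt->cnts;
		/* Retry if a writer raced with the copy above. */
	} while (cnt->epoch != epoch0);
	return (snap);
}
#endif
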
struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects cnt_merged and cnts_ql. */
	malloc_mutex_t		lock;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prn_state;
	uint64_t		threshold;
	uint64_t		accum;
};
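
#if 0
/*
 * Illustrative sketch, not part of jemalloc: how the bt2cnt cache and lru_ql
 * fields above cooperate during a backtrace lookup, loosely following what
 * prof_lookup() does in prof.c.  The function name is hypothetical, and
 * creation/eviction of counters on a cache miss is omitted.
 */
JEMALLOC_INLINE prof_thr_cnt_t *
bt2cnt_lookup_sketch(prof_tdata_t *prof_tdata, prof_bt_t *bt)
{
	union {
		prof_thr_cnt_t	*p;
		void		*v;
	} ret;

	if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) {
		/* Miss; prof_lookup() would create and link a counter. */
		return (NULL);
	}
	/* Hit; move the counter to the MRU end of the LRU list. */
	ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
	ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link);
	return (ret.p);
}
#endif
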
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_bt_max;   /* Maximum backtrace depth. */
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern ssize_t	opt_lg_prof_tcmax;    /* lg(max per thread backtrace cache). */
extern char	opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

/* (1U << opt_lg_prof_bt_max). */
extern unsigned	prof_bt_max;

/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t	*prof_tdata_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
# define PROF_TCACHE_GET()	prof_tdata_tls
# define PROF_TCACHE_SET(v)	do {					\
	prof_tdata_tls = (v);						\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#else
# define PROF_TCACHE_GET()						\
	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
# define PROF_TCACHE_SET(v)	do {					\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#endif
/*
 * Same contents as prof_tdata_tls, but initialized such that the TSD
 * destructor is called when a thread exits, so that prof_tdata_tls contents
 * can be merged, unlinked, and deallocated.
 */
extern pthread_key_t	prof_tdata_tsd;
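
#if 0
/*
 * Illustrative sketch, not part of jemalloc: the canonical access pattern for
 * the thread-specific cache declared above, as also used by PROF_ALLOC_PREP()
 * below.  The helper name is hypothetical; prof_tdata_init() registers the
 * new tdata via PROF_TCACHE_SET() before returning, and returns NULL on OOM.
 */
JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get_sketch(void)
{
	prof_tdata_t *prof_tdata = PROF_TCACHE_GET();

	if (prof_tdata == NULL)
		prof_tdata = prof_tdata_init();
	return (prof_tdata);
}
#endif
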
void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = PROF_TCACHE_GET();					\
	if (prof_tdata == NULL) {					\
		prof_tdata = prof_tdata_init();				\
		if (prof_tdata == NULL) {				\
			ret = NULL;					\
			break;						\
		}							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore, prof_bt_max);		\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prn_state =				\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_accum to reach     */\
		/* prof_tdata->threshold.  However, delay updating    */\
		/* these variables until prof_{m,re}alloc(), because  */\
		/* we don't know for sure that the allocation will    */\
		/* succeed.                                           */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore, prof_bt_max);	\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)

#ifndef JEMALLOC_ENABLE_INLINE
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                          __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
	 */
	prn64(r, 53, prof_tdata->prn_state,
	    (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
}
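
#if 0
/*
 * Illustrative sketch, not part of jemalloc: the inverse-CDF construction
 * above yields a geometric random variable.  With p = 2^-lg_sample and u
 * uniform in (0, 1), E[ceil(log(u) / log(1-p))] == 1/p, so e.g.
 * opt_lg_prof_sample == 19 gives a mean of 2^19 bytes (512 KiB) between
 * samples.  The function name is hypothetical.
 */
JEMALLOC_INLINE uint64_t
prof_threshold_sketch(double u, unsigned lg_sample)
{
	double p = 1.0 / (double)((uint64_t)1U << lg_sample);

	/* Same computation as prof_sample_threshold_update() above. */
	return ((uint64_t)(log(u) / log(1.0 - p)) + (uint64_t)1U);
}
#endif
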
JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		assert(chunk->arena->magic == ARENA_MAGIC);

		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		assert(chunk->arena->magic == ARENA_MAGIC);

		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = PROF_TCACHE_GET();
	assert(prof_tdata != NULL);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(size == isalloc(ptr));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}
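
#if 0
/*
 * Illustrative sketch, not part of jemalloc: how PROF_ALLOC_PREP() and
 * prof_malloc() pair up on an allocation path, loosely modeled on the
 * public malloc() wrapper in jemalloc.c.  The function name and the
 * imalloc() call are hypothetical; note that the size handed to
 * PROF_ALLOC_PREP() must already be a usable size (size == s2u(size)).
 */
static void *
prof_malloc_path_sketch(size_t usize)
{
	prof_thr_cnt_t *cnt;
	void *p;

	PROF_ALLOC_PREP(1, usize, cnt);
	if (cnt == NULL)
		return (NULL);	/* OOM while initializing prof_tdata. */
	p = imalloc(usize);	/* Hypothetical internal allocator call. */
	if (p == NULL)
		return (NULL);	/* Counters were deliberately not updated. */
	prof_malloc(p, usize, cnt);
	return (p);
}
#endif
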
JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  The size passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(&old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(&old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		assert(size == isalloc(ptr));
		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(&ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(&ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/