prof.h revision 3fb50b0407ff7dfe14727995706e2b42836f0f7e
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.  (A reader sketch follows
	 * this struct definition.)
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
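
/*
 * Illustrative reader for the epoch protocol described above; a minimal
 * sketch under the stated protocol, not part of this interface (readers of
 * this kind appear in the dump code, e.g. when summing counters into
 * cnt_summed).  Memory barriers are elided for brevity:
 *
 *	prof_cnt_t tcnt;
 *	volatile unsigned *epoch = &cnt->epoch;
 *	while (true) {
 *		unsigned epoch0 = *epoch;
 *		if (epoch0 & 1U)
 *			continue;	<-- write in progress (epoch is odd)
 *		memcpy(&tcnt, &cnt->cnts, sizeof(prof_cnt_t));
 *		if (*epoch == epoch0)
 *			break;		<-- epoch unchanged; tcnt is consistent
 *	}
 */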

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get();					\
	if (prof_tdata == NULL) {					\
		ret = NULL;						\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_accum to reach     */\
		/* prof_tdata->threshold.  However, delay updating    */\
		/* these variables until prof_{m,re}alloc(), because  */\
		/* we don't know for sure that the allocation will    */\
		/* succeed.                                           */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
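
/*
 * Typical use of PROF_ALLOC_PREP(), shown as a hedged sketch of a
 * malloc()-like caller; usize and p are illustrative names and the actual
 * allocation call is elided:
 *
 *	size_t usize = s2u(size);
 *	prof_thr_cnt_t *cnt;
 *	void *p;
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);	<-- thread-specific prof data unavailable
 *	p = <allocate usize bytes>;
 *	if (p == NULL)
 *		return (NULL);
 *	prof_malloc(p, usize, cnt);
 *
 * A result of (prof_thr_cnt_t *)(uintptr_t)1U means "allocate, but do not
 * sample"; prof_malloc() and prof_realloc() check for that value explicitly.
 */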

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(void);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(void)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
}
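
/*
 * Worked example for the formula above: with the default
 * LG_PROF_SAMPLE_DEFAULT of 19, p = 1/2^19, so the computed thresholds are
 * geometrically distributed with mean 1/p = 2^19 = 524288 bytes of allocation
 * activity between samples.
 */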

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = *prof_tdata_tsd_get();
	assert(prof_tdata != NULL);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}
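
/*
 * Worked example for the overflow-avoiding comparison above (values are
 * illustrative): with threshold = 2^19 and accum = 2^19 - 100, a 4096-byte
 * allocation satisfies (size >= threshold - accum), i.e. 4096 >= 100, so the
 * allocation is sampled and accum becomes (2^19 - 100) - (2^19 - 4096) = 3996
 * toward the next threshold.  The subtraction (threshold - accum) cannot wrap
 * because accum is always kept below threshold, whereas (accum + size) could
 * overflow for very large size values.
 */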

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(size == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  The size passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		assert(size == isalloc(ptr, true));
		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/