prof.h revision eefdd02e70ec1b9cf11920fcff585835dcbd766b
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY
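
/*
 * Illustrative only (added commentary): code that obtains a prof_tdata_t
 * pointer distinguishes the sentinel states above from a live structure by
 * comparing against PROF_TDATA_STATE_MAX, as in
 *
 *	prof_tdata_t *prof_tdata = prof_tdata_get(false);
 *	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {
 *		... NULL, REINCARNATED, or PURGATORY; no live tdata ...
 *	}
 *
 * PROF_ALLOC_PREP() and prof_sample_accum_update() below use this pattern.
 */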

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
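
/*
 * Reader sketch (illustrative only, assuming the epoch protocol documented in
 * prof_thr_cnt_s above; cnt is a prof_thr_cnt_t *): retry until the epoch is
 * observed even (no write in progress) and unchanged across the copy, at
 * which point tcnt holds a consistent snapshot of the counters.
 *
 *	prof_cnt_t tcnt;
 *	volatile unsigned *epoch = &cnt->epoch;
 *	while (true) {
 *		unsigned epoch0 = *epoch;
 *		if (epoch0 & 1U)
 *			continue;
 *		memcpy(&tcnt, &cnt->cnts, sizeof(prof_cnt_t));
 *		if (*epoch == epoch0)
 *			break;
 *	}
 */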

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* lg(mean bytes between samples). */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;
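
/*
 * Illustrative arithmetic (added commentary, assuming prof_interval is
 * derived as 2^opt_lg_prof_interval when interval-based dumping is enabled):
 * with opt_lg_prof_interval == 30, prof_interval is 1 GiB.  Dumps then
 * average one per GiB allocated process-wide, but because each arena counts
 * its own allocations against the threshold, the gap between two dumps can
 * approach prof_interval * narenas, e.g. ~4 GiB with 4 arenas.
 */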

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get(true);				\
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
		if (prof_tdata != NULL)					\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
		else							\
			ret = NULL;					\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_accum to reach     */\
		/* prof_tdata->threshold.  However, delay updating    */\
		/* these variables until prof_{m,re}alloc(), because  */\
		/* we don't know for sure that the allocation will    */\
		/* succeed.                                           */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
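
/*
 * Usage sketch (illustrative only; the caller shown here is assumed, not part
 * of this header): an allocation path invokes PROF_ALLOC_PREP() before
 * allocating and reports the result via prof_malloc() afterward.  A ret value
 * of (prof_thr_cnt_t *)(uintptr_t)1U means "account for the allocation but do
 * not sample it"; NULL means thread data could not be initialized.
 *
 *	void *p;
 *	size_t usize = s2u(size);
 *	prof_thr_cnt_t *cnt;
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);
 *	p = imalloc(usize);
 *	if (p == NULL)
 *		return (NULL);
 *	prof_malloc(p, usize, cnt);
 *	return (p);
 */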

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	/*
	 * The body of this function is compiled out unless heap profiling is
	 * enabled, so that it is possible to compile jemalloc with floating
	 * point support completely disabled.  Avoiding floating point code is
	 * important on memory-constrained systems, but it also enables a
	 * workaround for versions of glibc that don't properly save/restore
	 * floating point registers during dynamic lazy symbol loading (which
	 * internally calls into whatever malloc implementation happens to be
	 * integrated into the application).  Note that some compilers (e.g.
	 * gcc 4.8) may use floating point registers for fast memory moves, so
	 * jemalloc must be compiled with such optimizations disabled (e.g.
	 * -mno-sse) in order for the workaround to be complete.
	 */
#ifdef JEMALLOC_PROF
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://luc.devroye.org/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
#endif
}
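
/*
 * Informational note (added commentary): the threshold above is a geometric
 * variate with success probability p = 2^-opt_lg_prof_sample, so its mean is
 * 1/p = 2^opt_lg_prof_sample bytes; with the default LG_PROF_SAMPLE_DEFAULT
 * of 19 that is 2^19 = 524288 bytes (512 KiB) between samples on average.
 * The constant 9007199254740992.0 is 2^53, so the multiplication maps the 53
 * random bits produced by prng64() onto [0, 1).
 */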

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}
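
/*
 * Note (added commentary): CHUNK_ADDR2BASE() rounds ptr down to its chunk
 * base.  A pointer that does not coincide with the chunk base refers to a
 * region inside an arena-managed chunk, so the ctx lives in arena metadata;
 * a chunk-aligned pointer is a huge allocation, handled by the
 * huge_prof_ctx_*() functions.  prof_ctx_set() below uses the same dispatch.
 */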

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, usize, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = prof_tdata_get(false);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		return (true);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}
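
/*
 * Worked example (illustrative numbers only): suppose threshold == 1000,
 * accum == 900, and a 250-byte allocation arrives.  Since 250 >= 1000 - 900,
 * the allocation is sampled and the function returns false; accum becomes
 * 900 - (1000 - 250) == 150, i.e. the excess carried toward the next
 * threshold, which is then redrawn by prof_sample_threshold_update().
 * Writing the test as size >= threshold - accum (rather than
 * accum + size >= threshold) follows the overflow-avoidance note above.
 */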

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(usize)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the usize passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(usize)) {
				/*
				 * Don't sample.  The usize passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual usize was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
