prof.h revision 5f60afa01eb2cf7d44024d162a1ecc6cceedcca1
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#ifdef JEMALLOC_PROF
#  define PROF_PREFIX_DEFAULT		"jeprof"
#else
#  define PROF_PREFIX_DEFAULT		""
#endif
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128
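
/*
 * Illustrative sketch (not part of this header): __builtin_return_address()
 * requires a compile-time constant frame index, so the prof_backtrace()
 * flavor built on it cannot loop over frames; it must expand one handler per
 * frame, conceptually along the lines of the macro below (the macro, counter,
 * and buffer names here are hypothetical).  The number of expansions is what
 * must stay in sync with PROF_BT_MAX.
 *
 *	#define BT_FRAME(i)						\
 *		if ((i) < max_frames) {					\
 *			void *p;					\
 *			if (__builtin_frame_address(i) == 0)		\
 *				return;					\
 *			p = __builtin_return_address(i);		\
 *			if (p == NULL)					\
 *				return;					\
 *			vec[nframes++] = p;				\
 *		} else							\
 *			return;
 *
 *	BT_FRAME(0)
 *	BT_FRAME(1)
 *	BT_FRAME(2)
 *	... one expansion per potential frame, covering PROF_BT_MAX frames
 *	    plus any ignored leading frames.
 */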

/* Maximum number of backtraces to store in each per-thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024
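
/*
 * Illustrative sketch (an assumption about usage, not the actual prof.c
 * code): each ctx's lock pointer is expected to be assigned from a shared
 * array of PROF_NCTX_LOCKS mutexes, e.g. by striping on a sequence number or
 * a hash of the ctx, so unrelated ctx's merely share a lock rather than each
 * paying for a private mutex.  The array itself would only be allocated once
 * profiling is enabled, which is why over-provisioning is cheap.  With a
 * hypothetical chooser:
 *
 *	static malloc_mutex_t	*ctx_locks;
 *
 *	static malloc_mutex_t *
 *	prof_ctx_lock_choose(unsigned seq)
 *	{
 *
 *		return (&ctx_locks[seq % PROF_NCTX_LOCKS]);
 *	}
 */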

/*
 * prof_tdata pointers close to NULL are used to encode state information
 * needed for cleanup during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY
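
/*
 * Illustrative usage (mirroring the checks in PROF_ALLOC_PREP() and
 * prof_sample_accum_update() below): because the sentinels are small non-NULL
 * pointer values, a single unsigned comparison distinguishes "no usable
 * tdata" (NULL, REINCARNATED, or PURGATORY) from a real prof_tdata_t pointer:
 *
 *	prof_tdata_t *prof_tdata = prof_tdata_get(false);
 *
 *	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 *		return;
 */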

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.  (An illustrative reader
	 * sketch follows this struct definition.)
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
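
/*
 * Illustrative reader sketch (an assumption, not the actual reader in
 * prof.c): given a prof_thr_cnt_t *cnt, a consistent snapshot of cnts can be
 * taken seqlock-style by re-checking epoch around the copy.  If epoch0 is odd
 * the writer is mid-update, and if epoch changed during the copy the snapshot
 * may be torn; in either case the read is retried.
 *
 *	prof_cnt_t snapshot;
 *	volatile unsigned *epoch = &cnt->epoch;
 *
 *	while (true) {
 *		unsigned epoch0 = *epoch;
 *		if (epoch0 & 1U)
 *			continue;
 *		memcpy(&snapshot, &cnt->cnts, sizeof(prof_cnt_t));
 *		if (*epoch == epoch0)
 *			break;
 *	}
 *
 * The writer side of this protocol appears in prof_malloc(), prof_realloc(),
 * and prof_free() below, where the epoch increments and counter updates are
 * separated by mb_write() calls.
 */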

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per-thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.  (An illustrative destroyability check follows this struct
	 * definition.)
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;
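
/*
 * Illustrative sketch (an assumption, not the actual prof.c logic): ctx
 * destruction is only legal for the sole limbo holder once nothing else
 * references the ctx, i.e. under ctx->lock a destroyer would evaluate
 * something like:
 *
 *	bool destroyable = (ctx->nlimbo == 1 &&
 *	    ctx->cnt_merged.curobjs == 0 && ql_first(&ctx->cnts_ql) == NULL);
 *
 * If the check fails, the thread simply decrements nlimbo and leaves the ctx
 * in place for whichever thread still references it.
 */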

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read a prof_thr_cnt_t's contents, but
	 * only the owning thread ever writes it.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* lg(mean bytes between samples). */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;
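
/*
 * Worked example (illustrative numbers): with lg_prof_interval:30,
 * prof_interval is 2^30 bytes (1 GiB).  With narenas equal to 4, each arena
 * independently dumps after allocating 1 GiB, so dumps occur on average once
 * per 1 GiB of total allocation, but as much as roughly 4 GiB can be
 * allocated between two consecutive dumps in the worst case.
 */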

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get(true);				\
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
		if (prof_tdata != NULL)					\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
		else							\
			ret = NULL;					\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_tdata->accum to    */\
		/* reach prof_tdata->threshold.  However, delay       */\
		/* updating these variables until prof_{m,re}alloc(), */\
		/* because we don't know for sure that the allocation */\
		/* will succeed.                                      */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
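
/*
 * Illustrative call sequence (an assumption about usage; the real call sites
 * live in the allocation paths): ret receives NULL if profiling metadata
 * could not be allocated, (prof_thr_cnt_t *)1U if this allocation should not
 * be sampled, or a real counter pointer if a backtrace was captured.  The
 * imalloc() call below stands in for whatever allocation routine the caller
 * actually uses:
 *
 *	void *p;
 *	prof_thr_cnt_t *cnt;
 *	size_t usize = s2u(size);
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);
 *	p = imalloc(usize);
 *	if (p == NULL)
 *		return (NULL);
 *	prof_malloc(p, usize, cnt);
 *	return (p);
 */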

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	/*
	 * The body of this function is compiled out unless heap profiling is
	 * enabled, so that it is possible to compile jemalloc with floating
	 * point support completely disabled.  Avoiding floating point code is
	 * important on memory-constrained systems, but it also enables a
	 * workaround for versions of glibc that don't properly save/restore
	 * floating point registers during dynamic lazy symbol loading (which
	 * internally calls into whatever malloc implementation happens to be
	 * integrated into the application).  Note that some compilers (e.g.
	 * gcc 4.8) may use floating point registers for fast memory moves, so
	 * jemalloc must be compiled with such optimizations disabled (e.g.
	 * -mno-sse) in order for the workaround to be complete.
	 */
#ifdef JEMALLOC_PROF
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://luc.devroye.org/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
#endif
}
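
/*
 * Worked example: with the default lg_prof_sample of 19, p = 1/2^19, so the
 * geometric threshold averages 1/p = 2^19 bytes (512 KiB) of allocation
 * between samples.  With lg_prof_sample:0 every allocation is sampled and the
 * callers bypass this logic entirely (see PROF_ALLOC_PREP() above and
 * prof_sample_accum_update() below).
 */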

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, usize, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = prof_tdata_get(false);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		return (true);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(usize)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the usize passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(usize)) {
				/*
				 * Don't sample.  The usize passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual usize was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
