prof.h revision 772163b4f3d8e9a12343e9215f6b070068507604
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128
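
/*
 * Illustrative sketch (hypothetical, not a verbatim excerpt from prof.c):
 * __builtin_return_address() requires a compile-time constant frame index,
 * so each of the PROF_BT_MAX frames needs its own hard-coded handler.
 * Assuming bt and nignore as in prof_backtrace(), and max as a frame budget
 * such as PROF_BT_MAX, one expansion of a macro along these lines would be
 * emitted per index from 0 through PROF_BT_MAX-1:
 *
 *	#define BT_FRAME(i)						\
 *		if ((i) < nignore + max) {				\
 *			void *p = __builtin_return_address(i);		\
 *			if (p == NULL)					\
 *				return;					\
 *			if ((i) >= nignore) {				\
 *				bt->vec[(i) - nignore] = p;		\
 *				bt->len = (i) - nignore + 1;		\
 *			}						\
 *		} else							\
 *			return;
 */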

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY
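
/*
 * Illustrative sketch: because the sentinels above are small non-NULL values,
 * callers can reject NULL, REINCARNATED, and PURGATORY with a single unsigned
 * comparison before dereferencing, as the inline code later in this header
 * does:
 *
 *	prof_tdata_t *prof_tdata = prof_tdata_get(false);
 *	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 *		return (true);
 */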

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.  A sketch of such a reader
	 * follows this struct definition.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
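
/*
 * Illustrative reader sketch (a minimal sketch, not necessarily how the dump
 * code is written): given a prof_thr_cnt_t *cnt, copy the counters and accept
 * the copy only if the epoch was even when the copy started and unchanged
 * when it finished:
 *
 *	prof_cnt_t tcnt;
 *	volatile unsigned *epoch = &cnt->epoch;
 *	unsigned epoch0;
 *	do {
 *		epoch0 = *epoch;
 *		memcpy(&tcnt, &cnt->cnts, sizeof(prof_cnt_t));
 *	} while ((epoch0 & 1U) != 0 || *epoch != epoch0);
 */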

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* lg(mean bytes between samples). */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;
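
/*
 * Worked example: with lg_prof_interval:30 in MALLOC_CONF, prof_interval is
 * 2^30 bytes (1 GiB).  Each arena accumulates allocation bytes independently,
 * so with 4 arenas the average spacing between dumps is still roughly 1 GiB
 * of total allocation, but if allocation is spread evenly across the arenas,
 * up to about prof_interval * narenas = 4 GiB can be allocated before any
 * single arena crosses its threshold.
 */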

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get(true);				\
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
		if (prof_tdata != NULL)					\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
		else							\
			ret = NULL;					\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_accum to reach     */\
		/* prof_tdata->threshold.  However, delay updating    */\
		/* these variables until prof_{m,re}alloc(), because  */\
		/* we don't know for sure that the allocation will    */\
		/* succeed.                                           */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
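
/*
 * Illustrative usage sketch (hypothetical caller, not a verbatim excerpt from
 * the allocation paths; allocate() stands in for the real allocation
 * routine): PROF_ALLOC_PREP() leaves one of three kinds of values in ret,
 * which the caller forwards to prof_malloc() once the allocation has actually
 * succeeded:
 *
 *	void *p;
 *	prof_thr_cnt_t *cnt;
 *	size_t usize = s2u(size);
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);
 *	p = allocate(usize);
 *	if (p == NULL)
 *		return (NULL);
 *	prof_malloc(p, usize, cnt);
 *	return (p);
 *
 * cnt == NULL signals that per thread profiling state could not be
 * initialized, cnt == (prof_thr_cnt_t *)1U means "allocated but not sampled",
 * and any larger value is a live counter for a sampled allocation.
 */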

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	/*
	 * The body of this function is compiled out unless heap profiling is
	 * enabled, so that it is possible to compile jemalloc with floating
	 * point support completely disabled.  Avoiding floating point code is
	 * important on memory-constrained systems, but it also enables a
	 * workaround for versions of glibc that don't properly save/restore
	 * floating point registers during dynamic lazy symbol loading (which
	 * internally calls into whatever malloc implementation happens to be
	 * integrated into the application).  Note that some compilers (e.g.
	 * gcc 4.8) may use floating point registers for fast memory moves, so
	 * jemalloc must be compiled with such optimizations disabled (e.g.
	 * -mno-sse) in order for the workaround to be complete.
	 */
#ifdef JEMALLOC_PROF
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://luc.devroye.org/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
#endif
}
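
/*
 * Worked example: with the default opt_lg_prof_sample of 19 (see
 * LG_PROF_SAMPLE_DEFAULT), p = 2^-19, so the threshold above is a
 * geometrically distributed variate with mean 1/p = 2^19 = 524288; on
 * average one backtrace is captured per 512 KiB of allocation activity.
 */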

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, usize, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = prof_tdata_get(false);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		return (true);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(usize)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the usize passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(usize)) {
				/*
				 * Don't sample.  The usize passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual usize was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, usize, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
