prof.h revision a507004d294ad0c78b4d01559479620ebb272a49
#ifdef JEMALLOC_PROF
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_BT_MAX_DEFAULT		7
#define	LG_PROF_SAMPLE_DEFAULT		0
#define	LG_PROF_INTERVAL_DEFAULT	-1
#define	LG_PROF_TCMAX_DEFAULT		-1

/*
 * Hard limit on stack backtrace depth.  Note that the version of
 * prof_backtrace() that is based on __builtin_return_address() necessarily has
 * a hard-coded number of backtrace frame handlers.
 */
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
#  define LG_PROF_BT_MAX	((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
#  define LG_PROF_BT_MAX	7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define	PROF_BT_MAX		(1U << LG_PROF_BT_MAX)
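
/*
 * Illustrative sketch (not part of the original header) of why the
 * __builtin_return_address()-based prof_backtrace() needs hard-coded frame
 * handlers: GCC requires the argument to be a compile-time constant, so the
 * implementation must expand one handler per frame (a hypothetical BT_FRAME()
 * is shown here) rather than looping over a runtime frame index.  Compiled
 * out; nignore handling omitted.
 */
#if 0
#define	BT_FRAME(i)							\
	if ((i) < max) {						\
		void *p;						\
		if (__builtin_frame_address(i) == NULL)			\
			return;						\
		p = __builtin_return_address(i);			\
		if (p == NULL)						\
			return;						\
		bt->vec[(i)] = p;					\
		bt->len = (i) + 1;					\
	} else								\
		return;
/* The function body is then BT_FRAME(0) BT_FRAME(1) ... BT_FRAME(N-1). */
#endif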

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS	64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUF_SIZE	65536

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.  (An illustrative reader
	 * sketch follows this struct.)
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
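
/*
 * Illustrative reader sketch (not part of the original header), assuming a
 * hypothetical helper name prof_cnt_read(): retry until the same even epoch
 * is observed both before and after copying the counters.  Compiled out.
 */
#if 0
JEMALLOC_INLINE void
prof_cnt_read(prof_thr_cnt_t *cnt, prof_cnt_t *snapshot)
{
	volatile unsigned *epoch = &cnt->epoch;
	unsigned epoch0;

	do {
		/* Spin until no update is in progress (even epoch). */
		while ((epoch0 = *epoch) & 1U)
			;
		memcpy(snapshot, &cnt->cnts, sizeof(prof_cnt_t));
		/* Retry if the writer bumped the epoch during the copy. */
	} while (*epoch != epoch0);
}
#endif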

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects cnt_merged and cnts_ql. */
	malloc_mutex_t		lock;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.  (An outline of that merge follows this
	 * struct.)
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prn_state;
	uint64_t		threshold;
	uint64_t		accum;
};
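
/*
 * Illustrative outline (not part of the original header) of the thread-exit
 * merge described above.  The real cleanup lives in prof.c and handles more
 * cases (e.g. removal from bt2cnt); this hypothetical helper only shows the
 * lock-protected fold into each prof_ctx_t.  Compiled out.
 */
#if 0
static void
prof_tdata_merge_sketch(prof_tdata_t *prof_tdata)
{
	prof_thr_cnt_t *cnt;

	while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) {
		prof_ctx_t *ctx = cnt->ctx;

		/* Fold this thread's counters into the shared context. */
		malloc_mutex_lock(&ctx->lock);
		ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
		ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
		ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
		ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
		ql_remove(&ctx->cnts_ql, cnt, cnts_link);
		malloc_mutex_unlock(&ctx->lock);

		/* Unlink from the per-thread LRU and free. */
		ql_remove(&prof_tdata->lru_ql, cnt, lru_link);
		idalloc(cnt);
	}
}
#endif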

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_bt_max;   /* Maximum backtrace depth. */
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern ssize_t	opt_lg_prof_tcmax;    /* lg(max per thread backtrace cache). */
extern char	opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;
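
/*
 * Worked example (editorial note, not part of the original header): with
 * opt_lg_prof_interval = 30, prof_interval is 2^30 bytes (1 GiB).  Each arena
 * then dumps once per 1 GiB that it allocates, so with 4 arenas the
 * process-wide interval between dumps averages about prof_interval (1 GiB)
 * but can approach prof_interval * narenas (4 GiB) when the allocation load
 * is spread evenly across arenas.
 */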

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

/* (1U << opt_lg_prof_bt_max). */
extern unsigned	prof_bt_max;

/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t	*prof_tdata_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
#  define PROF_TCACHE_GET()	prof_tdata_tls
#  define PROF_TCACHE_SET(v)	do {					\
	prof_tdata_tls = (v);						\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#else
#  define PROF_TCACHE_GET()						\
	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
#  define PROF_TCACHE_SET(v)	do {					\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#endif
/*
 * Same contents as prof_tdata_tls, but initialized such that the TSD
 * destructor is called when a thread exits, so that prof_tdata_tls contents
 * can be merged, unlinked, and deallocated.
 */
extern pthread_key_t	prof_tdata_tsd;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = PROF_TCACHE_GET();					\
	if (prof_tdata == NULL) {					\
		prof_tdata = prof_tdata_init();				\
		if (prof_tdata == NULL) {				\
			ret = NULL;					\
			break;						\
		}							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore, prof_bt_max);		\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prn_state =				\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_tdata->accum to    */\
		/* reach prof_tdata->threshold.  However, delay       */\
		/* updating these variables until prof_{m,re}alloc(), */\
		/* because we don't know for sure that the allocation */\
		/* will succeed.                                      */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore, prof_bt_max);	\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
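
/*
 * Illustrative caller sketch (not part of the original header): a malloc()
 * path might combine PROF_ALLOC_PREP() with prof_malloc() roughly as follows.
 * example_malloc() is hypothetical and simplified (e.g. prof_promote handling
 * is omitted); see jemalloc.c for the real call sites.  Compiled out.
 */
#if 0
void *
example_malloc(size_t size)
{
	void *ret;
	size_t usize = s2u(size);
	prof_thr_cnt_t *cnt;

	/* cnt becomes NULL (OOM), 1U (do not sample), or a counter object. */
	PROF_ALLOC_PREP(1, usize, cnt);
	if (cnt == NULL)
		return (NULL);
	ret = imalloc(size);
	if (ret == NULL)
		return (NULL);
	/* Attribute the allocation to the captured backtrace (if sampled). */
	prof_malloc(ret, usize, cnt);
	return (ret);
}
#endif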

#ifndef JEMALLOC_ENABLE_INLINE
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	uint64_t r;
	double u;

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
	 */
	prn64(r, 53, prof_tdata->prn_state,
	    (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
}
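
/*
 * Worked example (editorial note, not part of the original header): r holds
 * 53 random bits, so u = r / 2^53 lies in [0, 1).  With opt_lg_prof_sample =
 * 19, p = 2^-19 and the geometric threshold has mean 1/p = 2^19 bytes, i.e.
 * on average one backtrace is captured per 512 KiB of allocation activity.
 */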

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		dassert(chunk->arena->magic == ARENA_MAGIC);

		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		dassert(chunk->arena->magic == ARENA_MAGIC);

		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = PROF_TCACHE_GET();
	assert(prof_tdata != NULL);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	assert(ptr != NULL);
	assert(size == isalloc(ptr));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  The size passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(&old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(&old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		assert(size == isalloc(ptr));
		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(&ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(&ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
#endif /* JEMALLOC_PROF */