prof.h revision 7372b15a31c63ac5cb9ed8aeabc2a0a3c005e8bf
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_BT_MAX_DEFAULT		7
#define	LG_PROF_SAMPLE_DEFAULT		0
#define	LG_PROF_INTERVAL_DEFAULT	-1
#define	LG_PROF_TCMAX_DEFAULT		-1

/*
 * Hard limit on stack backtrace depth.  Note that the version of
 * prof_backtrace() that is based on __builtin_return_address() necessarily has
 * a hard-coded number of backtrace frame handlers.
 */
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
#  define LG_PROF_BT_MAX	((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
#  define LG_PROF_BT_MAX	7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define	PROF_BT_MAX		(1U << LG_PROF_BT_MAX)

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS	64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUF_SIZE	65536

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.  A sketch of such a reader
	 * follows this struct definition.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
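
/*
 * Illustrative reader for the epoch protocol described above (a sketch, not
 * part of the implementation; the counter summation performed during dumps
 * reads prof_thr_cnt_t objects in essentially this way).  cnt is assumed to
 * point at a live prof_thr_cnt_t, and in real code the epoch field would be
 * read via a volatile pointer (or with explicit read barriers) so that the
 * re-check cannot be optimized away:
 *
 *	prof_cnt_t tcnt;
 *	unsigned epoch;
 *
 *	for (;;) {
 *		epoch = cnt->epoch;
 *		if (epoch & 1U)
 *			continue;	(writer is mid-update; retry)
 *		memcpy(&tcnt, &cnt->cnts, sizeof(prof_cnt_t));
 *		if (cnt->epoch == epoch)
 *			break;		(snapshot in tcnt is consistent)
 *	}
 */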

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects cnt_merged and cnts_ql. */
	malloc_mutex_t		lock;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prn_state;
	uint64_t		threshold;
	uint64_t		accum;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_bt_max;   /* Maximum backtrace depth. */
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern ssize_t	opt_lg_prof_tcmax;    /* lg(max per-thread backtrace cache). */
extern char	opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and can be as large as
 * approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;
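
/*
 * Worked example (illustrative numbers, not defaults): with
 * opt_lg_prof_interval == 30, prof_interval is 1 GiB, so each arena triggers
 * a dump after every ~1 GiB that it allocates.  With 4 arenas, the amount
 * allocated between two consecutive dumps averages ~1 GiB but can approach
 * 4 GiB in the worst case (each arena accumulating just under prof_interval
 * since its last dump).
 */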

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

/* (1U << opt_lg_prof_bt_max). */
extern unsigned	prof_bt_max;

/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t	*prof_tdata_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
#  define PROF_TCACHE_GET()	prof_tdata_tls
#  define PROF_TCACHE_SET(v)	do {					\
	prof_tdata_tls = (v);						\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#else
#  define PROF_TCACHE_GET()						\
	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
#  define PROF_TCACHE_SET(v)	do {					\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#endif
/*
 * Same contents as prof_tdata_tls, but initialized such that the TSD
 * destructor is called when a thread exits, so that prof_tdata_tls contents
 * can be merged, unlinked, and deallocated.
 */
extern pthread_key_t	prof_tdata_tsd;
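
/*
 * Illustrative sketch (an assumption about usage, not prescribed by this
 * header) of how the key would be created during bootstrapping so that the
 * destructor runs at thread exit; prof_tdata_cleanup is a hypothetical name
 * for the destructor:
 *
 *	static void
 *	prof_tdata_cleanup(void *arg)
 *	{
 *		prof_tdata_t *prof_tdata = (prof_tdata_t *)arg;
 *
 *		... merge prof_tdata's counters into the prof_ctx_t objects,
 *		... then unlink and deallocate the prof_thr_cnt_t objects.
 *	}
 *
 *	if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup) != 0)
 *		... report bootstrapping failure ...
 */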

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = PROF_TCACHE_GET();					\
	if (prof_tdata == NULL) {					\
		prof_tdata = prof_tdata_init();				\
		if (prof_tdata == NULL) {				\
			ret = NULL;					\
			break;						\
		}							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so don't sample. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore, prof_bt_max);		\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prn_state =				\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_tdata->accum to    */\
		/* reach prof_tdata->threshold.  However, delay       */\
		/* updating these variables until prof_{m,re}alloc(), */\
		/* because we don't know for sure that the allocation */\
		/* will succeed.                                      */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore, prof_bt_max);	\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
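
/*
 * Expected usage pattern (a sketch under assumptions, not a verbatim excerpt
 * from jemalloc.c; the nignore argument depends on the caller's stack depth,
 * and the prof_promote path is ignored here).  ret receives NULL if
 * prof_tdata_init() failed, (prof_thr_cnt_t *)1U if this allocation is not
 * sampled, or a real counter object if it is:
 *
 *	prof_thr_cnt_t *cnt;
 *	size_t usize = s2u(size);
 *	void *ptr;
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);
 *	ptr = imalloc(size);
 *	if (ptr == NULL)
 *		return (NULL);
 *	prof_malloc(ptr, usize, cnt);
 */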

#ifndef JEMALLOC_ENABLE_INLINE
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
	 */
	prn64(r, 53, prof_tdata->prn_state,
	    (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
}
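
/*
 * Worked example (illustrative numbers): with opt_lg_prof_sample == 19,
 * p == 1/2^19, so backtraces are captured on average once per
 * 2^19 == 524288 bytes allocated.  For a single draw with u == 0.5, since
 * log(1-p) ~= -p for small p:
 *
 *	threshold ~= -log(0.5)/p == ln(2) * 2^19 ~= 363409 bytes.
 */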

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		assert(chunk->arena->magic == ARENA_MAGIC);

		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		assert(chunk->arena->magic == ARENA_MAGIC);

		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = PROF_TCACHE_GET();
	assert(prof_tdata != NULL);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}
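
/*
 * Worked trace (illustrative numbers): suppose prof_tdata->threshold == 1000,
 * prof_tdata->accum == 900, and size == 2500.  Since 2500 >= 1000 - 900, the
 * allocation is sampled and the function returns false; accum becomes
 * 900 - (1000 - 2500) == 2400, and the while loop then retires as many newly
 * drawn thresholds as that remainder covers, so a burst of large allocations
 * cannot permanently outrun the sampler.
 */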

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(size == isalloc(ptr));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  The size passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(&old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(&old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		assert(size == isalloc(ptr));
		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(&ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(&ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
