prof.h revision b1941c615023cab9baf0a78a28df1e3b4972434f
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY
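
/*
 * Illustrative note (not part of the original header): callers that fetch the
 * per thread prof_tdata distinguish these near-NULL sentinels from real
 * pointers with a single unsigned comparison, as PROF_ALLOC_PREP() and
 * prof_sample_accum_update() do below:
 *
 *	prof_tdata = prof_tdata_get(false);
 *	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 *		return (true);	<- NULL or a sentinel; no usable tdata
 */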

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
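
/*
 * Illustrative reader sketch (an assumption about the consuming code, not a
 * declaration made by this header): a dump-time reader can take a consistent
 * snapshot of cnts by retrying until it observes the same even epoch value
 * before and after the copy (with appropriate read barriers, analogous to the
 * mb_write() calls the writer issues below):
 *
 *	unsigned epoch0;
 *	prof_cnt_t snap;
 *	do {
 *		epoch0 = cnt->epoch;
 *		snap = cnt->cnts;
 *	} while ((epoch0 & 1U) != 0 || cnt->epoch != epoch0);
 *
 * An odd epoch means a write is in progress; a changed epoch means the copy
 * raced with a write, so the read is retried.
 */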

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		threshold;
	uint64_t		accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = prof_tdata_get(true);				\
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
		if (prof_tdata != NULL)					\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
		else							\
			ret = NULL;					\
		break;							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling   */\
		/* interval is 1.                                     */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prng_state =			\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on  */\
		/* whether size is enough for prof_tdata->accum to    */\
		/* reach prof_tdata->threshold.  However, delay       */\
		/* updating these variables until prof_{m,re}alloc(), */\
		/* because we don't know for sure that the allocation */\
		/* will succeed.                                      */\
		/*                                                    */\
		/* Use subtraction rather than addition to avoid      */\
		/* potential integer overflow.                        */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
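
/*
 * Illustrative usage sketch (hypothetical caller, not code from this header):
 * an allocation path passes the number of stack frames to ignore and the
 * usable size, then interprets ret as NULL (error), (prof_thr_cnt_t *)1U
 * (allocate but do not sample), or a real per thread counter object:
 *
 *	prof_thr_cnt_t *cnt;
 *	size_t usize = s2u(size);
 *
 *	PROF_ALLOC_PREP(1, usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);
 *	p = <allocate usize bytes>;
 *	if (p == NULL)
 *		return (NULL);
 *	prof_malloc(p, usize, cnt);
 */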

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	/*
	 * The body of this function is compiled out unless heap profiling is
	 * enabled, so that it is possible to compile jemalloc with floating
	 * point support completely disabled.  Avoiding floating point code is
	 * important on memory-constrained systems, but it also enables a
	 * workaround for versions of glibc that don't properly save/restore
	 * floating point registers during dynamic lazy symbol loading (which
	 * internally calls into whatever malloc implementation happens to be
	 * integrated into the application).  Note that some compilers (e.g.
	 * gcc 4.8) may use floating point registers for fast memory moves, so
	 * jemalloc must be compiled with such optimizations disabled (e.g.
	 * -mno-sse) in order for the workaround to be complete.
	 */
#ifdef JEMALLOC_PROF
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                         __        __
	 *                         |  log(u)  |                     1
	 * prof_tdata->threshold = | -------- |, where p = -------------------
	 *                         | log(1-p) |             opt_lg_prof_sample
	 *                                                 2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://luc.devroye.org/rnbookindex.html)
	 */
	prng64(r, 53, prof_tdata->prng_state,
	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
#endif
}
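
/*
 * Worked example (added for illustration; follows from the formula above):
 * with the default opt_lg_prof_sample of 19, p = 2^-19, so the threshold is
 * geometrically distributed with mean 1/p = 2^19 = 524288 bytes; on average
 * one allocation is sampled per 512 KiB allocated.  With
 * opt_lg_prof_sample == 0, p == 1 and every allocation is sampled, which is
 * why callers bypass the sampling machinery entirely in that case.
 */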

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}
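
/*
 * Illustrative note (an assumption about the surrounding chunk layout, not
 * stated in this header): CHUNK_ADDR2BASE() masks ptr down to its chunk
 * boundary.  Huge allocations are themselves chunk-aligned, so the masked
 * base equals ptr and the huge path is taken; small and large regions live at
 * some offset within an arena chunk, so the base differs from ptr and the
 * arena path is taken.  Roughly, for a chunk size of 2^k bytes:
 *
 *	base = (void *)((uintptr_t)ptr & ~(((uintptr_t)1 << k) - 1));
 *	base == ptr  ->  huge_prof_ctx_get(ptr)
 *	base != ptr  ->  arena_prof_ctx_get(ptr)
 */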

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = prof_tdata_get(false);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		return (true);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}
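
/*
 * Worked example (illustrative numbers only): suppose threshold == 1000 and
 * accum == 800.  An allocation of size 300 satisfies 300 >= 1000 - 800, so a
 * sample is due; the carry-over becomes 800 - (1000 - 300) == 100, i.e. the
 * bytes accumulated beyond the old threshold, and a fresh threshold is drawn
 * (repeatedly, if the carry-over still exceeds it).  An allocation of size
 * 100 would instead simply advance accum to 900.  Writing the test as
 * size >= threshold - accum rather than accum + size >= threshold avoids
 * overflowing the 64-bit accumulator.
 */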

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(size == isalloc(ptr, true));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr, true));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  The size passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/