perf_event.h revision e6817ec1d8ab31fc7b01906e305f848542df6413
1/*
2 * Performance events:
3 *
4 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
5 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
6 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
7 *
8 * Data type definitions, declarations, prototypes.
9 *
10 *    Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licensing details see kernel-base/COPYING
13 */
14#ifndef _LINUX_PERF_EVENT_H
15#define _LINUX_PERF_EVENT_H
16
17#include <linux/types.h>
18#include <linux/ioctl.h>
19#include <asm/byteorder.h>
20
21/*
22 * User-space ABI bits:
23 */
24
25/*
26 * attr.type
27 */
28enum perf_type_id {
29	PERF_TYPE_HARDWARE			= 0,
30	PERF_TYPE_SOFTWARE			= 1,
31	PERF_TYPE_TRACEPOINT			= 2,
32	PERF_TYPE_HW_CACHE			= 3,
33	PERF_TYPE_RAW				= 4,
34	PERF_TYPE_BREAKPOINT			= 5,
35
36	PERF_TYPE_MAX,				/* non-ABI */
37};
38
39/*
40 * Generalized performance event event_id types, used by the
41 * attr.config parameter of the sys_perf_event_open()
42 * syscall:
43 */
44enum perf_hw_id {
45	/*
46	 * Common hardware events, generalized by the kernel:
47	 */
48	PERF_COUNT_HW_CPU_CYCLES		= 0,
49	PERF_COUNT_HW_INSTRUCTIONS		= 1,
50	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
51	PERF_COUNT_HW_CACHE_MISSES		= 3,
52	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
53	PERF_COUNT_HW_BRANCH_MISSES		= 5,
54	PERF_COUNT_HW_BUS_CYCLES		= 6,
55	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
56	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,
57
58	PERF_COUNT_HW_MAX,			/* non-ABI */
59};
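
/*
 * Example: a minimal user-space sketch of how the generalized types
 * above are used. The perf_type_id value selects attr.type and the
 * perf_hw_id value goes into attr.config; perf_event_open() has no
 * glibc wrapper, so it is invoked via syscall(). Error handling is
 * omitted for brevity.
 *
 *	#include <linux/perf_event.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <string.h>
 *
 *	static int open_instruction_counter(void)
 *	{
 *		struct perf_event_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.type   = PERF_TYPE_HARDWARE;
 *		attr.size   = sizeof(attr);
 *		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *		attr.exclude_kernel = 1;
 *
 *		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	}
 *
 * The arguments after the attr pointer are pid (0 = calling task),
 * cpu (-1 = any CPU), group_fd (-1 = no group) and flags. A plain
 * read() of a u64 from the returned fd yields the current count (see
 * the read_format description further down).
 */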
60
61/*
62 * Generalized hardware cache events:
63 *
64 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
65 *       { read, write, prefetch } x
66 *       { accesses, misses }
67 */
68enum perf_hw_cache_id {
69	PERF_COUNT_HW_CACHE_L1D			= 0,
70	PERF_COUNT_HW_CACHE_L1I			= 1,
71	PERF_COUNT_HW_CACHE_LL			= 2,
72	PERF_COUNT_HW_CACHE_DTLB		= 3,
73	PERF_COUNT_HW_CACHE_ITLB		= 4,
74	PERF_COUNT_HW_CACHE_BPU			= 5,
75
76	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
77};
78
79enum perf_hw_cache_op_id {
80	PERF_COUNT_HW_CACHE_OP_READ		= 0,
81	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
82	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
83
84	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
85};
86
87enum perf_hw_cache_op_result_id {
88	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
89	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
90
91	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
92};
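
/*
 * For attr.type == PERF_TYPE_HW_CACHE the three enums above are packed
 * into attr.config as:
 *
 *	config = (perf_hw_cache_id) |
 *		 (perf_hw_cache_op_id << 8) |
 *		 (perf_hw_cache_op_result_id << 16);
 *
 * e.g. to count L1 data-cache read misses:
 *
 *	attr.type   = PERF_TYPE_HW_CACHE;
 *	attr.config = PERF_COUNT_HW_CACHE_L1D |
 *		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
 */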
93
94/*
95 * Special "software" events provided by the kernel, even if the hardware
96 * does not support performance events. These events measure various
97 * software conditions within the kernel (and allow them to be profiled
98 * as well):
99 */
100enum perf_sw_ids {
101	PERF_COUNT_SW_CPU_CLOCK			= 0,
102	PERF_COUNT_SW_TASK_CLOCK		= 1,
103	PERF_COUNT_SW_PAGE_FAULTS		= 2,
104	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
105	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
106	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
107	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
108	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
109	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
110
111	PERF_COUNT_SW_MAX,			/* non-ABI */
112};
113
114/*
115 * Bits that can be set in attr.sample_type to request information
116 * in the overflow packets.
117 */
118enum perf_event_sample_format {
119	PERF_SAMPLE_IP				= 1U << 0,
120	PERF_SAMPLE_TID				= 1U << 1,
121	PERF_SAMPLE_TIME			= 1U << 2,
122	PERF_SAMPLE_ADDR			= 1U << 3,
123	PERF_SAMPLE_READ			= 1U << 4,
124	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
125	PERF_SAMPLE_ID				= 1U << 6,
126	PERF_SAMPLE_CPU				= 1U << 7,
127	PERF_SAMPLE_PERIOD			= 1U << 8,
128	PERF_SAMPLE_STREAM_ID			= 1U << 9,
129	PERF_SAMPLE_RAW				= 1U << 10,
130
131	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
132};
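
/*
 * Example: requesting sample data. attr.sample_type selects which of
 * the fields above appear in each PERF_RECORD_SAMPLE, and the
 * sample_period/sample_freq union (together with attr.freq) controls
 * how often samples are taken. A sketch:
 *
 *	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
 *			   PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD;
 *	attr.freq        = 1;
 *	attr.sample_freq = 1000;
 *
 * With attr.freq set, the kernel adjusts the period to aim for roughly
 * sample_freq samples per second; the samples themselves are delivered
 * through the mmap()ed buffer described by struct perf_event_mmap_page
 * below.
 */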
133
134/*
135 * The format of the data returned by read() on a perf event fd,
136 * as specified by attr.read_format:
137 *
138 * struct read_format {
139 *	{ u64		value;
140 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
141 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
142 *	  { u64		id;           } && PERF_FORMAT_ID
143 *	} && !PERF_FORMAT_GROUP
144 *
145 *	{ u64		nr;
146 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
147 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
148 *	  { u64		value;
149 *	    { u64	id;           } && PERF_FORMAT_ID
150 *	  }		cntr[nr];
151 *	} && PERF_FORMAT_GROUP
152 * };
153 */
154enum perf_event_read_format {
155	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
156	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
157	PERF_FORMAT_ID				= 1U << 2,
158	PERF_FORMAT_GROUP			= 1U << 3,
159
160	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
161};
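
/*
 * Example: with PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING the
 * layout documented above maps onto a C structure like this (sized
 * here for up to 4 group members; a sketch only):
 *
 *	struct read_group {
 *		__u64	nr;
 *		__u64	time_enabled;
 *		__u64	time_running;
 *		struct {
 *			__u64	value;
 *			__u64	id;
 *		}	cntr[4];
 *	};
 *
 *	struct read_group rg;
 *	read(group_leader_fd, &rg, sizeof(rg));
 *
 * When events were multiplexed, time_enabled/time_running allow the
 * counts to be scaled: scaled = value * time_enabled / time_running.
 */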
162
163#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
164
165/*
166 * Hardware event_id to monitor via a performance monitoring event:
167 */
168struct perf_event_attr {
169
170	/*
171	 * Major type: hardware/software/tracepoint/etc.
172	 */
173	__u32			type;
174
175	/*
176	 * Size of the attr structure, for fwd/bwd compat.
177	 */
178	__u32			size;
179
180	/*
181	 * Type specific configuration information.
182	 */
183	__u64			config;
184
185	union {
186		__u64		sample_period;
187		__u64		sample_freq;
188	};
189
190	__u64			sample_type;
191	__u64			read_format;
192
193	__u64			disabled       :  1, /* off by default        */
194				inherit	       :  1, /* children inherit it   */
195				pinned	       :  1, /* must always be on PMU */
196				exclusive      :  1, /* only group on PMU     */
197				exclude_user   :  1, /* don't count user      */
198				exclude_kernel :  1, /* ditto kernel          */
199				exclude_hv     :  1, /* ditto hypervisor      */
200				exclude_idle   :  1, /* don't count when idle */
201				mmap           :  1, /* include mmap data     */
202				comm	       :  1, /* include comm data     */
203				freq           :  1, /* use freq, not period  */
204				inherit_stat   :  1, /* per task counts       */
205				enable_on_exec :  1, /* next exec enables     */
206				task           :  1, /* trace fork/exit       */
207				watermark      :  1, /* wakeup_watermark      */
208				/*
209				 * precise_ip:
210				 *
211				 *  0 - SAMPLE_IP can have arbitrary skid
212				 *  1 - SAMPLE_IP must have constant skid
213				 *  2 - SAMPLE_IP requested to have 0 skid
214				 *  3 - SAMPLE_IP must have 0 skid
215				 *
216				 *  See also PERF_RECORD_MISC_EXACT_IP
217				 */
218				precise_ip     :  2, /* skid constraint       */
219				mmap_data      :  1, /* non-exec mmap data    */
220				sample_id_all  :  1, /* sample_type all events */
221
222				__reserved_1   : 45;
223
224	union {
225		__u32		wakeup_events;	  /* wakeup every n events */
226		__u32		wakeup_watermark; /* bytes before wakeup   */
227	};
228
229	__u32			bp_type;
230	union {
231		__u64		bp_addr;
232		__u64		config1; /* extension of config */
233	};
234	union {
235		__u64		bp_len;
236		__u64		config2; /* extension of config1 */
237	};
238};
239
240/*
241 * Ioctls that can be done on a perf event fd:
242 */
243#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
244#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
245#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
246#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
247#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
248#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
249#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
250
251enum perf_event_ioc_flags {
252	PERF_IOC_FLAG_GROUP		= 1U << 0,
253};
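
/*
 * Typical ioctl usage around a measured region, assuming the event was
 * opened with attr.disabled = 1 (a sketch):
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET,  0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	...code under measurement...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *
 * Passing PERF_IOC_FLAG_GROUP as the ioctl argument applies the
 * operation to the whole event group instead of just this event.
 */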
254
255/*
256 * Structure of the page that can be mapped via mmap
257 */
258struct perf_event_mmap_page {
259	__u32	version;		/* version number of this structure */
260	__u32	compat_version;		/* lowest version this is compat with */
261
262	/*
263	 * Bits needed to read the hw events in user-space.
264	 *
265	 *   u32 seq;
266	 *   s64 count;
267	 *
268	 *   do {
269	 *     seq = pc->lock;
270	 *
271	 *     barrier()
272	 *     if (pc->index) {
273	 *       count = pmc_read(pc->index - 1);
274	 *       count += pc->offset;
275	 *     } else
276	 *       goto regular_read;
277	 *
278	 *     barrier();
279	 *   } while (pc->lock != seq);
280	 *
281	 * NOTE: for obvious reasons this only works on self-monitoring
282	 *       processes.
283	 */
284	__u32	lock;			/* seqlock for synchronization */
285	__u32	index;			/* hardware event identifier */
286	__s64	offset;			/* add to hardware event value */
287	__u64	time_enabled;		/* time event active */
288	__u64	time_running;		/* time event on cpu */
289
290		/*
291		 * Hole for extension of the self monitor capabilities
292		 */
293
294	__u64	__reserved[123];	/* align to 1k */
295
296	/*
297	 * Control data for the mmap() data buffer.
298	 *
299	 * User-space reading the @data_head value should issue an rmb(), on
300	 * SMP capable platforms, after reading this value -- see
301	 * perf_event_wakeup().
302	 *
303	 * When the mapping is PROT_WRITE the @data_tail value should be
304	 * written by userspace to reflect the last read data. In this case
305	 * the kernel will not over-write unread data.
306	 */
307	__u64   data_head;		/* head in the data section */
308	__u64	data_tail;		/* user-space written tail */
309};
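
/*
 * Example: consuming the data buffer from user space (a sketch). The
 * mapping is this metadata page followed by 2^n data pages, here n = 3;
 * rmb() stands for a read memory barrier as noted above, and records
 * that wrap past the end of the data area must be reassembled by the
 * reader.
 *
 *	size_t page      = sysconf(_SC_PAGESIZE);
 *	size_t data_size = 8 * page;
 *	struct perf_event_mmap_page *mp;
 *	char *data;
 *	__u64 head, tail;
 *
 *	mp   = mmap(NULL, page + data_size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *	data = (char *)mp + page;
 *
 *	head = mp->data_head;
 *	rmb();
 *	tail = mp->data_tail;
 *	while (tail < head) {
 *		struct perf_event_header *hdr =
 *			(void *)(data + (tail & (data_size - 1)));
 *		...process hdr->size bytes...
 *		tail += hdr->size;
 *	}
 *	mp->data_tail = tail;
 */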
310
311#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
312#define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
313#define PERF_RECORD_MISC_KERNEL			(1 << 0)
314#define PERF_RECORD_MISC_USER			(2 << 0)
315#define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
316#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
317#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
318
319/*
320 * Indicates that the content of PERF_SAMPLE_IP points to
321 * the actual instruction that triggered the event. See also
322 * perf_event_attr::precise_ip.
323 */
324#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
325/*
326 * Reserve the last bit to indicate some extended misc field
327 */
328#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
329
330struct perf_event_header {
331	__u32	type;
332	__u16	misc;
333	__u16	size;
334};
335
336enum perf_event_type {
337
338	/*
339	 * If perf_event_attr.sample_id_all is set then all event types will
340	 * carry the sample_type-selected fields that identify where/when an
341	 * event took place (TID, TIME, ID, CPU, STREAM_ID), as described for
342	 * PERF_RECORD_SAMPLE below. They are stashed just after the
343	 * perf_event_header and the fields already present for the record
344	 * type, i.e. at the end of the payload. That way a newer perf.data
345	 * file can still be read by older perf tools, with the new optional
346	 * fields being ignored.
347	 *
348	 * The MMAP events record the PROT_EXEC mappings so that we can
349	 * correlate userspace IPs to code. They have the following structure:
350	 *
351	 * struct {
352	 *	struct perf_event_header	header;
353	 *
354	 *	u32				pid, tid;
355	 *	u64				addr;
356	 *	u64				len;
357	 *	u64				pgoff;
358	 *	char				filename[];
359	 * };
360	 */
361	PERF_RECORD_MMAP			= 1,
362
363	/*
364	 * struct {
365	 *	struct perf_event_header	header;
366	 *	u64				id;
367	 *	u64				lost;
368	 * };
369	 */
370	PERF_RECORD_LOST			= 2,
371
372	/*
373	 * struct {
374	 *	struct perf_event_header	header;
375	 *
376	 *	u32				pid, tid;
377	 *	char				comm[];
378	 * };
379	 */
380	PERF_RECORD_COMM			= 3,
381
382	/*
383	 * struct {
384	 *	struct perf_event_header	header;
385	 *	u32				pid, ppid;
386	 *	u32				tid, ptid;
387	 *	u64				time;
388	 * };
389	 */
390	PERF_RECORD_EXIT			= 4,
391
392	/*
393	 * struct {
394	 *	struct perf_event_header	header;
395	 *	u64				time;
396	 *	u64				id;
397	 *	u64				stream_id;
398	 * };
399	 */
400	PERF_RECORD_THROTTLE			= 5,
401	PERF_RECORD_UNTHROTTLE			= 6,
402
403	/*
404	 * struct {
405	 *	struct perf_event_header	header;
406	 *	u32				pid, ppid;
407	 *	u32				tid, ptid;
408	 *	u64				time;
409	 * };
410	 */
411	PERF_RECORD_FORK			= 7,
412
413	/*
414	 * struct {
415	 *	struct perf_event_header	header;
416	 *	u32				pid, tid;
417	 *
418	 *	struct read_format		values;
419	 * };
420	 */
421	PERF_RECORD_READ			= 8,
422
423	/*
424	 * struct {
425	 *	struct perf_event_header	header;
426	 *
427	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
428	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
429	 *	{ u64			time;     } && PERF_SAMPLE_TIME
430	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
431	 *	{ u64			id;	  } && PERF_SAMPLE_ID
432	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
433	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
434	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
435	 *
436	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
437	 *
438	 *	{ u64			nr,
439	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
440	 *
441	 *	#
442	 *	# The RAW record below is opaque data wrt the ABI
443	 *	#
444	 *	# That is, the ABI doesn't make any promises wrt
445	 *	# the stability of its content, it may vary depending
446	 *	# on event, hardware, kernel version and phase of
447	 *	# the moon.
448	 *	#
449	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
450	 *	#
451	 *
452	 *	{ u32			size;
453	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
454	 * };
455	 */
456	PERF_RECORD_SAMPLE			= 9,
457
458	PERF_RECORD_MAX,			/* non-ABI */
459};
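
/*
 * Example: dispatching on perf_event_header.type while walking the
 * buffer (a sketch, continuing the mmap example above; sample_type is
 * the value used in attr.sample_type when the event was opened). The
 * body of each record depends on attr.sample_type and on the
 * record-specific layouts documented above, so fields must be decoded
 * strictly in the order listed for the record type.
 *
 *	__u64 ip;
 *	__u32 pid, tid;
 *	__u64 *p = (__u64 *)(hdr + 1);
 *
 *	switch (hdr->type) {
 *	case PERF_RECORD_SAMPLE:
 *		if (sample_type & PERF_SAMPLE_IP)
 *			ip = *p++;
 *		if (sample_type & PERF_SAMPLE_TID) {
 *			__u32 *t = (__u32 *)p;
 *			pid = t[0];
 *			tid = t[1];
 *			p++;
 *		}
 *		break;
 *	case PERF_RECORD_MMAP:
 *	case PERF_RECORD_COMM:
 *	case PERF_RECORD_LOST:
 *	default:
 *		break;
 *	}
 */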
460
461enum perf_callchain_context {
462	PERF_CONTEXT_HV			= (__u64)-32,
463	PERF_CONTEXT_KERNEL		= (__u64)-128,
464	PERF_CONTEXT_USER		= (__u64)-512,
465
466	PERF_CONTEXT_GUEST		= (__u64)-2048,
467	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
468	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,
469
470	PERF_CONTEXT_MAX		= (__u64)-4095,
471};
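
/*
 * These values are stored inline in the ips[] array of a
 * PERF_SAMPLE_CALLCHAIN body and mark which context the following
 * entries belong to (kernel, user, guest, ...). Since they sit at the
 * very top of the u64 range, a walker can treat anything >=
 * PERF_CONTEXT_MAX as a context marker rather than an instruction
 * pointer; handle_frame() below is just a placeholder for whatever the
 * consumer does with a frame (a sketch):
 *
 *	for (i = 0; i < nr; i++) {
 *		if (ips[i] >= (__u64)PERF_CONTEXT_MAX)
 *			context = ips[i];
 *		else
 *			handle_frame(context, ips[i]);
 *	}
 */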
472
473#define PERF_FLAG_FD_NO_GROUP		(1U << 0)
474#define PERF_FLAG_FD_OUTPUT		(1U << 1)
475#define PERF_FLAG_PID_CGROUP		(1U << 2) /* pid=cgroup id, per-cpu mode only */
476
477#ifdef __KERNEL__
478/*
479 * Kernel-internal data types and definitions:
480 */
481
482#ifdef CONFIG_PERF_EVENTS
483# include <linux/cgroup.h>
484# include <asm/perf_event.h>
485# include <asm/local64.h>
486#endif
487
488struct perf_guest_info_callbacks {
489	int				(*is_in_guest)(void);
490	int				(*is_user_mode)(void);
491	unsigned long			(*get_guest_ip)(void);
492};
493
494#ifdef CONFIG_HAVE_HW_BREAKPOINT
495#include <asm/hw_breakpoint.h>
496#endif
497
498#include <linux/list.h>
499#include <linux/mutex.h>
500#include <linux/rculist.h>
501#include <linux/rcupdate.h>
502#include <linux/spinlock.h>
503#include <linux/hrtimer.h>
504#include <linux/fs.h>
505#include <linux/pid_namespace.h>
506#include <linux/workqueue.h>
507#include <linux/ftrace.h>
508#include <linux/cpu.h>
509#include <linux/irq_work.h>
510#include <linux/jump_label.h>
511#include <asm/atomic.h>
512#include <asm/local.h>
513
514#define PERF_MAX_STACK_DEPTH		255
515
516struct perf_callchain_entry {
517	__u64				nr;
518	__u64				ip[PERF_MAX_STACK_DEPTH];
519};
520
521struct perf_raw_record {
522	u32				size;
523	void				*data;
524};
525
526struct perf_branch_entry {
527	__u64				from;
528	__u64				to;
529	__u64				flags;
530};
531
532struct perf_branch_stack {
533	__u64				nr;
534	struct perf_branch_entry	entries[0];
535};
536
537struct task_struct;
538
539/**
540 * struct hw_perf_event - performance event hardware details:
541 */
542struct hw_perf_event {
543#ifdef CONFIG_PERF_EVENTS
544	union {
545		struct { /* hardware */
546			u64		config;
547			u64		last_tag;
548			unsigned long	config_base;
549			unsigned long	event_base;
550			int		idx;
551			int		last_cpu;
552			unsigned int	extra_reg;
553			u64		extra_config;
554			int		extra_alloc;
555		};
556		struct { /* software */
557			struct hrtimer	hrtimer;
558		};
559#ifdef CONFIG_HAVE_HW_BREAKPOINT
560		struct { /* breakpoint */
561			struct arch_hw_breakpoint	info;
562			struct list_head		bp_list;
563			/*
564			 * Crufty hack to avoid the chicken and egg
565			 * problem hw_breakpoint has with context
566			 * creation and event initialization.
567			 */
568			struct task_struct		*bp_target;
569		};
570#endif
571	};
572	int				state;
573	local64_t			prev_count;
574	u64				sample_period;
575	u64				last_period;
576	local64_t			period_left;
577	u64				interrupts;
578
579	u64				freq_time_stamp;
580	u64				freq_count_stamp;
581#endif
582};
583
584/*
585 * hw_perf_event::state flags
586 */
587#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
588#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
589#define PERF_HES_ARCH		0x04
590
591struct perf_event;
592
593/*
594 * Common implementation detail of pmu::{start,commit,cancel}_txn
595 */
596#define PERF_EVENT_TXN 0x1
597
598/**
599 * struct pmu - generic performance monitoring unit
600 */
601struct pmu {
602	struct list_head		entry;
603
604	struct device			*dev;
605	char				*name;
606	int				type;
607
608	int * __percpu			pmu_disable_count;
609	struct perf_cpu_context * __percpu pmu_cpu_context;
610	int				task_ctx_nr;
611
612	/*
613	 * Fully disable/enable this PMU, can be used to protect from the PMI
614	 * as well as for lazy/batch writing of the MSRs.
615	 */
616	void (*pmu_enable)		(struct pmu *pmu); /* optional */
617	void (*pmu_disable)		(struct pmu *pmu); /* optional */
618
619	/*
620	 * Try and initialize the event for this PMU.
621	 * Should return -ENOENT when the @event doesn't match this PMU.
622	 */
623	int (*event_init)		(struct perf_event *event);
624
625#define PERF_EF_START	0x01		/* start the counter when adding    */
626#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
627#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
628
629	/*
630	 * Adds/Removes a counter to/from the PMU; this can be done inside
631	 * a transaction, see the ->*_txn() methods.
632	 */
633	int  (*add)			(struct perf_event *event, int flags);
634	void (*del)			(struct perf_event *event, int flags);
635
636	/*
637	 * Starts/Stops a counter present on the PMU. The PMI handler
638	 * should stop the counter when perf_event_overflow() returns
639	 * !0. ->start() will be used to continue.
640	 */
641	void (*start)			(struct perf_event *event, int flags);
642	void (*stop)			(struct perf_event *event, int flags);
643
644	/*
645	 * Updates the counter value of the event.
646	 */
647	void (*read)			(struct perf_event *event);
648
649	/*
650	 * Group event scheduling is treated as a transaction: add the
651	 * group's events as a whole and perform one schedulability test.
652	 * If the test fails, roll back the whole group.
653	 *
654	 * Start the transaction, after this ->add() doesn't need to
655	 * do schedulability tests.
656	 */
657	void (*start_txn)		(struct pmu *pmu); /* optional */
658	/*
659	 * If ->start_txn() disabled the ->add() schedulability test
660	 * then ->commit_txn() is required to perform one. On success
661	 * the transaction is closed. On error the transaction is kept
662	 * open until ->cancel_txn() is called.
663	 */
664	int  (*commit_txn)		(struct pmu *pmu); /* optional */
665	/*
666	 * Will cancel the transaction, assumes ->del() is called
667	 * for each successful ->add() during the transaction.
668	 */
669	void (*cancel_txn)		(struct pmu *pmu); /* optional */
670};
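
/*
 * A minimal PMU registration sketch wired up to the callbacks above;
 * the my_* names are illustrative, and a real driver also has to
 * maintain its hw_perf_event state and honour the PERF_EF_* flags
 * (my_del/my_start/my_stop/my_read elided):
 *
 *	static struct pmu my_pmu;
 *
 *	static int my_event_init(struct perf_event *event)
 *	{
 *		if (event->attr.type != my_pmu.type)
 *			return -ENOENT;
 *		return 0;
 *	}
 *
 *	static int my_add(struct perf_event *event, int flags)
 *	{
 *		if (flags & PERF_EF_START)
 *			my_start(event, PERF_EF_RELOAD);
 *		return 0;
 *	}
 *
 *	static struct pmu my_pmu = {
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	perf_pmu_register(&my_pmu, "my_pmu", -1);
 *
 * A negative type asks the core to assign a dynamic perf_type_id to
 * the new PMU.
 */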
671
672/**
673 * enum perf_event_active_state - the states of an event
674 */
675enum perf_event_active_state {
676	PERF_EVENT_STATE_ERROR		= -2,
677	PERF_EVENT_STATE_OFF		= -1,
678	PERF_EVENT_STATE_INACTIVE	=  0,
679	PERF_EVENT_STATE_ACTIVE		=  1,
680};
681
682struct file;
683
684#define PERF_BUFFER_WRITABLE		0x01
685
686struct perf_buffer {
687	atomic_t			refcount;
688	struct rcu_head			rcu_head;
689#ifdef CONFIG_PERF_USE_VMALLOC
690	struct work_struct		work;
691	int				page_order;	/* allocation order  */
692#endif
693	int				nr_pages;	/* nr of data pages  */
694	int				writable;	/* are we writable   */
695
696	atomic_t			poll;		/* POLL_ for wakeups */
697
698	local_t				head;		/* write position    */
699	local_t				nest;		/* nested writers    */
700	local_t				events;		/* event limit       */
701	local_t				wakeup;		/* wakeup stamp      */
702	local_t				lost;		/* nr records lost   */
703
704	long				watermark;	/* wakeup watermark  */
705
706	struct perf_event_mmap_page	*user_page;
707	void				*data_pages[0];
708};
709
710struct perf_sample_data;
711
712typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
713					struct perf_sample_data *,
714					struct pt_regs *regs);
715
716enum perf_group_flag {
717	PERF_GROUP_SOFTWARE		= 0x1,
718};
719
720#define SWEVENT_HLIST_BITS		8
721#define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
722
723struct swevent_hlist {
724	struct hlist_head		heads[SWEVENT_HLIST_SIZE];
725	struct rcu_head			rcu_head;
726};
727
728#define PERF_ATTACH_CONTEXT	0x01
729#define PERF_ATTACH_GROUP	0x02
730#define PERF_ATTACH_TASK	0x04
731
732#ifdef CONFIG_CGROUP_PERF
733/*
734 * perf_cgroup_info keeps track of time_enabled for a cgroup.
735 * This is a per-cpu dynamically allocated data structure.
736 */
737struct perf_cgroup_info {
738	u64				time;
739	u64				timestamp;
740};
741
742struct perf_cgroup {
743	struct				cgroup_subsys_state css;
744	struct				perf_cgroup_info *info;	/* timing info, one per cpu */
745};
746#endif
747
748/**
749 * struct perf_event - performance event kernel representation:
750 */
751struct perf_event {
752#ifdef CONFIG_PERF_EVENTS
753	struct list_head		group_entry;
754	struct list_head		event_entry;
755	struct list_head		sibling_list;
756	struct hlist_node		hlist_entry;
757	int				nr_siblings;
758	int				group_flags;
759	struct perf_event		*group_leader;
760	struct pmu			*pmu;
761
762	enum perf_event_active_state	state;
763	unsigned int			attach_state;
764	local64_t			count;
765	atomic64_t			child_count;
766
767	/*
768	 * These are the total time in nanoseconds that the event
769	 * has been enabled (i.e. eligible to run, and the task has
770	 * been scheduled in, if this is a per-task event)
771	 * and running (scheduled onto the CPU), respectively.
772	 *
773	 * They are computed from tstamp_enabled, tstamp_running and
774	 * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
775	 */
776	u64				total_time_enabled;
777	u64				total_time_running;
778
779	/*
780	 * These are timestamps used for computing total_time_enabled
781	 * and total_time_running when the event is in INACTIVE or
782	 * ACTIVE state, measured in nanoseconds from an arbitrary point
783	 * in time.
784	 * tstamp_enabled: the notional time when the event was enabled
785	 * tstamp_running: the notional time when the event was scheduled on
786	 * tstamp_stopped: in INACTIVE state, the notional time when the
787	 *	event was scheduled off.
788	 */
789	u64				tstamp_enabled;
790	u64				tstamp_running;
791	u64				tstamp_stopped;
792
793	/*
794	 * timestamp shadows the actual context timing but it can
795	 * be safely used in NMI interrupt context. It reflects the
796	 * context time as it was when the event was last scheduled in.
797	 *
798	 * ctx_time already accounts for ctx->timestamp. Therefore to
799	 * compute ctx_time for a sample, simply add perf_clock().
800	 */
801	u64				shadow_ctx_time;
802
803	struct perf_event_attr		attr;
804	u16				header_size;
805	u16				id_header_size;
806	u16				read_size;
807	struct hw_perf_event		hw;
808
809	struct perf_event_context	*ctx;
810	struct file			*filp;
811
812	/*
813	 * These accumulate total time (in nanoseconds) that children
814	 * events have been enabled and running, respectively.
815	 */
816	atomic64_t			child_total_time_enabled;
817	atomic64_t			child_total_time_running;
818
819	/*
820	 * Protect attach/detach and child_list:
821	 */
822	struct mutex			child_mutex;
823	struct list_head		child_list;
824	struct perf_event		*parent;
825
826	int				oncpu;
827	int				cpu;
828
829	struct list_head		owner_entry;
830	struct task_struct		*owner;
831
832	/* mmap bits */
833	struct mutex			mmap_mutex;
834	atomic_t			mmap_count;
835	int				mmap_locked;
836	struct user_struct		*mmap_user;
837	struct perf_buffer		*buffer;
838
839	/* poll related */
840	wait_queue_head_t		waitq;
841	struct fasync_struct		*fasync;
842
843	/* delayed work for NMIs and such */
844	int				pending_wakeup;
845	int				pending_kill;
846	int				pending_disable;
847	struct irq_work			pending;
848
849	atomic_t			event_limit;
850
851	void (*destroy)(struct perf_event *);
852	struct rcu_head			rcu_head;
853
854	struct pid_namespace		*ns;
855	u64				id;
856
857	perf_overflow_handler_t		overflow_handler;
858
859#ifdef CONFIG_EVENT_TRACING
860	struct ftrace_event_call	*tp_event;
861	struct event_filter		*filter;
862#endif
863
864#ifdef CONFIG_CGROUP_PERF
865	struct perf_cgroup		*cgrp; /* cgroup the event is attached to */
866	int				cgrp_defer_enabled;
867#endif
868
869#endif /* CONFIG_PERF_EVENTS */
870};
871
872enum perf_event_context_type {
873	task_context,
874	cpu_context,
875};
876
877/**
878 * struct perf_event_context - event context structure
879 *
880 * Used as a container for task events and CPU events as well:
881 */
882struct perf_event_context {
883	struct pmu			*pmu;
884	enum perf_event_context_type	type;
885	/*
886	 * Protect the states of the events in the list,
887	 * nr_active, and the list:
888	 */
889	raw_spinlock_t			lock;
890	/*
891	 * Protect the list of events.  Locking either mutex or lock
892	 * is sufficient to ensure the list doesn't change; to change
893	 * the list you need to lock both the mutex and the spinlock.
894	 */
895	struct mutex			mutex;
896
897	struct list_head		pinned_groups;
898	struct list_head		flexible_groups;
899	struct list_head		event_list;
900	int				nr_events;
901	int				nr_active;
902	int				is_active;
903	int				nr_stat;
904	int				rotate_disable;
905	atomic_t			refcount;
906	struct task_struct		*task;
907
908	/*
909	 * Context clock, runs when context enabled.
910	 */
911	u64				time;
912	u64				timestamp;
913
914	/*
915	 * These fields let us detect when two contexts have both
916	 * been cloned (inherited) from a common ancestor.
917	 */
918	struct perf_event_context	*parent_ctx;
919	u64				parent_gen;
920	u64				generation;
921	int				pin_count;
922	struct rcu_head			rcu_head;
923	int				nr_cgroups; /* cgroup events present */
924};
925
926/*
927 * Number of contexts where an event can trigger:
928 *	task, softirq, hardirq, nmi.
929 */
930#define PERF_NR_CONTEXTS	4
931
932/**
933 * struct perf_cpu_context - per cpu event context structure
934 */
935struct perf_cpu_context {
936	struct perf_event_context	ctx;
937	struct perf_event_context	*task_ctx;
938	int				active_oncpu;
939	int				exclusive;
940	struct list_head		rotation_list;
941	int				jiffies_interval;
942	struct pmu			*active_pmu;
943	struct perf_cgroup		*cgrp;
944};
945
946struct perf_output_handle {
947	struct perf_event		*event;
948	struct perf_buffer		*buffer;
949	unsigned long			wakeup;
950	unsigned long			size;
951	void				*addr;
952	int				page;
953	int				nmi;
954	int				sample;
955};
956
957#ifdef CONFIG_PERF_EVENTS
958
959extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
960extern void perf_pmu_unregister(struct pmu *pmu);
961
962extern int perf_num_counters(void);
963extern const char *perf_pmu_name(void);
964extern void __perf_event_task_sched_in(struct task_struct *task);
965extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
966extern int perf_event_init_task(struct task_struct *child);
967extern void perf_event_exit_task(struct task_struct *child);
968extern void perf_event_free_task(struct task_struct *task);
969extern void perf_event_delayed_put(struct task_struct *task);
970extern void perf_event_print_debug(void);
971extern void perf_pmu_disable(struct pmu *pmu);
972extern void perf_pmu_enable(struct pmu *pmu);
973extern int perf_event_task_disable(void);
974extern int perf_event_task_enable(void);
975extern void perf_event_update_userpage(struct perf_event *event);
976extern int perf_event_release_kernel(struct perf_event *event);
977extern struct perf_event *
978perf_event_create_kernel_counter(struct perf_event_attr *attr,
979				int cpu,
980				struct task_struct *task,
981				perf_overflow_handler_t callback);
982extern u64 perf_event_read_value(struct perf_event *event,
983				 u64 *enabled, u64 *running);
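
/*
 * Example of in-kernel usage of the interfaces above (a sketch;
 * IS_ERR() checking of the returned event is omitted):
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_HARDWARE,
 *		.config	= PERF_COUNT_HW_CPU_CYCLES,
 *		.size	= sizeof(attr),
 *		.pinned	= 1,
 *	};
 *	struct perf_event *event;
 *	u64 count, enabled, running;
 *
 *	event = perf_event_create_kernel_counter(&attr, smp_processor_id(),
 *						 NULL, NULL);
 *	...
 *	count = perf_event_read_value(event, &enabled, &running);
 *	perf_event_release_kernel(event);
 *
 * A NULL task makes this a per-cpu event and a NULL callback selects
 * the default overflow handling.
 */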
984
985struct perf_sample_data {
986	u64				type;
987
988	u64				ip;
989	struct {
990		u32	pid;
991		u32	tid;
992	}				tid_entry;
993	u64				time;
994	u64				addr;
995	u64				id;
996	u64				stream_id;
997	struct {
998		u32	cpu;
999		u32	reserved;
1000	}				cpu_entry;
1001	u64				period;
1002	struct perf_callchain_entry	*callchain;
1003	struct perf_raw_record		*raw;
1004};
1005
1006static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
1007{
1008	data->addr = addr;
1009	data->raw  = NULL;
1010}
1011
1012extern void perf_output_sample(struct perf_output_handle *handle,
1013			       struct perf_event_header *header,
1014			       struct perf_sample_data *data,
1015			       struct perf_event *event);
1016extern void perf_prepare_sample(struct perf_event_header *header,
1017				struct perf_sample_data *data,
1018				struct perf_event *event,
1019				struct pt_regs *regs);
1020
1021extern int perf_event_overflow(struct perf_event *event, int nmi,
1022				 struct perf_sample_data *data,
1023				 struct pt_regs *regs);
1024
1025static inline bool is_sampling_event(struct perf_event *event)
1026{
1027	return event->attr.sample_period != 0;
1028}
1029
1030/*
1031 * Return 1 for a software event, 0 for a hardware event
1032 */
1033static inline int is_software_event(struct perf_event *event)
1034{
1035	return event->pmu->task_ctx_nr == perf_sw_context;
1036}
1037
1038extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
1039
1040extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
1041
1042#ifndef perf_arch_fetch_caller_regs
1043static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
1044#endif
1045
1046/*
1047 * Take a snapshot of the regs. Skip ip and frame pointer to
1048 * the nth caller. We only need a few of the regs:
1049 * - ip for PERF_SAMPLE_IP
1050 * - cs for user_mode() tests
1051 * - bp for callchains
1052 * - eflags, for future purposes, just in case
1053 */
1054static inline void perf_fetch_caller_regs(struct pt_regs *regs)
1055{
1056	memset(regs, 0, sizeof(*regs));
1057
1058	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
1059}
1060
1061static __always_inline void
1062perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
1063{
1064	struct pt_regs hot_regs;
1065
1066	if (static_branch(&perf_swevent_enabled[event_id])) {
1067		if (!regs) {
1068			perf_fetch_caller_regs(&hot_regs);
1069			regs = &hot_regs;
1070		}
1071		__perf_sw_event(event_id, nr, nmi, regs, addr);
1072	}
1073}
1074
1075extern struct jump_label_key perf_sched_events;
1076
1077static inline void perf_event_task_sched_in(struct task_struct *task)
1078{
1079	if (static_branch(&perf_sched_events))
1080		__perf_event_task_sched_in(task);
1081}
1082
1083static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
1084{
1085	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1086
1087	__perf_event_task_sched_out(task, next);
1088}
1089
1090extern void perf_event_mmap(struct vm_area_struct *vma);
1091extern struct perf_guest_info_callbacks *perf_guest_cbs;
1092extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
1093extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
1094
1095extern void perf_event_comm(struct task_struct *tsk);
1096extern void perf_event_fork(struct task_struct *tsk);
1097
1098/* Callchains */
1099DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
1100
1101extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
1102extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
1103
1104static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
1105{
1106	if (entry->nr < PERF_MAX_STACK_DEPTH)
1107		entry->ip[entry->nr++] = ip;
1108}
1109
1110extern int sysctl_perf_event_paranoid;
1111extern int sysctl_perf_event_mlock;
1112extern int sysctl_perf_event_sample_rate;
1113
1114extern int perf_proc_update_handler(struct ctl_table *table, int write,
1115		void __user *buffer, size_t *lenp,
1116		loff_t *ppos);
1117
1118static inline bool perf_paranoid_tracepoint_raw(void)
1119{
1120	return sysctl_perf_event_paranoid > -1;
1121}
1122
1123static inline bool perf_paranoid_cpu(void)
1124{
1125	return sysctl_perf_event_paranoid > 0;
1126}
1127
1128static inline bool perf_paranoid_kernel(void)
1129{
1130	return sysctl_perf_event_paranoid > 1;
1131}
1132
1133extern void perf_event_init(void);
1134extern void perf_tp_event(u64 addr, u64 count, void *record,
1135			  int entry_size, struct pt_regs *regs,
1136			  struct hlist_head *head, int rctx);
1137extern void perf_bp_event(struct perf_event *event, void *data);
1138
1139#ifndef perf_misc_flags
1140# define perf_misc_flags(regs) \
1141		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
1142# define perf_instruction_pointer(regs)	instruction_pointer(regs)
1143#endif
1144
1145extern int perf_output_begin(struct perf_output_handle *handle,
1146			     struct perf_event *event, unsigned int size,
1147			     int nmi, int sample);
1148extern void perf_output_end(struct perf_output_handle *handle);
1149extern void perf_output_copy(struct perf_output_handle *handle,
1150			     const void *buf, unsigned int len);
1151extern int perf_swevent_get_recursion_context(void);
1152extern void perf_swevent_put_recursion_context(int rctx);
1153extern void perf_event_enable(struct perf_event *event);
1154extern void perf_event_disable(struct perf_event *event);
1155extern void perf_event_task_tick(void);
1156#else
1157static inline void
1158perf_event_task_sched_in(struct task_struct *task)			{ }
1159static inline void
1160perf_event_task_sched_out(struct task_struct *task,
1161			    struct task_struct *next)			{ }
1162static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
1163static inline void perf_event_exit_task(struct task_struct *child)	{ }
1164static inline void perf_event_free_task(struct task_struct *task)	{ }
1165static inline void perf_event_delayed_put(struct task_struct *task)	{ }
1166static inline void perf_event_print_debug(void)				{ }
1167static inline int perf_event_task_disable(void)				{ return -EINVAL; }
1168static inline int perf_event_task_enable(void)				{ return -EINVAL; }
1169
1170static inline void
1171perf_sw_event(u32 event_id, u64 nr, int nmi,
1172		     struct pt_regs *regs, u64 addr)			{ }
1173static inline void
1174perf_bp_event(struct perf_event *event, void *data)			{ }
1175
1176static inline int perf_register_guest_info_callbacks
1177(struct perf_guest_info_callbacks *callbacks)				{ return 0; }
1178static inline int perf_unregister_guest_info_callbacks
1179(struct perf_guest_info_callbacks *callbacks)				{ return 0; }
1180
1181static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
1182static inline void perf_event_comm(struct task_struct *tsk)		{ }
1183static inline void perf_event_fork(struct task_struct *tsk)		{ }
1184static inline void perf_event_init(void)				{ }
1185static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
1186static inline void perf_swevent_put_recursion_context(int rctx)		{ }
1187static inline void perf_event_enable(struct perf_event *event)		{ }
1188static inline void perf_event_disable(struct perf_event *event)		{ }
1189static inline void perf_event_task_tick(void)				{ }
1190#endif
1191
1192#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
1193
1194/*
1195 * This has to have a higher priority than migration_notifier in sched.c.
1196 */
1197#define perf_cpu_notifier(fn)						\
1198do {									\
1199	static struct notifier_block fn##_nb __cpuinitdata =		\
1200		{ .notifier_call = fn, .priority = CPU_PRI_PERF };	\
1201	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,			\
1202		(void *)(unsigned long)smp_processor_id());		\
1203	fn(&fn##_nb, (unsigned long)CPU_STARTING,			\
1204		(void *)(unsigned long)smp_processor_id());		\
1205	fn(&fn##_nb, (unsigned long)CPU_ONLINE,				\
1206		(void *)(unsigned long)smp_processor_id());		\
1207	register_cpu_notifier(&fn##_nb);				\
1208} while (0)
1209
1210#endif /* __KERNEL__ */
1211#endif /* _LINUX_PERF_EVENT_H */
1212