1/*
2 * Performance events:
3 *
4 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
5 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
6 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
7 *
8 * Data type definitions, declarations, prototypes.
9 *
10 *    Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#ifndef _LINUX_PERF_EVENT_H
15#define _LINUX_PERF_EVENT_H
16
17/* ANDROID_CHANGE_BEGIN */
18#ifndef __APPLE__
19/* Suppress kernel-name space pollution in <linux/types.h> below */
20#include <features.h>
21#include <linux/types.h>
22#include <linux/ioctl.h>
23#include <asm/byteorder.h>
24#else
25#include "../types.h"
26#endif
27/* ANDROID_CHANGE_END */
28
29/*
30 * User-space ABI bits:
31 */
32
33/*
34 * attr.type
35 */
enum perf_type_id {
	/* Values 0..5 are user-space ABI and are therefore spelled out. */
	PERF_TYPE_HARDWARE	= 0,
	PERF_TYPE_SOFTWARE	= 1,
	PERF_TYPE_TRACEPOINT	= 2,
	PERF_TYPE_HW_CACHE	= 3,
	PERF_TYPE_RAW		= 4,
	PERF_TYPE_BREAKPOINT	= 5,

	/* One past the last type above; non-ABI. */
	PERF_TYPE_MAX,
};
46
/*
 * Generalized hardware event ids for the sys_perf_event_open() syscall.
 *
 * NOTE(review): this comment historically named "attr.event_id" as the
 * field carrying these values, but struct perf_event_attr in this file has
 * no such member; presumably the id is passed in attr.config -- confirm.
 */
enum perf_hw_id {
	/* Common hardware events, generalized by the kernel. */
	PERF_COUNT_HW_CPU_CYCLES		= 0,
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
	PERF_COUNT_HW_CACHE_MISSES		= 3,
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
	PERF_COUNT_HW_BUS_CYCLES		= 6,
	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,

	/* One past the last event id above; non-ABI. */
	PERF_COUNT_HW_MAX,
};
68
69/*
70 * Generalized hardware cache events:
71 *
72 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
73 *       { read, write, prefetch } x
74 *       { accesses, misses }
75 */
enum perf_hw_cache_id {
	/* First axis of the generalized cache event: which unit. */
	PERF_COUNT_HW_CACHE_L1D		= 0,
	PERF_COUNT_HW_CACHE_L1I		= 1,
	PERF_COUNT_HW_CACHE_LL		= 2,
	PERF_COUNT_HW_CACHE_DTLB	= 3,
	PERF_COUNT_HW_CACHE_ITLB	= 4,
	PERF_COUNT_HW_CACHE_BPU		= 5,

	/* One past the last unit above; non-ABI. */
	PERF_COUNT_HW_CACHE_MAX,
};
86
/* Second axis of the generalized cache event: the kind of operation. */
enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ	= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,

	/* One past the last operation above; non-ABI. */
	PERF_COUNT_HW_CACHE_OP_MAX,
};
94
/* Third axis of the generalized cache event: accesses or misses. */
enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,

	/* One past the last result above; non-ABI. */
	PERF_COUNT_HW_CACHE_RESULT_MAX,
};
101
102/*
103 * Special "software" events provided by the kernel, even if the hardware
104 * does not support performance events. These events measure various
105 * physical and sw events of the kernel (and allow the profiling of them as
106 * well):
107 */
enum perf_sw_ids {
	/* Event ids 0..8 are user-space ABI and are therefore spelled out. */
	PERF_COUNT_SW_CPU_CLOCK		= 0,
	PERF_COUNT_SW_TASK_CLOCK	= 1,
	PERF_COUNT_SW_PAGE_FAULTS	= 2,
	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
	PERF_COUNT_SW_ALIGNMENT_FAULTS	= 7,
	PERF_COUNT_SW_EMULATION_FAULTS	= 8,

	/* One past the last event id above; non-ABI. */
	PERF_COUNT_SW_MAX,
};
121
122/*
123 * Bits that can be set in attr.sample_type to request information
124 * in the overflow packets.
125 */
enum perf_event_sample_format {
	/*
	 * One bit per optional field of a PERF_RECORD_SAMPLE record; the
	 * field each bit enables is shown in the record layout documented
	 * at PERF_RECORD_SAMPLE.
	 */
	PERF_SAMPLE_IP		= 1U << 0,	/* u64 ip            */
	PERF_SAMPLE_TID		= 1U << 1,	/* u32 pid, tid      */
	PERF_SAMPLE_TIME	= 1U << 2,	/* u64 time          */
	PERF_SAMPLE_ADDR	= 1U << 3,	/* u64 addr          */
	PERF_SAMPLE_READ	= 1U << 4,	/* struct read_format */
	PERF_SAMPLE_CALLCHAIN	= 1U << 5,	/* u64 nr, ips[nr]   */
	PERF_SAMPLE_ID		= 1U << 6,	/* u64 id            */
	PERF_SAMPLE_CPU		= 1U << 7,	/* u32 cpu, res      */
	PERF_SAMPLE_PERIOD	= 1U << 8,	/* u64 period        */
	PERF_SAMPLE_STREAM_ID	= 1U << 9,	/* u64 stream_id     */
	PERF_SAMPLE_RAW		= 1U << 10,	/* u32 size, data[]  */

	/* First unused bit; non-ABI. */
	PERF_SAMPLE_MAX		= 1U << 11,
};
141
142/*
143 * The format of the data returned by read() on a perf event fd,
144 * as specified by attr.read_format:
145 *
146 * struct read_format {
147 *	{ u64		value;
148 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
149 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
150 *	  { u64		id;           } && PERF_FORMAT_ID
151 *	} && !PERF_FORMAT_GROUP
152 *
153 *	{ u64		nr;
154 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
155 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
156 *	  { u64		value;
157 *	    { u64	id;           } && PERF_FORMAT_ID
158 *	  }		cntr[nr];
159 *	} && PERF_FORMAT_GROUP
160 * };
161 */
enum perf_event_read_format {
	/* Each bit adds or reshapes fields of struct read_format (see above). */
	PERF_FORMAT_TOTAL_TIME_ENABLED	= 1U << 0,	/* adds u64 time_enabled */
	PERF_FORMAT_TOTAL_TIME_RUNNING	= 1U << 1,	/* adds u64 time_running */
	PERF_FORMAT_ID			= 1U << 2,	/* adds u64 id per value */
	PERF_FORMAT_GROUP		= 1U << 3,	/* nr + cntr[nr] layout  */

	/* First unused bit; non-ABI. */
	PERF_FORMAT_MAX			= 1U << 4,
};
170
171#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
172
173/*
174 * Hardware event_id to monitor via a performance monitoring event:
175 */
176struct perf_event_attr {
177
178	/*
179	 * Major type: hardware/software/tracepoint/etc.
180	 */
181	__u32			type;
182
183	/*
184	 * Size of the attr structure, for fwd/bwd compat.
185	 */
186	__u32			size;
187
188	/*
189	 * Type specific configuration information.
190	 */
191	__u64			config;
192
193	union {
194		__u64		sample_period;
195		__u64		sample_freq;
196	};
197
198	__u64			sample_type;
199	__u64			read_format;
200
201	__u64			disabled       :  1, /* off by default        */
202				inherit	       :  1, /* children inherit it   */
203				pinned	       :  1, /* must always be on PMU */
204				exclusive      :  1, /* only group on PMU     */
205				exclude_user   :  1, /* don't count user      */
206				exclude_kernel :  1, /* ditto kernel          */
207				exclude_hv     :  1, /* ditto hypervisor      */
208				exclude_idle   :  1, /* don't count when idle */
209				mmap           :  1, /* include mmap data     */
210				comm	       :  1, /* include comm data     */
211				freq           :  1, /* use freq, not period  */
212				inherit_stat   :  1, /* per task counts       */
213				enable_on_exec :  1, /* next exec enables     */
214				task           :  1, /* trace fork/exit       */
215				watermark      :  1, /* wakeup_watermark      */
216				/*
217				 * precise_ip:
218				 *
219				 *  0 - SAMPLE_IP can have arbitrary skid
220				 *  1 - SAMPLE_IP must have constant skid
221				 *  2 - SAMPLE_IP requested to have 0 skid
222				 *  3 - SAMPLE_IP must have 0 skid
223				 *
224				 *  See also PERF_RECORD_MISC_EXACT_IP
225				 */
226				precise_ip     :  2, /* skid constraint       */
227				mmap_data      :  1, /* non-exec mmap data    */
228				sample_id_all  :  1, /* sample_type all events */
229
230				__reserved_1   : 45;
231
232	union {
233		__u32		wakeup_events;	  /* wakeup every n events */
234		__u32		wakeup_watermark; /* bytes before wakeup   */
235	};
236
237	__u32			bp_type;
238	union {
239		__u64		bp_addr;
240		__u64		config1; /* extension of config */
241	};
242	union {
243		__u64		bp_len;
244		__u64		config2; /* extension of config1 */
245	};
246};
247
/*
 * Ioctls that can be done on a perf event fd.
 *
 * Every request uses the '$' ioctl type character with sequence numbers
 * 0..6. Only PERIOD and SET_FILTER are encoded with _IOW (argument types
 * __u64 and char * respectively); the others are plain _IO requests.
 */
#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
258
/*
 * Flag bits for the perf event ioctls.
 * NOTE(review): presumably passed as the ioctl argument to act on a whole
 * event group -- confirm against the ioctl implementation.
 */
enum perf_event_ioc_flags {
	PERF_IOC_FLAG_GROUP	= 1U << 0,
};
262
263/*
264 * Structure of the page that can be mapped via mmap
265 */
266struct perf_event_mmap_page {
267	__u32	version;		/* version number of this structure */
268	__u32	compat_version;		/* lowest version this is compat with */
269
270	/*
271	 * Bits needed to read the hw events in user-space.
272	 *
273	 *   u32 seq;
274	 *   s64 count;
275	 *
276	 *   do {
277	 *     seq = pc->lock;
278	 *
279	 *     barrier()
280	 *     if (pc->index) {
281	 *       count = pmc_read(pc->index - 1);
282	 *       count += pc->offset;
283	 *     } else
284	 *       goto regular_read;
285	 *
286	 *     barrier();
287	 *   } while (pc->lock != seq);
288	 *
289	 * NOTE: for obvious reason this only works on self-monitoring
290	 *       processes.
291	 */
292	__u32	lock;			/* seqlock for synchronization */
293	__u32	index;			/* hardware event identifier */
294	__s64	offset;			/* add to hardware event value */
295	__u64	time_enabled;		/* time event active */
296	__u64	time_running;		/* time event on cpu */
297
298		/*
299		 * Hole for extension of the self monitor capabilities
300		 */
301
302	__u64	__reserved[123];	/* align to 1k */
303
304	/*
305	 * Control data for the mmap() data buffer.
306	 *
307	 * User-space reading the @data_head value should issue an rmb(), on
308	 * SMP capable platforms, after reading this value -- see
309	 * perf_event_wakeup().
310	 *
311	 * When the mapping is PROT_WRITE the @data_tail value should be
312	 * written by userspace to reflect the last read data. In this case
313	 * the kernel will not over-write unread data.
314	 */
315	__u64   data_head;		/* head in the data section */
316	__u64	data_tail;		/* user-space written tail */
317};
318
/*
 * Bits of perf_event_header::misc (NOTE(review): association with the misc
 * field is inferred from the names and from perf_misc_flags() -- confirm).
 *
 * The low three bits, selected by PERF_RECORD_MISC_CPUMODE_MASK, hold one
 * of the cpu-mode values below (an enumeration, not independent flags).
 */
#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
#define PERF_RECORD_MISC_KERNEL			(1 << 0)
#define PERF_RECORD_MISC_USER			(2 << 0)
#define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)

/*
 * Indicates that the content of PERF_SAMPLE_IP points to
 * the actual instruction that triggered the event. See also
 * perf_event_attr::precise_ip.
 */
#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
/*
 * Reserve the last bit to indicate some extended misc field
 */
#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
337
338struct perf_event_header {
339	__u32	type;
340	__u16	misc;
341	__u16	size;
342};
343
enum perf_event_type {

	/*
	 * If perf_event_attr.sample_id_all is set, every record type also
	 * carries the sample_type-selected identity fields (TID, TIME, ID,
	 * CPU, STREAM_ID) described under PERF_RECORD_SAMPLE below. They are
	 * placed after the perf_event_header and the record's own fields,
	 * i.e. at the end of the payload, so that older perf tools can read
	 * a newer perf.data file and simply ignore the extra fields.
	 *
	 * MMAP records describe the PROT_EXEC mappings so that userspace IPs
	 * can be correlated to code. Layout:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	u64				addr;
	 *	u64				len;
	 *	u64				pgoff;
	 *	char				filename[];
	 * };
	 */
	PERF_RECORD_MMAP		= 1,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				id;
	 *	u64				lost;
	 * };
	 */
	PERF_RECORD_LOST		= 2,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	char				comm[];
	 * };
	 */
	PERF_RECORD_COMM		= 3,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_EXIT		= 4,

	/*
	 * THROTTLE and UNTHROTTLE share one layout:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				time;
	 *	u64				id;
	 *	u64				stream_id;
	 * };
	 */
	PERF_RECORD_THROTTLE		= 5,
	PERF_RECORD_UNTHROTTLE		= 6,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_FORK		= 7,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, tid;
	 *
	 *	struct read_format		values;
	 * };
	 */
	PERF_RECORD_READ		= 8,

	/*
	 * Each bracketed group is present only when the named sample_type
	 * bit is set:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
	 *	{ u64			time;     } && PERF_SAMPLE_TIME
	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
	 *	{ u64			id;	  } && PERF_SAMPLE_ID
	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
	 *
	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
	 *
	 *	{ u64			nr,
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 *
	 *	#
	 *	# The RAW record below is opaque data wrt the ABI
	 *	#
	 *	# That is, the ABI makes no promise about the stability
	 *	# of its content; it may vary depending on event,
	 *	# hardware, kernel version and phase of the moon.
	 *	#
	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
	 *	#
	 *
	 *	{ u32			size;
	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
	 * };
	 */
	PERF_RECORD_SAMPLE		= 9,

	/* One past the last record type above; non-ABI. */
	PERF_RECORD_MAX,
};
468
469enum perf_callchain_context {
470	PERF_CONTEXT_HV			= (__u64)-32,
471	PERF_CONTEXT_KERNEL		= (__u64)-128,
472	PERF_CONTEXT_USER		= (__u64)-512,
473
474	PERF_CONTEXT_GUEST		= (__u64)-2048,
475	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
476	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,
477
478	PERF_CONTEXT_MAX		= (__u64)-4095,
479};
480
/*
 * Flag bits.
 * NOTE(review): presumably for the flags argument of sys_perf_event_open()
 * -- confirm against the syscall implementation.
 */
#define PERF_FLAG_FD_NO_GROUP		(1U << 0)
#define PERF_FLAG_FD_OUTPUT		(1U << 1)
#define PERF_FLAG_PID_CGROUP		(1U << 2) /* pid=cgroup id, per-cpu mode only */
484
485#ifdef __KERNEL__
486/*
487 * Kernel-internal data types and definitions:
488 */
489
490#ifdef CONFIG_PERF_EVENTS
491# include <linux/cgroup.h>
492# include <asm/perf_event.h>
493# include <asm/local64.h>
494#endif
495
/*
 * Table of callbacks handed to perf_register_guest_info_callbacks().
 * NOTE(review): presumably supplied by a hypervisor so that samples taken
 * while a guest runs can be attributed -- confirm with the callers.
 */
struct perf_guest_info_callbacks {
	int		(*is_in_guest)(void);
	int		(*is_user_mode)(void);
	unsigned long	(*get_guest_ip)(void);
};
501
502#ifdef CONFIG_HAVE_HW_BREAKPOINT
503#include <asm/hw_breakpoint.h>
504#endif
505
506#include <linux/list.h>
507#include <linux/mutex.h>
508#include <linux/rculist.h>
509#include <linux/rcupdate.h>
510#include <linux/spinlock.h>
511#include <linux/hrtimer.h>
512#include <linux/fs.h>
513#include <linux/pid_namespace.h>
514#include <linux/workqueue.h>
515#include <linux/ftrace.h>
516#include <linux/cpu.h>
517#include <linux/irq_work.h>
518#include <linux/jump_label.h>
519#include <asm/atomic.h>
520#include <asm/local.h>
521
522#define PERF_MAX_STACK_DEPTH		255
523
524struct perf_callchain_entry {
525	__u64				nr;
526	__u64				ip[PERF_MAX_STACK_DEPTH];
527};
528
529struct perf_raw_record {
530	u32				size;
531	void				*data;
532};
533
534struct perf_branch_entry {
535	__u64				from;
536	__u64				to;
537	__u64				flags;
538};
539
540struct perf_branch_stack {
541	__u64				nr;
542	struct perf_branch_entry	entries[0];
543};
544
545struct task_struct;
546
/**
 * struct hw_perf_event - performance event hardware details
 *
 * The leading anonymous union overlays the per-flavour state (hardware
 * counter, software hrtimer, breakpoint); the remaining fields are shared.
 * The whole body is compiled out without CONFIG_PERF_EVENTS.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
	union {
		/* hardware */
		struct {
			u64		config;
			u64		last_tag;
			unsigned long	config_base;
			unsigned long	event_base;
			int		idx;
			int		last_cpu;
			unsigned int	extra_reg;
			u64		extra_config;
			int		extra_alloc;
		};
		/* software */
		struct {
			struct hrtimer	hrtimer;
		};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
		/* breakpoint */
		struct {
			struct arch_hw_breakpoint	info;
			struct list_head		bp_list;
			/*
			 * Crufty hack that works around the chicken and
			 * egg problem hw_breakpoint has between context
			 * creation and event initialization.
			 */
			struct task_struct		*bp_target;
		};
#endif
	};
	/* PERF_HES_* flags */
	int				state;
	local64_t			prev_count;
	u64				sample_period;
	u64				last_period;
	local64_t			period_left;
	u64				interrupts;

	u64				freq_time_stamp;
	u64				freq_count_stamp;
#endif
};
591
/*
 * hw_perf_event::state flags
 *
 * NOTE(review): PERF_HES_ARCH carries no description here; presumably it
 * is left for architecture code to use -- confirm.
 */
#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
#define PERF_HES_ARCH		0x04
598
599struct perf_event;
600
601/*
602 * Common implementation detail of pmu::{start,commit,cancel}_txn
603 */
604#define PERF_EVENT_TXN 0x1
605
606/**
607 * struct pmu - generic performance monitoring unit
608 */
609struct pmu {
610	struct list_head		entry;
611
612	struct device			*dev;
613	char				*name;
614	int				type;
615
616	int * __percpu			pmu_disable_count;
617	struct perf_cpu_context * __percpu pmu_cpu_context;
618	int				task_ctx_nr;
619
620	/*
621	 * Fully disable/enable this PMU, can be used to protect from the PMI
622	 * as well as for lazy/batch writing of the MSRs.
623	 */
624	void (*pmu_enable)		(struct pmu *pmu); /* optional */
625	void (*pmu_disable)		(struct pmu *pmu); /* optional */
626
627	/*
628	 * Try and initialize the event for this PMU.
629	 * Should return -ENOENT when the @event doesn't match this PMU.
630	 */
631	int (*event_init)		(struct perf_event *event);
632
633#define PERF_EF_START	0x01		/* start the counter when adding    */
634#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
635#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
636
637	/*
638	 * Adds/Removes a counter to/from the PMU, can be done inside
639	 * a transaction, see the ->*_txn() methods.
640	 */
641	int  (*add)			(struct perf_event *event, int flags);
642	void (*del)			(struct perf_event *event, int flags);
643
644	/*
645	 * Starts/Stops a counter present on the PMU. The PMI handler
646	 * should stop the counter when perf_event_overflow() returns
647	 * !0. ->start() will be used to continue.
648	 */
649	void (*start)			(struct perf_event *event, int flags);
650	void (*stop)			(struct perf_event *event, int flags);
651
652	/*
653	 * Updates the counter value of the event.
654	 */
655	void (*read)			(struct perf_event *event);
656
657	/*
658	 * Group events scheduling is treated as a transaction, add
659	 * group events as a whole and perform one schedulability test.
660	 * If the test fails, roll back the whole group
661	 *
662	 * Start the transaction, after this ->add() doesn't need to
663	 * do schedulability tests.
664	 */
665	void (*start_txn)		(struct pmu *pmu); /* optional */
666	/*
667	 * If ->start_txn() disabled the ->add() schedulability test
668	 * then ->commit_txn() is required to perform one. On success
669	 * the transaction is closed. On error the transaction is kept
670	 * open until ->cancel_txn() is called.
671	 */
672	int  (*commit_txn)		(struct pmu *pmu); /* optional */
673	/*
674	 * Will cancel the transaction, assumes ->del() is called
675	 * for each successful ->add() during the transaction.
676	 */
677	void (*cancel_txn)		(struct pmu *pmu); /* optional */
678};
679
/**
 * enum perf_event_active_state - the states of an event
 *
 * Stored in perf_event::state; ERROR and OFF are negative, INACTIVE is
 * zero and ACTIVE is positive.
 */
enum perf_event_active_state {
	PERF_EVENT_STATE_ERROR		= -2,
	PERF_EVENT_STATE_OFF		= -1,
	PERF_EVENT_STATE_INACTIVE	=  0,
	PERF_EVENT_STATE_ACTIVE		=  1,
};
689
690struct file;
691
692#define PERF_BUFFER_WRITABLE		0x01
693
694struct perf_buffer {
695	atomic_t			refcount;
696	struct rcu_head			rcu_head;
697#ifdef CONFIG_PERF_USE_VMALLOC
698	struct work_struct		work;
699	int				page_order;	/* allocation order  */
700#endif
701	int				nr_pages;	/* nr of data pages  */
702	int				writable;	/* are we writable   */
703
704	atomic_t			poll;		/* POLL_ for wakeups */
705
706	local_t				head;		/* write position    */
707	local_t				nest;		/* nested writers    */
708	local_t				events;		/* event limit       */
709	local_t				wakeup;		/* wakeup stamp      */
710	local_t				lost;		/* nr records lost   */
711
712	long				watermark;	/* wakeup watermark  */
713
714	struct perf_event_mmap_page	*user_page;
715	void				*data_pages[0];
716};
717
718struct perf_sample_data;
719
/*
 * Signature of perf_event::overflow_handler.
 * NOTE(review): the int argument presumably mirrors the nmi argument of
 * perf_event_overflow() -- confirm.
 */
typedef void (*perf_overflow_handler_t)(
	struct perf_event *,
	int,
	struct perf_sample_data *,
	struct pt_regs *regs);
723
/*
 * Group flag bits.
 * NOTE(review): presumably the values kept in perf_event::group_flags --
 * confirm.
 */
enum perf_group_flag {
	PERF_GROUP_SOFTWARE	= 0x1,
};
727
728#define SWEVENT_HLIST_BITS		8
729#define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
730
731struct swevent_hlist {
732	struct hlist_head		heads[SWEVENT_HLIST_SIZE];
733	struct rcu_head			rcu_head;
734};
735
736#define PERF_ATTACH_CONTEXT	0x01
737#define PERF_ATTACH_GROUP	0x02
738#define PERF_ATTACH_TASK	0x04
739
740#ifdef CONFIG_CGROUP_PERF
741/*
742 * perf_cgroup_info keeps track of time_enabled for a cgroup.
743 * This is a per-cpu dynamically allocated data structure.
744 */
745struct perf_cgroup_info {
746	u64				time;
747	u64				timestamp;
748};
749
750struct perf_cgroup {
751	struct				cgroup_subsys_state css;
752	struct				perf_cgroup_info *info;	/* timing info, one per cpu */
753};
754#endif
755
/**
 * struct perf_event - performance event kernel representation
 *
 * The whole body is compiled out without CONFIG_PERF_EVENTS.
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
	struct list_head		group_entry;
	struct list_head		event_entry;
	struct list_head		sibling_list;
	struct hlist_node		hlist_entry;
	int				nr_siblings;
	int				group_flags;
	struct perf_event		*group_leader;
	struct pmu			*pmu;

	enum perf_event_active_state	state;
	unsigned int			attach_state;	/* PERF_ATTACH_* */
	local64_t			count;
	atomic64_t			child_count;

	/*
	 * Total time, in nanoseconds, that the event has been
	 *  - enabled: eligible to run (and, for a per-task event, with
	 *    the task scheduled in), and
	 *  - running: actually scheduled onto the CPU.
	 *
	 * Both are derived from tstamp_enabled, tstamp_running and
	 * tstamp_stopped while the event is INACTIVE or ACTIVE.
	 */
	u64				total_time_enabled;
	u64				total_time_running;

	/*
	 * Timestamps, in nanoseconds from an arbitrary origin, from which
	 * the totals above are computed while the event is INACTIVE or
	 * ACTIVE:
	 * tstamp_enabled: the notional time when the event was enabled
	 * tstamp_running: the notional time when the event was scheduled on
	 * tstamp_stopped: in INACTIVE state, the notional time when the
	 *	event was scheduled off.
	 */
	u64				tstamp_enabled;
	u64				tstamp_running;
	u64				tstamp_stopped;

	/*
	 * Shadow of the context timing that is safe to use from NMI
	 * context; it reflects the context time as of the last time the
	 * event was scheduled in.
	 *
	 * ctx_time already accounts for ctx->timestamp, so the ctx_time
	 * of a sample is obtained by simply adding perf_clock().
	 */
	u64				shadow_ctx_time;

	struct perf_event_attr		attr;
	u16				header_size;
	u16				id_header_size;
	u16				read_size;
	struct hw_perf_event		hw;

	struct perf_event_context	*ctx;
	struct file			*filp;

	/*
	 * Accumulated time, in nanoseconds, that child events have been
	 * enabled and running, respectively.
	 */
	atomic64_t			child_total_time_enabled;
	atomic64_t			child_total_time_running;

	/*
	 * child_mutex protects attach/detach and child_list.
	 */
	struct mutex			child_mutex;
	struct list_head		child_list;
	struct perf_event		*parent;

	int				oncpu;
	int				cpu;

	struct list_head		owner_entry;
	struct task_struct		*owner;

	/* mmap state */
	struct mutex			mmap_mutex;
	atomic_t			mmap_count;
	int				mmap_locked;
	struct user_struct		*mmap_user;
	struct perf_buffer		*buffer;

	/* poll support */
	wait_queue_head_t		waitq;
	struct fasync_struct		*fasync;

	/* work deferred from NMI and similar contexts */
	int				pending_wakeup;
	int				pending_kill;
	int				pending_disable;
	struct irq_work			pending;

	atomic_t			event_limit;

	void (*destroy)(struct perf_event *);
	struct rcu_head			rcu_head;

	struct pid_namespace		*ns;
	u64				id;

	perf_overflow_handler_t		overflow_handler;

#ifdef CONFIG_EVENT_TRACING
	struct ftrace_event_call	*tp_event;
	struct event_filter		*filter;
#endif

#ifdef CONFIG_CGROUP_PERF
	struct perf_cgroup		*cgrp; /* cgroup the event is attached to */
	int				cgrp_defer_enabled;
#endif

#endif /* CONFIG_PERF_EVENTS */
};
879
/* Kind of context, stored in perf_event_context::type. */
enum perf_event_context_type {
	task_context,	/* implicit value 0 */
	cpu_context,	/* implicit value 1 */
};
884
885/**
886 * struct perf_event_context - event context structure
887 *
888 * Used as a container for task events and CPU events as well:
889 */
890struct perf_event_context {
891	struct pmu			*pmu;
892	enum perf_event_context_type	type;
893	/*
894	 * Protect the states of the events in the list,
895	 * nr_active, and the list:
896	 */
897	raw_spinlock_t			lock;
898	/*
899	 * Protect the list of events.  Locking either mutex or lock
900	 * is sufficient to ensure the list doesn't change; to change
901	 * the list you need to lock both the mutex and the spinlock.
902	 */
903	struct mutex			mutex;
904
905	struct list_head		pinned_groups;
906	struct list_head		flexible_groups;
907	struct list_head		event_list;
908	int				nr_events;
909	int				nr_active;
910	int				is_active;
911	int				nr_stat;
912	int				rotate_disable;
913	atomic_t			refcount;
914	struct task_struct		*task;
915
916	/*
917	 * Context clock, runs when context enabled.
918	 */
919	u64				time;
920	u64				timestamp;
921
922	/*
923	 * These fields let us detect when two contexts have both
924	 * been cloned (inherited) from a common ancestor.
925	 */
926	struct perf_event_context	*parent_ctx;
927	u64				parent_gen;
928	u64				generation;
929	int				pin_count;
930	struct rcu_head			rcu_head;
931	int				nr_cgroups; /* cgroup events present */
932};
933
934/*
935 * Number of contexts where an event can trigger:
936 *	task, softirq, hardirq, nmi.
937 */
938#define PERF_NR_CONTEXTS	4
939
940/**
941 * struct perf_event_cpu_context - per cpu event context structure
942 */
943struct perf_cpu_context {
944	struct perf_event_context	ctx;
945	struct perf_event_context	*task_ctx;
946	int				active_oncpu;
947	int				exclusive;
948	struct list_head		rotation_list;
949	int				jiffies_interval;
950	struct pmu			*active_pmu;
951	struct perf_cgroup		*cgrp;
952};
953
/*
 * State handed to perf_output_begin(), perf_output_copy() and
 * perf_output_end() while one record is written for @event into @buffer.
 */
struct perf_output_handle {
	struct perf_event	*event;
	struct perf_buffer	*buffer;
	unsigned long		wakeup;
	unsigned long		size;
	void			*addr;
	int			page;
	int			nmi;
	int			sample;
};
964
965#ifdef CONFIG_PERF_EVENTS
966
/*
 * Entry points that are only declared here; their definitions are not part
 * of this header.
 */

/* PMU registration. */
extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern int perf_num_counters(void);
extern const char *perf_pmu_name(void);
/* Called by the inline perf_event_task_sched_in()/_out() wrappers below. */
extern void __perf_event_task_sched_in(struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
/* Task lifetime hooks. */
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
extern void perf_event_update_userpage(struct perf_event *event);
/* In-kernel counter API. */
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
				int cpu,
				struct task_struct *task,
				perf_overflow_handler_t callback);
extern u64 perf_event_read_value(struct perf_event *event,
				 u64 *enabled, u64 *running);
992
993struct perf_sample_data {
994	u64				type;
995
996	u64				ip;
997	struct {
998		u32	pid;
999		u32	tid;
1000	}				tid_entry;
1001	u64				time;
1002	u64				addr;
1003	u64				id;
1004	u64				stream_id;
1005	struct {
1006		u32	cpu;
1007		u32	reserved;
1008	}				cpu_entry;
1009	u64				period;
1010	struct perf_callchain_entry	*callchain;
1011	struct perf_raw_record		*raw;
1012};
1013
1014static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
1015{
1016	data->addr = addr;
1017	data->raw  = NULL;
1018}
1019
1020extern void perf_output_sample(struct perf_output_handle *handle,
1021			       struct perf_event_header *header,
1022			       struct perf_sample_data *data,
1023			       struct perf_event *event);
1024extern void perf_prepare_sample(struct perf_event_header *header,
1025				struct perf_sample_data *data,
1026				struct perf_event *event,
1027				struct pt_regs *regs);
1028
1029extern int perf_event_overflow(struct perf_event *event, int nmi,
1030				 struct perf_sample_data *data,
1031				 struct pt_regs *regs);
1032
1033static inline bool is_sampling_event(struct perf_event *event)
1034{
1035	return event->attr.sample_period != 0;
1036}
1037
1038/*
1039 * Return 1 for a software event, 0 for a hardware event
1040 */
1041static inline int is_software_event(struct perf_event *event)
1042{
1043	return event->pmu->task_ctx_nr == perf_sw_context;
1044}
1045
1046extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
1047
1048extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
1049
#ifndef perf_arch_fetch_caller_regs
/*
 * Empty fallback, used only when perf_arch_fetch_caller_regs has not
 * already been defined as a macro before this point.
 */
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip)
{
}
#endif
1053
1054/*
1055 * Take a snapshot of the regs. Skip ip and frame pointer to
1056 * the nth caller. We only need a few of the regs:
1057 * - ip for PERF_SAMPLE_IP
1058 * - cs for user_mode() tests
1059 * - bp for callchains
1060 * - eflags, for future purposes, just in case
1061 */
1062static inline void perf_fetch_caller_regs(struct pt_regs *regs)
1063{
1064	memset(regs, 0, sizeof(*regs));
1065
1066	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
1067}
1068
1069static __always_inline void
1070perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
1071{
1072	struct pt_regs hot_regs;
1073
1074	if (static_branch(&perf_swevent_enabled[event_id])) {
1075		if (!regs) {
1076			perf_fetch_caller_regs(&hot_regs);
1077			regs = &hot_regs;
1078		}
1079		__perf_sw_event(event_id, nr, nmi, regs, addr);
1080	}
1081}
1082
1083extern struct jump_label_key perf_sched_events;
1084
1085static inline void perf_event_task_sched_in(struct task_struct *task)
1086{
1087	if (static_branch(&perf_sched_events))
1088		__perf_event_task_sched_in(task);
1089}
1090
1091static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
1092{
1093	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1094
1095	__perf_event_task_sched_out(task, next);
1096}
1097
1098extern void perf_event_mmap(struct vm_area_struct *vma);
1099extern struct perf_guest_info_callbacks *perf_guest_cbs;
1100extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
1101extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
1102
1103extern void perf_event_comm(struct task_struct *tsk);
1104extern void perf_event_fork(struct task_struct *tsk);
1105
1106/* Callchains */
1107DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
1108
1109extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
1110extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
1111
1112static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
1113{
1114	if (entry->nr < PERF_MAX_STACK_DEPTH)
1115		entry->ip[entry->nr++] = ip;
1116}
1117
1118extern int sysctl_perf_event_paranoid;
1119extern int sysctl_perf_event_mlock;
1120extern int sysctl_perf_event_sample_rate;
1121
1122extern int perf_proc_update_handler(struct ctl_table *table, int write,
1123		void __user *buffer, size_t *lenp,
1124		loff_t *ppos);
1125
1126static inline bool perf_paranoid_tracepoint_raw(void)
1127{
1128	return sysctl_perf_event_paranoid > -1;
1129}
1130
1131static inline bool perf_paranoid_cpu(void)
1132{
1133	return sysctl_perf_event_paranoid > 0;
1134}
1135
1136static inline bool perf_paranoid_kernel(void)
1137{
1138	return sysctl_perf_event_paranoid > 1;
1139}
1140
1141extern void perf_event_init(void);
1142extern void perf_tp_event(u64 addr, u64 count, void *record,
1143			  int entry_size, struct pt_regs *regs,
1144			  struct hlist_head *head, int rctx);
1145extern void perf_bp_event(struct perf_event *event, void *data);
1146
/*
 * Fallback definitions, used only when perf_misc_flags has not already
 * been defined (presumably by an architecture header; verify).
 *
 * perf_misc_flags() yields PERF_RECORD_MISC_USER when user_mode(regs)
 * is true and PERF_RECORD_MISC_KERNEL otherwise;
 * perf_instruction_pointer() is a plain alias for instruction_pointer().
 */
#ifndef perf_misc_flags
#define perf_misc_flags(regs)						\
	(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
#define perf_instruction_pointer(regs)					\
	instruction_pointer(regs)
#endif
1152
/* Output API operating on a struct perf_output_handle. */
extern int perf_output_begin(struct perf_output_handle *handle,
			     struct perf_event *event,
			     unsigned int size, int nmi, int sample);
extern void perf_output_copy(struct perf_output_handle *handle,
			     const void *buf, unsigned int len);
extern void perf_output_end(struct perf_output_handle *handle);

/* Recursion context: get returns an int that is later handed to put. */
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);

/* Per-event enable/disable and the tick hook. */
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_task_tick(void);
1164#else
/*
 * #else-branch fallbacks: scheduler and task-lifecycle hooks.
 *
 * The void hooks are empty.  perf_event_init_task() reports success (0);
 * perf_event_task_disable()/perf_event_task_enable() report -EINVAL.
 */
static inline void perf_event_task_sched_in(struct task_struct *task)
{
}

static inline void perf_event_task_sched_out(struct task_struct *task,
					     struct task_struct *next)
{
}

static inline int perf_event_init_task(struct task_struct *child)
{
	return 0;
}

static inline void perf_event_exit_task(struct task_struct *child)
{
}

static inline void perf_event_free_task(struct task_struct *task)
{
}

static inline void perf_event_delayed_put(struct task_struct *task)
{
}

static inline void perf_event_print_debug(void)
{
}

static inline int perf_event_task_disable(void)
{
	return -EINVAL;
}

static inline int perf_event_task_enable(void)
{
	return -EINVAL;
}
1177
1178static inline void
1179perf_sw_event(u32 event_id, u64 nr, int nmi,
1180		     struct pt_regs *regs, u64 addr)			{ }
1181static inline void
1182perf_bp_event(struct perf_event *event, void *data)			{ }
1183
/*
 * #else-branch fallbacks: guest-callback registration and unregistration
 * ignore @callbacks and return 0.
 */
static inline int
perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks)
{
	return 0;
}

static inline int
perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks)
{
	return 0;
}
1188
/*
 * #else-branch fallbacks: every hook is empty, and
 * perf_swevent_get_recursion_context() always returns -1.
 */
static inline void perf_event_mmap(struct vm_area_struct *vma)
{
}

static inline void perf_event_comm(struct task_struct *tsk)
{
}

static inline void perf_event_fork(struct task_struct *tsk)
{
}

static inline void perf_event_init(void)
{
}

static inline int perf_swevent_get_recursion_context(void)
{
	return -1;
}

static inline void perf_swevent_put_recursion_context(int rctx)
{
}

static inline void perf_event_enable(struct perf_event *event)
{
}

static inline void perf_event_disable(struct perf_event *event)
{
}

static inline void perf_event_task_tick(void)
{
}
1198#endif
1199
/*
 * Copy the object @x itself (sizeof(x) bytes starting at &x) through
 * perf_output_copy().  @x must be an lvalue, since its address is taken.
 */
#define perf_output_put(handle, x)					\
	perf_output_copy((handle), &(x), sizeof(x))
1201
/*
 * perf_cpu_notifier(fn) - call @fn for the current CPU, then register it
 *
 * Defines a static notifier_block named fn##_nb whose callback is @fn and
 * whose priority is CPU_PRI_PERF.  @fn is then invoked directly three
 * times, for CPU_UP_PREPARE, CPU_STARTING and CPU_ONLINE in that order,
 * each time with smp_processor_id() cast to a pointer as the data
 * argument.  Finally the block is passed to register_cpu_notifier().
 *
 * This has to have a higher priority than migration_notifier in sched.c.
 */
#define perf_cpu_notifier(fn)						\
do {									\
	static struct notifier_block fn##_nb __cpuinitdata = {		\
		.notifier_call	= fn,					\
		.priority	= CPU_PRI_PERF,				\
	};								\
	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,			\
	   (void *)(unsigned long)smp_processor_id());			\
	fn(&fn##_nb, (unsigned long)CPU_STARTING,			\
	   (void *)(unsigned long)smp_processor_id());			\
	fn(&fn##_nb, (unsigned long)CPU_ONLINE,				\
	   (void *)(unsigned long)smp_processor_id());			\
	register_cpu_notifier(&fn##_nb);				\
} while (0)
1217
1218#endif /* __KERNEL__ */
1219#endif /* _LINUX_PERF_EVENT_H */
1220