1#include "builtin.h"
2#include "perf.h"
3
4#include "util/util.h"
5#include "util/cache.h"
6#include "util/symbol.h"
7#include "util/thread.h"
8#include "util/header.h"
9#include "util/session.h"
10
11#include "util/parse-options.h"
12#include "util/trace-event.h"
13
14#include "util/debug.h"
15
16/* ANDROID_CHANGE_BEGIN */
17#ifndef __APPLE__
18#include <sys/prctl.h>
19#endif
20/* ANDROID_CHANGE_END */
21
22#include <semaphore.h>
23/* ANDROID_CHANGE_BEGIN */
24#ifdef __BIONIC__
25/* PTHREAD_STACK_MIN is defined as (2 * PAGE_SIZE) */
26#include <asm/page.h>
27#endif
28/* ANDROID_CHANGE_END */
29#include <pthread.h>
30#include <math.h>
31
/* Name of the input file; initialised to "perf.data". */
static char			const *input_name = "perf.data";

/* Built-in sort key list; sort_order initially points at it. */
static char			default_sort_order[] = "avg, max, switch, runtime";
static const char		*sort_order = default_sort_order;

/*
 * CPU being profiled. The latency handlers in this file treat -1 as
 * "not restricted to a single CPU".
 */
static int			profile_cpu = -1;

#define PR_SET_NAME		15               /* Set process name */
/* Size of the per-CPU cpu_last_switched[] table. */
#define MAX_CPUS		4096

/* Overheads measured by the calibrate_*_measurement_overhead() helpers. */
static u64			run_measurement_overhead;
static u64			sleep_measurement_overhead;

/* Size of task_desc::comm. */
#define COMM_LEN		20
#define SYM_LEN			129

/* Size of the pid_to_task[] lookup table; register_pid() rejects larger PIDs. */
#define MAX_PID			65536

/* Number of entries currently stored in tasks[]. */
static unsigned long		nr_tasks;

/* Forward declaration: task_desc holds pointers to sched_atom. */
struct sched_atom;
53
/*
 * One replayed task: its identity, the ordered list of scheduling atoms
 * recorded for it, and the thread/semaphores used to replay them.
 */
struct task_desc {
	unsigned long		nr;		/* index in tasks[] (assigned by register_pid) */
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;	/* number of entries in atoms[] */
	unsigned long		curr_event;	/* index of the atom being processed by thread_func */
	struct sched_atom	**atoms;	/* grown one entry at a time by get_new_event */

	pthread_t		thread;		/* replay thread created in create_tasks */
	sem_t			sleep_sem;

	sem_t			ready_for_work;	/* posted by the thread, waited on in wait_for_tasks */
	sem_t			work_done_sem;	/* posted by the thread after one replay pass */

	u64			cpu_usage;	/* task-clock delta measured over one replay pass */
};
71
/* Kinds of replay atoms; process_sched_event() dispatches on these. */
enum sched_event_type {
	SCHED_EVENT_RUN,	/* burn CPU for atom->duration */
	SCHED_EVENT_SLEEP,	/* wait on atom->wait_sem when one is attached */
	SCHED_EVENT_WAKEUP,	/* post atom->wait_sem when one is attached */
	SCHED_EVENT_MIGRATION,	/* no action during replay */
};
78
/* A single replayable scheduling event belonging to one task. */
struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;	/* set to 1 when a wakeup is paired with this sleep */
	u64			timestamp;
	u64			duration;	/* used by RUN atoms */
	unsigned long		nr;		/* index of this atom within its task's atoms[] */
	sem_t			*wait_sem;	/* shared between a SLEEP atom and the WAKEUP that targets it */
	struct task_desc	*wakee;		/* target task of a WAKEUP atom */
};
88
/* PID-indexed lookup of registered tasks (see register_pid). */
static struct task_desc		*pid_to_task[MAX_PID];

/* Dense array of all registered tasks, nr_tasks entries long. */
static struct task_desc		**tasks;

/* Held by the parent to keep replay threads from starting their work. */
static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
static u64			start_time;

/* Held by the parent to keep finished threads from looping early. */
static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Counters of atoms created by the add_sched_event_*() helpers. */
static unsigned long		nr_run_events;
static unsigned long		nr_sleep_events;
static unsigned long		nr_wakeup_events;

static unsigned long		nr_sleep_corrections;
/* RUN events merged into a preceding RUN atom. */
static unsigned long		nr_run_events_optimized;

/* Wakeups whose target had no pending SLEEP atom / already had a waker. */
static unsigned long		targetless_wakeups;
static unsigned long		multitarget_wakeups;

/* CPU usage of the last replay pass and its running averages. */
static u64			cpu_usage;
static u64			runavg_cpu_usage;
static u64			parent_cpu_usage;
static u64			runavg_parent_cpu_usage;

/* Wall-clock statistics accumulated by run_one_test. */
static unsigned long		nr_runs;
static u64			sum_runtime;
static u64			sum_fluct;
static u64			run_avg;

static unsigned int		replay_repeat = 10;
/* Diagnostics counters updated by the latency handlers. */
static unsigned long		nr_timestamps;
static unsigned long		nr_unordered_timestamps;
static unsigned long		nr_state_machine_bugs;
static unsigned long		nr_context_switch_bugs;
static unsigned long		nr_events;
static unsigned long		nr_lost_chunks;
static unsigned long		nr_lost_events;
126
/* Characters used by sched_out_state() to describe a task state. */
#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

/* Latency state machine of a work_atom. */
enum thread_state {
	THREAD_SLEEPING = 0,	/* default state of a zero-allocated atom */
	THREAD_WAIT_CPU,	/* woken up (or preempted), waiting to be scheduled in */
	THREAD_SCHED_IN,	/* scheduled in; latency has been accounted */
	THREAD_IGNORE		/* sched-in timestamp preceded the wakeup; not counted */
};
135
/* One sched-out -> wakeup -> sched-in cycle of a thread. */
struct work_atom {
	struct list_head	list;		/* linked into work_atoms::work_list */
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;	/* runtime accumulated by add_runtime_event */
};
144
/* Per-thread latency bookkeeping, stored in an rbtree. */
struct work_atoms {
	struct list_head	work_list;	/* list of struct work_atom, newest at the tail */
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;	/* largest wakeup-to-sched-in delta seen */
	u64			max_lat_at;	/* timestamp at which max_lat was observed */
	u64			total_lat;	/* sum of all accounted latencies */
	u64			nb_atoms;	/* number of accounted latencies */
	u64			total_runtime;
};
155
/* Comparator over two work_atoms; returns <0, 0 or >0. */
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

/* atom_root is keyed via the cmp_pid list by the latency handlers. */
static struct rb_root		atom_root, sorted_atom_root;

/* Totals accumulated by output_lat_thread. */
static u64			all_runtime;
static u64			all_count;
162
163
164static u64 get_nsecs(void)
165{
166	/* ANDROID_CHANGE_BEGIN */
167#ifndef __APPLE__
168	struct timespec ts;
169
170	clock_gettime(CLOCK_MONOTONIC, &ts);
171
172	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
173#else
174	return 0;
175#endif
176	/* ANDROID_CHANGE_END */
177}
178
179static void burn_nsecs(u64 nsecs)
180{
181	u64 T0 = get_nsecs(), T1;
182
183	do {
184		T1 = get_nsecs();
185	} while (T1 + run_measurement_overhead < T0 + nsecs);
186}
187
188static void sleep_nsecs(u64 nsecs)
189{
190	struct timespec ts;
191
192	ts.tv_nsec = nsecs % 999999999;
193	ts.tv_sec = nsecs / 999999999;
194
195	nanosleep(&ts, NULL);
196}
197
198static void calibrate_run_measurement_overhead(void)
199{
200	u64 T0, T1, delta, min_delta = 1000000000ULL;
201	int i;
202
203	for (i = 0; i < 10; i++) {
204		T0 = get_nsecs();
205		burn_nsecs(0);
206		T1 = get_nsecs();
207		delta = T1-T0;
208		min_delta = min(min_delta, delta);
209	}
210	run_measurement_overhead = min_delta;
211
212	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
213}
214
215static void calibrate_sleep_measurement_overhead(void)
216{
217	u64 T0, T1, delta, min_delta = 1000000000ULL;
218	int i;
219
220	for (i = 0; i < 10; i++) {
221		T0 = get_nsecs();
222		sleep_nsecs(10000);
223		T1 = get_nsecs();
224		delta = T1-T0;
225		min_delta = min(min_delta, delta);
226	}
227	min_delta -= 10000;
228	sleep_measurement_overhead = min_delta;
229
230	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
231}
232
233static struct sched_atom *
234get_new_event(struct task_desc *task, u64 timestamp)
235{
236	struct sched_atom *event = zalloc(sizeof(*event));
237	unsigned long idx = task->nr_events;
238	size_t size;
239
240	event->timestamp = timestamp;
241	event->nr = idx;
242
243	task->nr_events++;
244	size = sizeof(struct sched_atom *) * task->nr_events;
245	task->atoms = realloc(task->atoms, size);
246	BUG_ON(!task->atoms);
247
248	task->atoms[idx] = event;
249
250	return event;
251}
252
253static struct sched_atom *last_event(struct task_desc *task)
254{
255	if (!task->nr_events)
256		return NULL;
257
258	return task->atoms[task->nr_events - 1];
259}
260
261static void
262add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
263{
264	struct sched_atom *event, *curr_event = last_event(task);
265
266	/*
267	 * optimize an existing RUN event by merging this one
268	 * to it:
269	 */
270	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
271		nr_run_events_optimized++;
272		curr_event->duration += duration;
273		return;
274	}
275
276	event = get_new_event(task, timestamp);
277
278	event->type = SCHED_EVENT_RUN;
279	event->duration = duration;
280
281	nr_run_events++;
282}
283
284static void
285add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
286		       struct task_desc *wakee)
287{
288	struct sched_atom *event, *wakee_event;
289
290	event = get_new_event(task, timestamp);
291	event->type = SCHED_EVENT_WAKEUP;
292	event->wakee = wakee;
293
294	wakee_event = last_event(wakee);
295	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
296		targetless_wakeups++;
297		return;
298	}
299	if (wakee_event->wait_sem) {
300		multitarget_wakeups++;
301		return;
302	}
303
304	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
305	sem_init(wakee_event->wait_sem, 0, 0);
306	wakee_event->specific_wait = 1;
307	event->wait_sem = wakee_event->wait_sem;
308
309	nr_wakeup_events++;
310}
311
312static void
313add_sched_event_sleep(struct task_desc *task, u64 timestamp,
314		      u64 task_state __used)
315{
316	struct sched_atom *event = get_new_event(task, timestamp);
317
318	event->type = SCHED_EVENT_SLEEP;
319
320	nr_sleep_events++;
321}
322
323static struct task_desc *register_pid(unsigned long pid, const char *comm)
324{
325	struct task_desc *task;
326
327	BUG_ON(pid >= MAX_PID);
328
329	task = pid_to_task[pid];
330
331	if (task)
332		return task;
333
334	task = zalloc(sizeof(*task));
335	task->pid = pid;
336	task->nr = nr_tasks;
337	strcpy(task->comm, comm);
338	/*
339	 * every task starts in sleeping state - this gets ignored
340	 * if there's no wakeup pointing to this sleep state:
341	 */
342	add_sched_event_sleep(task, 0, 0);
343
344	pid_to_task[pid] = task;
345	nr_tasks++;
346	tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *));
347	BUG_ON(!tasks);
348	tasks[task->nr] = task;
349
350	if (verbose)
351		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
352
353	return task;
354}
355
356
357static void print_task_traces(void)
358{
359	struct task_desc *task;
360	unsigned long i;
361
362	for (i = 0; i < nr_tasks; i++) {
363		task = tasks[i];
364		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
365			task->nr, task->comm, task->pid, task->nr_events);
366	}
367}
368
369static void add_cross_task_wakeups(void)
370{
371	struct task_desc *task1, *task2;
372	unsigned long i, j;
373
374	for (i = 0; i < nr_tasks; i++) {
375		task1 = tasks[i];
376		j = i + 1;
377		if (j == nr_tasks)
378			j = 0;
379		task2 = tasks[j];
380		add_sched_event_wakeup(task1, 0, task2);
381	}
382}
383
384static void
385process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
386{
387	int ret = 0;
388
389	switch (atom->type) {
390		case SCHED_EVENT_RUN:
391			burn_nsecs(atom->duration);
392			break;
393		case SCHED_EVENT_SLEEP:
394			if (atom->wait_sem)
395				ret = sem_wait(atom->wait_sem);
396			BUG_ON(ret);
397			break;
398		case SCHED_EVENT_WAKEUP:
399			if (atom->wait_sem)
400				ret = sem_post(atom->wait_sem);
401			BUG_ON(ret);
402			break;
403		case SCHED_EVENT_MIGRATION:
404			break;
405		default:
406			BUG_ON(1);
407	}
408}
409
410static u64 get_cpu_usage_nsec_parent(void)
411{
412	struct rusage ru;
413	u64 sum;
414	int err;
415
416	err = getrusage(RUSAGE_SELF, &ru);
417	BUG_ON(err);
418
419	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
420	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
421
422	return sum;
423}
424
425static int self_open_counters(void)
426{
427/* ANDROID_CHANGE_BEGIN */
428#ifndef __APPLE__
429	struct perf_event_attr attr;
430	int fd;
431
432	memset(&attr, 0, sizeof(attr));
433
434	attr.type = PERF_TYPE_SOFTWARE;
435	attr.config = PERF_COUNT_SW_TASK_CLOCK;
436
437	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
438
439	if (fd < 0)
440		die("Error: sys_perf_event_open() syscall returned"
441		    "with %d (%s)\n", fd, strerror(errno));
442	return fd;
443#else
444	die("Error: sys_perf_event_open() syscall not supported on host\n");
445#endif
446/* ANDROID_CHANGE_END */
447}
448
449static u64 get_cpu_usage_nsec_self(int fd)
450{
451	u64 runtime;
452	int ret;
453
454	ret = read(fd, &runtime, sizeof(runtime));
455	BUG_ON(ret != sizeof(runtime));
456
457	return runtime;
458}
459
460static void *thread_func(void *ctx)
461{
462/* ANDROID_CHANGE_BEGIN */
463#ifndef __APPLE__
464/* ANDROID_CHANGE_END */
465	struct task_desc *this_task = ctx;
466	u64 cpu_usage_0, cpu_usage_1;
467	unsigned long i, ret;
468	char comm2[22];
469	int fd;
470
471	sprintf(comm2, ":%s", this_task->comm);
472	prctl(PR_SET_NAME, comm2);
473	fd = self_open_counters();
474
475again:
476	ret = sem_post(&this_task->ready_for_work);
477	BUG_ON(ret);
478	ret = pthread_mutex_lock(&start_work_mutex);
479	BUG_ON(ret);
480	ret = pthread_mutex_unlock(&start_work_mutex);
481	BUG_ON(ret);
482
483	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
484
485	for (i = 0; i < this_task->nr_events; i++) {
486		this_task->curr_event = i;
487		process_sched_event(this_task, this_task->atoms[i]);
488	}
489
490	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
491	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
492	ret = sem_post(&this_task->work_done_sem);
493	BUG_ON(ret);
494
495	ret = pthread_mutex_lock(&work_done_wait_mutex);
496	BUG_ON(ret);
497	ret = pthread_mutex_unlock(&work_done_wait_mutex);
498	BUG_ON(ret);
499
500	goto again;
501/* ANDROID_CHANGE_BEGIN */
502#endif // __APPLE__
503/* ANDROID_CHANGE_END */
504        /* ANDROID_CHANGE_BEGIN */
505#if defined(__BIONIC__) || defined(__APPLE__)
506        return NULL;
507#endif
508        /* ANDROID_CHANGE_END */
509}
510
511static void create_tasks(void)
512{
513	struct task_desc *task;
514	pthread_attr_t attr;
515	unsigned long i;
516	int err;
517
518	err = pthread_attr_init(&attr);
519	BUG_ON(err);
520        /* ANDROID_CHANGE_BEGIN */
521#if 0
522	err = pthread_attr_setstacksize(&attr,
523			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
524#else
525	err = pthread_attr_setstacksize(&attr,
526			(size_t) max((unsigned) 16 * 1024, (unsigned) PTHREAD_STACK_MIN));
527#endif
528        /* ANDROID_CHANGE_END */
529	BUG_ON(err);
530	err = pthread_mutex_lock(&start_work_mutex);
531	BUG_ON(err);
532	err = pthread_mutex_lock(&work_done_wait_mutex);
533	BUG_ON(err);
534	for (i = 0; i < nr_tasks; i++) {
535		task = tasks[i];
536		sem_init(&task->sleep_sem, 0, 0);
537		sem_init(&task->ready_for_work, 0, 0);
538		sem_init(&task->work_done_sem, 0, 0);
539		task->curr_event = 0;
540		err = pthread_create(&task->thread, &attr, thread_func, task);
541		BUG_ON(err);
542	}
543}
544
/*
 * Drive one replay round from the parent side and collect its CPU
 * usage figures.
 *
 * The sequence is order-sensitive: release the "work done" gate so
 * threads can loop back, wait until every thread reports ready,
 * re-take the "work done" gate, release the "start" gate so the
 * threads replay their atoms, wait for all of them to finish, then
 * re-take the "start" gate for the next round.
 */
static void wait_for_tasks(void)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	start_time = get_nsecs();
	cpu_usage = 0;
	/* let threads parked after the previous round loop back to "ready" */
	pthread_mutex_unlock(&work_done_wait_mutex);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		/* reset the semaphore for the next round */
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	/* open the start gate: threads begin replaying their atoms */
	pthread_mutex_unlock(&start_work_mutex);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		/* sum the per-thread task-clock deltas of this round */
		cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	/* seed the running average with the first sample, then weight 9:1 */
	if (!runavg_cpu_usage)
		runavg_cpu_usage = cpu_usage;
	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;

	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!runavg_parent_cpu_usage)
		runavg_parent_cpu_usage = parent_cpu_usage;
	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
				   parent_cpu_usage)/10;

	/* close the start gate again before the next round */
	ret = pthread_mutex_lock(&start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < nr_tasks; i++) {
		task = tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}
597
598static void run_one_test(void)
599{
600	u64 T0, T1, delta, avg_delta, fluct;
601
602	T0 = get_nsecs();
603	wait_for_tasks();
604	T1 = get_nsecs();
605
606	delta = T1 - T0;
607	sum_runtime += delta;
608	nr_runs++;
609
610	avg_delta = sum_runtime / nr_runs;
611	if (delta < avg_delta)
612		fluct = avg_delta - delta;
613	else
614		fluct = delta - avg_delta;
615	sum_fluct += fluct;
616	if (!run_avg)
617		run_avg = delta;
618	run_avg = (run_avg*9 + delta)/10;
619
620	printf("#%-3ld: %0.3f, ",
621		nr_runs, (double)delta/1000000.0);
622
623	printf("ravg: %0.2f, ",
624		(double)run_avg/1e6);
625
626	printf("cpu: %0.2f / %0.2f",
627		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
628
629#if 0
630	/*
631	 * rusage statistics done by the parent, these are less
632	 * accurate than the sum_exec_runtime based statistics:
633	 */
634	printf(" [%0.2f / %0.2f]",
635		(double)parent_cpu_usage/1e6,
636		(double)runavg_parent_cpu_usage/1e6);
637#endif
638
639	printf("\n");
640
641	if (nr_sleep_corrections)
642		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
643	nr_sleep_corrections = 0;
644}
645
646static void test_calibrations(void)
647{
648	u64 T0, T1;
649
650	T0 = get_nsecs();
651	burn_nsecs(1e6);
652	T1 = get_nsecs();
653
654	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
655
656	T0 = get_nsecs();
657	sleep_nsecs(1e6);
658	T1 = get_nsecs();
659
660	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
661}
662
/*
 * Set ptr.field to the value of the same-named field of a raw trace
 * event, cast to the member's type. The field name is stringified, so
 * the struct member must be named like the event field.
 */
#define FILL_FIELD(ptr, field, event, data)	\
	ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)

/*
 * Copy sizeof(ptr.array) bytes from the raw event's same-named array
 * field into ptr.array.
 */
#define FILL_ARRAY(ptr, array, event, data)			\
do {								\
	void *__array = raw_field_ptr(event, #array, data);	\
	memcpy(ptr.array, __array, sizeof(ptr.array));	\
} while(0)

/* Fill the common_* members shared by all trace_*_event structs. */
#define FILL_COMMON_FIELDS(ptr, event, data)			\
do {								\
	FILL_FIELD(ptr, common_type, event, data);		\
	FILL_FIELD(ptr, common_flags, event, data);		\
	FILL_FIELD(ptr, common_preempt_count, event, data);	\
	FILL_FIELD(ptr, common_pid, event, data);		\
	FILL_FIELD(ptr, common_tgid, event, data);		\
} while (0)
680
681
682
/* Decoded context-switch event handed to the switch_event handlers. */
struct trace_switch_event {
	u32 size;

	/* fields common to every trace event */
	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	/* task being switched out */
	char prev_comm[16];
	u32 prev_pid;
	u32 prev_prio;
	u64 prev_state;
	/* task being switched in */
	char next_comm[16];
	u32 next_pid;
	u32 next_prio;
};
700
/* Decoded runtime-accounting event handed to the runtime_event handler. */
struct trace_runtime_event {
	u32 size;

	/* fields common to every trace event */
	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char comm[16];
	u32 pid;
	u64 runtime;	/* added to the thread's runtime by latency_runtime_event */
	u64 vruntime;
};
715
/* Decoded wakeup event handed to the wakeup_event handlers. */
struct trace_wakeup_event {
	u32 size;

	/* fields common to every trace event; common_pid is treated as the waker */
	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	/* the task being woken */
	char comm[16];
	u32 pid;

	u32 prio;
	u32 success;	/* latency_wakeup_event ignores events where this is 0 */
	u32 cpu;
};
732
/* Decoded fork event handed to the fork_event handlers. */
struct trace_fork_event {
	u32 size;

	/* fields common to every trace event */
	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char parent_comm[16];
	u32 parent_pid;
	char child_comm[16];
	u32 child_pid;
};
747
/* Decoded task-migration event handed to the migrate_task_event handler. */
struct trace_migrate_task_event {
	u32 size;

	/* fields common to every trace event */
	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	/* the task being migrated */
	char comm[16];
	u32 pid;

	u32 prio;
	u32 cpu;
};
763
/*
 * Set of per-event callbacks implementing one analysis mode (see
 * replay_ops and lat_ops). A mode may leave callbacks it does not
 * need unset. Note that fork_event, unlike the others, takes no
 * perf_session argument.
 */
struct trace_sched_handler {
	void (*switch_event)(struct trace_switch_event *,
			     struct perf_session *,
			     struct event *,
			     int cpu,
			     u64 timestamp,
			     struct thread *thread);

	void (*runtime_event)(struct trace_runtime_event *,
			      struct perf_session *,
			      struct event *,
			      int cpu,
			      u64 timestamp,
			      struct thread *thread);

	void (*wakeup_event)(struct trace_wakeup_event *,
			     struct perf_session *,
			     struct event *,
			     int cpu,
			     u64 timestamp,
			     struct thread *thread);

	void (*fork_event)(struct trace_fork_event *,
			   struct event *,
			   int cpu,
			   u64 timestamp,
			   struct thread *thread);

	void (*migrate_task_event)(struct trace_migrate_task_event *,
			   struct perf_session *session,
			   struct event *,
			   int cpu,
			   u64 timestamp,
			   struct thread *thread);
};
799
800
801static void
802replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
803		    struct perf_session *session __used,
804		    struct event *event,
805		    int cpu __used,
806		    u64 timestamp __used,
807		    struct thread *thread __used)
808{
809	struct task_desc *waker, *wakee;
810
811	if (verbose) {
812		printf("sched_wakeup event %p\n", event);
813
814		printf(" ... pid %d woke up %s/%d\n",
815			wakeup_event->common_pid,
816			wakeup_event->comm,
817			wakeup_event->pid);
818	}
819
820	waker = register_pid(wakeup_event->common_pid, "<unknown>");
821	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
822
823	add_sched_event_wakeup(waker, timestamp, wakee);
824}
825
/* Timestamp of the last context switch seen on each CPU (0 = none yet). */
static u64 cpu_last_switched[MAX_CPUS];
827
828static void
829replay_switch_event(struct trace_switch_event *switch_event,
830		    struct perf_session *session __used,
831		    struct event *event,
832		    int cpu,
833		    u64 timestamp,
834		    struct thread *thread __used)
835{
836	struct task_desc *prev, __used *next;
837	u64 timestamp0;
838	s64 delta;
839
840	if (verbose)
841		printf("sched_switch event %p\n", event);
842
843	if (cpu >= MAX_CPUS || cpu < 0)
844		return;
845
846	timestamp0 = cpu_last_switched[cpu];
847	if (timestamp0)
848		delta = timestamp - timestamp0;
849	else
850		delta = 0;
851
852	if (delta < 0)
853		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
854
855	if (verbose) {
856		printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
857			switch_event->prev_comm, switch_event->prev_pid,
858			switch_event->next_comm, switch_event->next_pid,
859			delta);
860	}
861
862	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
863	next = register_pid(switch_event->next_pid, switch_event->next_comm);
864
865	cpu_last_switched[cpu] = timestamp;
866
867	add_sched_event_run(prev, timestamp, delta);
868	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
869}
870
871
872static void
873replay_fork_event(struct trace_fork_event *fork_event,
874		  struct event *event,
875		  int cpu __used,
876		  u64 timestamp __used,
877		  struct thread *thread __used)
878{
879	if (verbose) {
880		printf("sched_fork event %p\n", event);
881		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
882		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
883	}
884	register_pid(fork_event->parent_pid, fork_event->parent_comm);
885	register_pid(fork_event->child_pid, fork_event->child_comm);
886}
887
/* Handlers for replay mode; runtime and migration events have no handler. */
static struct trace_sched_handler replay_ops  = {
	.wakeup_event		= replay_wakeup_event,
	.switch_event		= replay_switch_event,
	.fork_event		= replay_fork_event,
};
893
/* One sort key: its name, its comparator and a link into a list of keys. */
struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

/* Key list passed by the latency handlers when searching/inserting in atom_root. */
static LIST_HEAD(cmp_pid);
901
902static int
903thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
904{
905	struct sort_dimension *sort;
906	int ret = 0;
907
908	BUG_ON(list_empty(list));
909
910	list_for_each_entry(sort, list, list) {
911		ret = sort->cmp(l, r);
912		if (ret)
913			return ret;
914	}
915
916	return ret;
917}
918
919static struct work_atoms *
920thread_atoms_search(struct rb_root *root, struct thread *thread,
921			 struct list_head *sort_list)
922{
923	struct rb_node *node = root->rb_node;
924	struct work_atoms key = { .thread = thread };
925
926	while (node) {
927		struct work_atoms *atoms;
928		int cmp;
929
930		atoms = container_of(node, struct work_atoms, node);
931
932		cmp = thread_lat_cmp(sort_list, &key, atoms);
933		if (cmp > 0)
934			node = node->rb_left;
935		else if (cmp < 0)
936			node = node->rb_right;
937		else {
938			BUG_ON(thread != atoms->thread);
939			return atoms;
940		}
941	}
942	return NULL;
943}
944
945static void
946__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
947			 struct list_head *sort_list)
948{
949	struct rb_node **new = &(root->rb_node), *parent = NULL;
950
951	while (*new) {
952		struct work_atoms *this;
953		int cmp;
954
955		this = container_of(*new, struct work_atoms, node);
956		parent = *new;
957
958		cmp = thread_lat_cmp(sort_list, data, this);
959
960		if (cmp > 0)
961			new = &((*new)->rb_left);
962		else
963			new = &((*new)->rb_right);
964	}
965
966	rb_link_node(&data->node, parent, new);
967	rb_insert_color(&data->node, root);
968}
969
970static void thread_atoms_insert(struct thread *thread)
971{
972	struct work_atoms *atoms = zalloc(sizeof(*atoms));
973	if (!atoms)
974		die("No memory");
975
976	atoms->thread = thread;
977	INIT_LIST_HEAD(&atoms->work_list);
978	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
979}
980
981static void
982latency_fork_event(struct trace_fork_event *fork_event __used,
983		   struct event *event __used,
984		   int cpu __used,
985		   u64 timestamp __used,
986		   struct thread *thread __used)
987{
988	/* should insert the newcomer */
989}
990
991__used
992static char sched_out_state(struct trace_switch_event *switch_event)
993{
994	const char *str = TASK_STATE_TO_CHAR_STR;
995
996	return str[switch_event->prev_state];
997}
998
999static void
1000add_sched_out_event(struct work_atoms *atoms,
1001		    char run_state,
1002		    u64 timestamp)
1003{
1004	struct work_atom *atom = zalloc(sizeof(*atom));
1005	if (!atom)
1006		die("Non memory");
1007
1008	atom->sched_out_time = timestamp;
1009
1010	if (run_state == 'R') {
1011		atom->state = THREAD_WAIT_CPU;
1012		atom->wake_up_time = atom->sched_out_time;
1013	}
1014
1015	list_add_tail(&atom->list, &atoms->work_list);
1016}
1017
1018static void
1019add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
1020{
1021	struct work_atom *atom;
1022
1023	BUG_ON(list_empty(&atoms->work_list));
1024
1025	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1026
1027	atom->runtime += delta;
1028	atoms->total_runtime += delta;
1029}
1030
1031static void
1032add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1033{
1034	struct work_atom *atom;
1035	u64 delta;
1036
1037	if (list_empty(&atoms->work_list))
1038		return;
1039
1040	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1041
1042	if (atom->state != THREAD_WAIT_CPU)
1043		return;
1044
1045	if (timestamp < atom->wake_up_time) {
1046		atom->state = THREAD_IGNORE;
1047		return;
1048	}
1049
1050	atom->state = THREAD_SCHED_IN;
1051	atom->sched_in_time = timestamp;
1052
1053	delta = atom->sched_in_time - atom->wake_up_time;
1054	atoms->total_lat += delta;
1055	if (delta > atoms->max_lat) {
1056		atoms->max_lat = delta;
1057		atoms->max_lat_at = timestamp;
1058	}
1059	atoms->nb_atoms++;
1060}
1061
1062static void
1063latency_switch_event(struct trace_switch_event *switch_event,
1064		     struct perf_session *session,
1065		     struct event *event __used,
1066		     int cpu,
1067		     u64 timestamp,
1068		     struct thread *thread __used)
1069{
1070	struct work_atoms *out_events, *in_events;
1071	struct thread *sched_out, *sched_in;
1072	u64 timestamp0;
1073	s64 delta;
1074
1075	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1076
1077	timestamp0 = cpu_last_switched[cpu];
1078	cpu_last_switched[cpu] = timestamp;
1079	if (timestamp0)
1080		delta = timestamp - timestamp0;
1081	else
1082		delta = 0;
1083
1084	if (delta < 0)
1085		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1086
1087
1088	sched_out = perf_session__findnew(session, switch_event->prev_pid);
1089	sched_in = perf_session__findnew(session, switch_event->next_pid);
1090
1091	out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1092	if (!out_events) {
1093		thread_atoms_insert(sched_out);
1094		out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1095		if (!out_events)
1096			die("out-event: Internal tree error");
1097	}
1098	add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);
1099
1100	in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1101	if (!in_events) {
1102		thread_atoms_insert(sched_in);
1103		in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1104		if (!in_events)
1105			die("in-event: Internal tree error");
1106		/*
1107		 * Take came in we have not heard about yet,
1108		 * add in an initial atom in runnable state:
1109		 */
1110		add_sched_out_event(in_events, 'R', timestamp);
1111	}
1112	add_sched_in_event(in_events, timestamp);
1113}
1114
1115static void
1116latency_runtime_event(struct trace_runtime_event *runtime_event,
1117		     struct perf_session *session,
1118		     struct event *event __used,
1119		     int cpu,
1120		     u64 timestamp,
1121		     struct thread *this_thread __used)
1122{
1123	struct thread *thread = perf_session__findnew(session, runtime_event->pid);
1124	struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1125
1126	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1127	if (!atoms) {
1128		thread_atoms_insert(thread);
1129		atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1130		if (!atoms)
1131			die("in-event: Internal tree error");
1132		add_sched_out_event(atoms, 'R', timestamp);
1133	}
1134
1135	add_runtime_event(atoms, runtime_event->runtime, timestamp);
1136}
1137
1138static void
1139latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1140		     struct perf_session *session,
1141		     struct event *__event __used,
1142		     int cpu __used,
1143		     u64 timestamp,
1144		     struct thread *thread __used)
1145{
1146	struct work_atoms *atoms;
1147	struct work_atom *atom;
1148	struct thread *wakee;
1149
1150	/* Note for later, it may be interesting to observe the failing cases */
1151	if (!wakeup_event->success)
1152		return;
1153
1154	wakee = perf_session__findnew(session, wakeup_event->pid);
1155	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1156	if (!atoms) {
1157		thread_atoms_insert(wakee);
1158		atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1159		if (!atoms)
1160			die("wakeup-event: Internal tree error");
1161		add_sched_out_event(atoms, 'S', timestamp);
1162	}
1163
1164	BUG_ON(list_empty(&atoms->work_list));
1165
1166	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1167
1168	/*
1169	 * You WILL be missing events if you've recorded only
1170	 * one CPU, or are only looking at only one, so don't
1171	 * make useless noise.
1172	 */
1173	if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1174		nr_state_machine_bugs++;
1175
1176	nr_timestamps++;
1177	if (atom->sched_out_time > timestamp) {
1178		nr_unordered_timestamps++;
1179		return;
1180	}
1181
1182	atom->state = THREAD_WAIT_CPU;
1183	atom->wake_up_time = timestamp;
1184}
1185
1186static void
1187latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
1188		     struct perf_session *session,
1189		     struct event *__event __used,
1190		     int cpu __used,
1191		     u64 timestamp,
1192		     struct thread *thread __used)
1193{
1194	struct work_atoms *atoms;
1195	struct work_atom *atom;
1196	struct thread *migrant;
1197
1198	/*
1199	 * Only need to worry about migration when profiling one CPU.
1200	 */
1201	if (profile_cpu == -1)
1202		return;
1203
1204	migrant = perf_session__findnew(session, migrate_task_event->pid);
1205	atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1206	if (!atoms) {
1207		thread_atoms_insert(migrant);
1208		register_pid(migrant->pid, migrant->comm);
1209		atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1210		if (!atoms)
1211			die("migration-event: Internal tree error");
1212		add_sched_out_event(atoms, 'R', timestamp);
1213	}
1214
1215	BUG_ON(list_empty(&atoms->work_list));
1216
1217	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1218	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1219
1220	nr_timestamps++;
1221
1222	if (atom->sched_out_time > timestamp)
1223		nr_unordered_timestamps++;
1224}
1225
/* Handlers for latency mode; every event kind has a handler. */
static struct trace_sched_handler lat_ops  = {
	.wakeup_event		= latency_wakeup_event,
	.switch_event		= latency_switch_event,
	.runtime_event		= latency_runtime_event,
	.fork_event		= latency_fork_event,
	.migrate_task_event	= latency_migrate_task_event,
};
1233
1234static void output_lat_thread(struct work_atoms *work_list)
1235{
1236	int i;
1237	int ret;
1238	u64 avg;
1239
1240	if (!work_list->nb_atoms)
1241		return;
1242	/*
1243	 * Ignore idle threads:
1244	 */
1245	if (!strcmp(work_list->thread->comm, "swapper"))
1246		return;
1247
1248	all_runtime += work_list->total_runtime;
1249	all_count += work_list->nb_atoms;
1250
1251	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);
1252
1253	for (i = 0; i < 24 - ret; i++)
1254		printf(" ");
1255
1256	avg = work_list->total_lat / work_list->nb_atoms;
1257
1258	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
1259	      (double)work_list->total_runtime / 1e6,
1260		 work_list->nb_atoms, (double)avg / 1e6,
1261		 (double)work_list->max_lat / 1e6,
1262		 (double)work_list->max_lat_at / 1e9);
1263}
1264
1265static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1266{
1267	if (l->thread->pid < r->thread->pid)
1268		return -1;
1269	if (l->thread->pid > r->thread->pid)
1270		return 1;
1271
1272	return 0;
1273}
1274
1275static struct sort_dimension pid_sort_dimension = {
1276	.name			= "pid",
1277	.cmp			= pid_cmp,
1278};
1279
1280static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1281{
1282	u64 avgl, avgr;
1283
1284	if (!l->nb_atoms)
1285		return -1;
1286
1287	if (!r->nb_atoms)
1288		return 1;
1289
1290	avgl = l->total_lat / l->nb_atoms;
1291	avgr = r->total_lat / r->nb_atoms;
1292
1293	if (avgl < avgr)
1294		return -1;
1295	if (avgl > avgr)
1296		return 1;
1297
1298	return 0;
1299}
1300
1301static struct sort_dimension avg_sort_dimension = {
1302	.name			= "avg",
1303	.cmp			= avg_cmp,
1304};
1305
1306static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1307{
1308	if (l->max_lat < r->max_lat)
1309		return -1;
1310	if (l->max_lat > r->max_lat)
1311		return 1;
1312
1313	return 0;
1314}
1315
1316static struct sort_dimension max_sort_dimension = {
1317	.name			= "max",
1318	.cmp			= max_cmp,
1319};
1320
1321static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1322{
1323	if (l->nb_atoms < r->nb_atoms)
1324		return -1;
1325	if (l->nb_atoms > r->nb_atoms)
1326		return 1;
1327
1328	return 0;
1329}
1330
1331static struct sort_dimension switch_sort_dimension = {
1332	.name			= "switch",
1333	.cmp			= switch_cmp,
1334};
1335
1336static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1337{
1338	if (l->total_runtime < r->total_runtime)
1339		return -1;
1340	if (l->total_runtime > r->total_runtime)
1341		return 1;
1342
1343	return 0;
1344}
1345
1346static struct sort_dimension runtime_sort_dimension = {
1347	.name			= "runtime",
1348	.cmp			= runtime_cmp,
1349};
1350
1351static struct sort_dimension *available_sorts[] = {
1352	&pid_sort_dimension,
1353	&avg_sort_dimension,
1354	&max_sort_dimension,
1355	&switch_sort_dimension,
1356	&runtime_sort_dimension,
1357};
1358
1359#define NB_AVAILABLE_SORTS	(int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))
1360
1361static LIST_HEAD(sort_list);
1362
1363static int sort_dimension__add(const char *tok, struct list_head *list)
1364{
1365	int i;
1366
1367	for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
1368		if (!strcmp(available_sorts[i]->name, tok)) {
1369			list_add_tail(&available_sorts[i]->list, list);
1370
1371			return 0;
1372		}
1373	}
1374
1375	return -1;
1376}
1377
1378static void setup_sorting(void);
1379
1380static void sort_lat(void)
1381{
1382	struct rb_node *node;
1383
1384	for (;;) {
1385		struct work_atoms *data;
1386		node = rb_first(&atom_root);
1387		if (!node)
1388			break;
1389
1390		rb_erase(node, &atom_root);
1391		data = rb_entry(node, struct work_atoms, node);
1392		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
1393	}
1394}
1395
1396static struct trace_sched_handler *trace_handler;
1397
1398static void
1399process_sched_wakeup_event(void *data, struct perf_session *session,
1400			   struct event *event,
1401			   int cpu __used,
1402			   u64 timestamp __used,
1403			   struct thread *thread __used)
1404{
1405	struct trace_wakeup_event wakeup_event;
1406
1407	FILL_COMMON_FIELDS(wakeup_event, event, data);
1408
1409	FILL_ARRAY(wakeup_event, comm, event, data);
1410	FILL_FIELD(wakeup_event, pid, event, data);
1411	FILL_FIELD(wakeup_event, prio, event, data);
1412	FILL_FIELD(wakeup_event, success, event, data);
1413	FILL_FIELD(wakeup_event, cpu, event, data);
1414
1415	if (trace_handler->wakeup_event)
1416		trace_handler->wakeup_event(&wakeup_event, session, event,
1417					    cpu, timestamp, thread);
1418}
1419
1420/*
1421 * Track the current task - that way we can know whether there's any
1422 * weird events, such as a task being switched away that is not current.
1423 */
1424static int max_cpu;
1425
1426static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
1427
1428static struct thread *curr_thread[MAX_CPUS];
1429
1430static char next_shortname1 = 'A';
1431static char next_shortname2 = '0';
1432
1433static void
1434map_switch_event(struct trace_switch_event *switch_event,
1435		 struct perf_session *session,
1436		 struct event *event __used,
1437		 int this_cpu,
1438		 u64 timestamp,
1439		 struct thread *thread __used)
1440{
1441	struct thread *sched_out __used, *sched_in;
1442	int new_shortname;
1443	u64 timestamp0;
1444	s64 delta;
1445	int cpu;
1446
1447	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1448
1449	if (this_cpu > max_cpu)
1450		max_cpu = this_cpu;
1451
1452	timestamp0 = cpu_last_switched[this_cpu];
1453	cpu_last_switched[this_cpu] = timestamp;
1454	if (timestamp0)
1455		delta = timestamp - timestamp0;
1456	else
1457		delta = 0;
1458
1459	if (delta < 0)
1460		die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1461
1462
1463	sched_out = perf_session__findnew(session, switch_event->prev_pid);
1464	sched_in = perf_session__findnew(session, switch_event->next_pid);
1465
1466	curr_thread[this_cpu] = sched_in;
1467
1468	printf("  ");
1469
1470	new_shortname = 0;
1471	if (!sched_in->shortname[0]) {
1472		sched_in->shortname[0] = next_shortname1;
1473		sched_in->shortname[1] = next_shortname2;
1474
1475		if (next_shortname1 < 'Z') {
1476			next_shortname1++;
1477		} else {
1478			next_shortname1='A';
1479			if (next_shortname2 < '9') {
1480				next_shortname2++;
1481			} else {
1482				next_shortname2='0';
1483			}
1484		}
1485		new_shortname = 1;
1486	}
1487
1488	for (cpu = 0; cpu <= max_cpu; cpu++) {
1489		if (cpu != this_cpu)
1490			printf(" ");
1491		else
1492			printf("*");
1493
1494		if (curr_thread[cpu]) {
1495			if (curr_thread[cpu]->pid)
1496				printf("%2s ", curr_thread[cpu]->shortname);
1497			else
1498				printf(".  ");
1499		} else
1500			printf("   ");
1501	}
1502
1503	printf("  %12.6f secs ", (double)timestamp/1e9);
1504	if (new_shortname) {
1505		printf("%s => %s:%d\n",
1506			sched_in->shortname, sched_in->comm, sched_in->pid);
1507	} else {
1508		printf("\n");
1509	}
1510}
1511
1512
1513static void
1514process_sched_switch_event(void *data, struct perf_session *session,
1515			   struct event *event,
1516			   int this_cpu,
1517			   u64 timestamp __used,
1518			   struct thread *thread __used)
1519{
1520	struct trace_switch_event switch_event;
1521
1522	FILL_COMMON_FIELDS(switch_event, event, data);
1523
1524	FILL_ARRAY(switch_event, prev_comm, event, data);
1525	FILL_FIELD(switch_event, prev_pid, event, data);
1526	FILL_FIELD(switch_event, prev_prio, event, data);
1527	FILL_FIELD(switch_event, prev_state, event, data);
1528	FILL_ARRAY(switch_event, next_comm, event, data);
1529	FILL_FIELD(switch_event, next_pid, event, data);
1530	FILL_FIELD(switch_event, next_prio, event, data);
1531
1532	if (curr_pid[this_cpu] != (u32)-1) {
1533		/*
1534		 * Are we trying to switch away a PID that is
1535		 * not current?
1536		 */
1537		if (curr_pid[this_cpu] != switch_event.prev_pid)
1538			nr_context_switch_bugs++;
1539	}
1540	if (trace_handler->switch_event)
1541		trace_handler->switch_event(&switch_event, session, event,
1542					    this_cpu, timestamp, thread);
1543
1544	curr_pid[this_cpu] = switch_event.next_pid;
1545}
1546
1547static void
1548process_sched_runtime_event(void *data, struct perf_session *session,
1549			   struct event *event,
1550			   int cpu __used,
1551			   u64 timestamp __used,
1552			   struct thread *thread __used)
1553{
1554	struct trace_runtime_event runtime_event;
1555
1556	FILL_ARRAY(runtime_event, comm, event, data);
1557	FILL_FIELD(runtime_event, pid, event, data);
1558	FILL_FIELD(runtime_event, runtime, event, data);
1559	FILL_FIELD(runtime_event, vruntime, event, data);
1560
1561	if (trace_handler->runtime_event)
1562		trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread);
1563}
1564
1565static void
1566process_sched_fork_event(void *data,
1567			 struct event *event,
1568			 int cpu __used,
1569			 u64 timestamp __used,
1570			 struct thread *thread __used)
1571{
1572	struct trace_fork_event fork_event;
1573
1574	FILL_COMMON_FIELDS(fork_event, event, data);
1575
1576	FILL_ARRAY(fork_event, parent_comm, event, data);
1577	FILL_FIELD(fork_event, parent_pid, event, data);
1578	FILL_ARRAY(fork_event, child_comm, event, data);
1579	FILL_FIELD(fork_event, child_pid, event, data);
1580
1581	if (trace_handler->fork_event)
1582		trace_handler->fork_event(&fork_event, event,
1583					  cpu, timestamp, thread);
1584}
1585
1586static void
1587process_sched_exit_event(struct event *event,
1588			 int cpu __used,
1589			 u64 timestamp __used,
1590			 struct thread *thread __used)
1591{
1592	if (verbose)
1593		printf("sched_exit event %p\n", event);
1594}
1595
1596static void
1597process_sched_migrate_task_event(void *data, struct perf_session *session,
1598			   struct event *event,
1599			   int cpu __used,
1600			   u64 timestamp __used,
1601			   struct thread *thread __used)
1602{
1603	struct trace_migrate_task_event migrate_task_event;
1604
1605	FILL_COMMON_FIELDS(migrate_task_event, event, data);
1606
1607	FILL_ARRAY(migrate_task_event, comm, event, data);
1608	FILL_FIELD(migrate_task_event, pid, event, data);
1609	FILL_FIELD(migrate_task_event, prio, event, data);
1610	FILL_FIELD(migrate_task_event, cpu, event, data);
1611
1612	if (trace_handler->migrate_task_event)
1613		trace_handler->migrate_task_event(&migrate_task_event, session,
1614						 event, cpu, timestamp, thread);
1615}
1616
1617static void process_raw_event(union perf_event *raw_event __used,
1618			      struct perf_session *session, void *data, int cpu,
1619			      u64 timestamp, struct thread *thread)
1620{
1621	struct event *event;
1622	int type;
1623
1624
1625	type = trace_parse_common_type(data);
1626	event = trace_find_event(type);
1627
1628	if (!strcmp(event->name, "sched_switch"))
1629		process_sched_switch_event(data, session, event, cpu, timestamp, thread);
1630	if (!strcmp(event->name, "sched_stat_runtime"))
1631		process_sched_runtime_event(data, session, event, cpu, timestamp, thread);
1632	if (!strcmp(event->name, "sched_wakeup"))
1633		process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
1634	if (!strcmp(event->name, "sched_wakeup_new"))
1635		process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
1636	if (!strcmp(event->name, "sched_process_fork"))
1637		process_sched_fork_event(data, event, cpu, timestamp, thread);
1638	if (!strcmp(event->name, "sched_process_exit"))
1639		process_sched_exit_event(event, cpu, timestamp, thread);
1640	if (!strcmp(event->name, "sched_migrate_task"))
1641		process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
1642}
1643
1644static int process_sample_event(union perf_event *event,
1645				struct perf_sample *sample,
1646				struct perf_evsel *evsel __used,
1647				struct perf_session *session)
1648{
1649	struct thread *thread;
1650
1651	if (!(session->sample_type & PERF_SAMPLE_RAW))
1652		return 0;
1653
1654	thread = perf_session__findnew(session, sample->pid);
1655	if (thread == NULL) {
1656		pr_debug("problem processing %d event, skipping it.\n",
1657			 event->header.type);
1658		return -1;
1659	}
1660
1661	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
1662
1663	if (profile_cpu != -1 && profile_cpu != (int)sample->cpu)
1664		return 0;
1665
1666	process_raw_event(event, session, sample->raw_data, sample->cpu,
1667			  sample->time, thread);
1668
1669	return 0;
1670}
1671
1672static struct perf_event_ops event_ops = {
1673	.sample			= process_sample_event,
1674	.comm			= perf_event__process_comm,
1675	.lost			= perf_event__process_lost,
1676	.fork			= perf_event__process_task,
1677	.ordered_samples	= true,
1678};
1679
1680static int read_events(void)
1681{
1682	int err = -EINVAL;
1683	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
1684							 0, false, &event_ops);
1685	if (session == NULL)
1686		return -ENOMEM;
1687
1688	if (perf_session__has_traces(session, "record -R")) {
1689		err = perf_session__process_events(session, &event_ops);
1690		nr_events      = session->hists.stats.nr_events[0];
1691		nr_lost_events = session->hists.stats.total_lost;
1692		nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
1693	}
1694
1695	perf_session__delete(session);
1696	return err;
1697}
1698
1699static void print_bad_events(void)
1700{
1701	if (nr_unordered_timestamps && nr_timestamps) {
1702		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1703			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
1704			nr_unordered_timestamps, nr_timestamps);
1705	}
1706	if (nr_lost_events && nr_events) {
1707		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1708			(double)nr_lost_events/(double)nr_events*100.0,
1709			nr_lost_events, nr_events, nr_lost_chunks);
1710	}
1711	if (nr_state_machine_bugs && nr_timestamps) {
1712		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
1713			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
1714			nr_state_machine_bugs, nr_timestamps);
1715		if (nr_lost_events)
1716			printf(" (due to lost events?)");
1717		printf("\n");
1718	}
1719	if (nr_context_switch_bugs && nr_timestamps) {
1720		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1721			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
1722			nr_context_switch_bugs, nr_timestamps);
1723		if (nr_lost_events)
1724			printf(" (due to lost events?)");
1725		printf("\n");
1726	}
1727}
1728
1729static void __cmd_lat(void)
1730{
1731	struct rb_node *next;
1732
1733	setup_pager();
1734	read_events();
1735	sort_lat();
1736
1737	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
1738	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
1739	printf(" ---------------------------------------------------------------------------------------------------------------\n");
1740
1741	next = rb_first(&sorted_atom_root);
1742
1743	while (next) {
1744		struct work_atoms *work_list;
1745
1746		work_list = rb_entry(next, struct work_atoms, node);
1747		output_lat_thread(work_list);
1748		next = rb_next(next);
1749	}
1750
1751	printf(" -----------------------------------------------------------------------------------------\n");
1752	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1753		(double)all_runtime/1e6, all_count);
1754
1755	printf(" ---------------------------------------------------\n");
1756
1757	print_bad_events();
1758	printf("\n");
1759
1760}
1761
1762static struct trace_sched_handler map_ops  = {
1763	.wakeup_event		= NULL,
1764	.switch_event		= map_switch_event,
1765	.runtime_event		= NULL,
1766	.fork_event		= NULL,
1767};
1768
1769static void __cmd_map(void)
1770{
1771	max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1772
1773	setup_pager();
1774	read_events();
1775	print_bad_events();
1776}
1777
1778static void __cmd_replay(void)
1779{
1780	unsigned long i;
1781
1782	calibrate_run_measurement_overhead();
1783	calibrate_sleep_measurement_overhead();
1784
1785	test_calibrations();
1786
1787	read_events();
1788
1789	printf("nr_run_events:        %ld\n", nr_run_events);
1790	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
1791	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
1792
1793	if (targetless_wakeups)
1794		printf("target-less wakeups:  %ld\n", targetless_wakeups);
1795	if (multitarget_wakeups)
1796		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
1797	if (nr_run_events_optimized)
1798		printf("run atoms optimized: %ld\n",
1799			nr_run_events_optimized);
1800
1801	print_task_traces();
1802	add_cross_task_wakeups();
1803
1804	create_tasks();
1805	printf("------------------------------------------------------------\n");
1806	for (i = 0; i < replay_repeat; i++)
1807		run_one_test();
1808}
1809
1810
/*
 * Top-level usage line.  It lists the sub-commands cmd_sched() actually
 * dispatches on: the alias for 'perf script' is spelled "script", there
 * is no "trace" sub-command.
 */
static const char * const sched_usage[] = {
	"perf sched [<options>] {record|latency|map|replay|script}",
	NULL
};
1815
1816static const struct option sched_options[] = {
1817	OPT_STRING('i', "input", &input_name, "file",
1818		    "input file name"),
1819	OPT_INCR('v', "verbose", &verbose,
1820		    "be more verbose (show symbol address, etc)"),
1821	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1822		    "dump raw trace in ASCII"),
1823	OPT_END()
1824};
1825
1826static const char * const latency_usage[] = {
1827	"perf sched latency [<options>]",
1828	NULL
1829};
1830
1831static const struct option latency_options[] = {
1832	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1833		   "sort by key(s): runtime, switch, avg, max"),
1834	OPT_INCR('v', "verbose", &verbose,
1835		    "be more verbose (show symbol address, etc)"),
1836	OPT_INTEGER('C', "CPU", &profile_cpu,
1837		    "CPU to profile on"),
1838	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1839		    "dump raw trace in ASCII"),
1840	OPT_END()
1841};
1842
1843static const char * const replay_usage[] = {
1844	"perf sched replay [<options>]",
1845	NULL
1846};
1847
1848static const struct option replay_options[] = {
1849	OPT_UINTEGER('r', "repeat", &replay_repeat,
1850		     "repeat the workload replay N times (-1: infinite)"),
1851	OPT_INCR('v', "verbose", &verbose,
1852		    "be more verbose (show symbol address, etc)"),
1853	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1854		    "dump raw trace in ASCII"),
1855	OPT_END()
1856};
1857
1858static void setup_sorting(void)
1859{
1860	char *tmp, *tok, *str = strdup(sort_order);
1861
1862	for (tok = strtok_r(str, ", ", &tmp);
1863			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1864		if (sort_dimension__add(tok, &sort_list) < 0) {
1865			error("Unknown --sort key: `%s'", tok);
1866			usage_with_options(latency_usage, latency_options);
1867		}
1868	}
1869
1870	free(str);
1871
1872	sort_dimension__add("pid", &cmp_pid);
1873}
1874
/*
 * Fixed argument vector that __cmd_record() puts in front of the user's
 * own arguments before calling cmd_record().
 */
static const char *record_args[] = {
	"record", "-a", "-R", "-f",
	"-m", "1024",
	"-c", "1",

	/* Scheduler tracepoints to record. */
	"-e", "sched:sched_switch",
	"-e", "sched:sched_stat_wait",
	"-e", "sched:sched_stat_sleep",
	"-e", "sched:sched_stat_iowait",
	"-e", "sched:sched_stat_runtime",
	"-e", "sched:sched_process_exit",
	"-e", "sched:sched_process_fork",
	"-e", "sched:sched_wakeup",
	"-e", "sched:sched_migrate_task",
};
1892
1893static int __cmd_record(int argc, const char **argv)
1894{
1895	unsigned int rec_argc, i, j;
1896	const char **rec_argv;
1897
1898	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1899	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1900
1901	if (rec_argv == NULL)
1902		return -ENOMEM;
1903
1904	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1905		rec_argv[i] = strdup(record_args[i]);
1906
1907	for (j = 1; j < (unsigned int)argc; j++, i++)
1908		rec_argv[i] = argv[j];
1909
1910	BUG_ON(i != rec_argc);
1911
1912	return cmd_record(i, rec_argv, NULL);
1913}
1914
1915int cmd_sched(int argc, const char **argv, const char *prefix __used)
1916{
1917	argc = parse_options(argc, argv, sched_options, sched_usage,
1918			     PARSE_OPT_STOP_AT_NON_OPTION);
1919	if (!argc)
1920		usage_with_options(sched_usage, sched_options);
1921
1922	/*
1923	 * Aliased to 'perf script' for now:
1924	 */
1925	if (!strcmp(argv[0], "script"))
1926		return cmd_script(argc, argv, prefix);
1927
1928	symbol__init();
1929	if (!strncmp(argv[0], "rec", 3)) {
1930		return __cmd_record(argc, argv);
1931	} else if (!strncmp(argv[0], "lat", 3)) {
1932		trace_handler = &lat_ops;
1933		if (argc > 1) {
1934			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1935			if (argc)
1936				usage_with_options(latency_usage, latency_options);
1937		}
1938		setup_sorting();
1939		__cmd_lat();
1940	} else if (!strcmp(argv[0], "map")) {
1941		trace_handler = &map_ops;
1942		setup_sorting();
1943		__cmd_map();
1944	} else if (!strncmp(argv[0], "rep", 3)) {
1945		trace_handler = &replay_ops;
1946		if (argc) {
1947			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1948			if (argc)
1949				usage_with_options(replay_usage, replay_options);
1950		}
1951		__cmd_replay();
1952	} else {
1953		usage_with_options(sched_usage, sched_options);
1954	}
1955
1956	return 0;
1957}
1958