#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			65536

struct sched_atom;

struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

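/*
 * One work_atom records a single scheduling interval of a thread:
 * when it was switched out, woken up and scheduled back in. A
 * thread's atoms are collected in a work_atoms node for the latency
 * report.
 */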
struct work_atom {
	struct list_head	list;
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

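/*
 * Per-subcommand tracepoint ops: cmd_sched() installs replay_ops,
 * lat_ops or map_ops here so the generic dispatchers below can stay
 * subcommand-agnostic.
 */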
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc *pid_to_task[MAX_PID];
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there are any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_state_machine_bugs;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root;
	struct list_head sort_list, cmp_pid;
};

static u64 get_nsecs(void)
{
#ifndef __APPLE__
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
#else
	pr_err("get_nsecs not supported on MacOS\n");
	return 0;
#endif
}

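/*
 * Spin until 'nsecs' have elapsed, stopping run_measurement_overhead
 * early so that the cost of the measurement loop itself does not
 * inflate replayed runtimes (see calibrate_run_measurement_overhead()).
 */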
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 1000000000;
	ts.tv_sec = nsecs / 1000000000;

	nanosleep(&ts, NULL);
}

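/*
 * Estimate the fixed cost of the run/sleep primitives: take the
 * minimum of ten trials, so scheduling noise can only bias the
 * estimate downwards, and subtract the 10 usecs actually requested
 * from the measured sleep overhead.
 */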
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

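/*
 * Wakeups are modeled with semaphores: the waker's WAKEUP atom posts
 * the wait_sem that the wakee's pending SLEEP atom waits on. Wakeups
 * with no sleeping target are counted as targetless, a second wakeup
 * aimed at the same sleep as multitarget.
 */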
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

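/*
 * Chain the tasks into a ring of wakeups (task i wakes task i + 1,
 * the last wakes the first) so that a task whose trace ends in a
 * sleep still gets a final wakeup and its replay thread cannot block
 * forever.
 */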
static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0)
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n", fd, strerror(errno));
	return fd;
}

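/*
 * Reading the task-clock counter opened above yields the calling
 * thread's accumulated CPU time in nanoseconds; per the comment in
 * run_one_test(), this is more accurate than the parent's
 * rusage-based numbers.
 */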
static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
};

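/*
 * Replay worker thread. The two mutexes act as start/stop barriers:
 * the parent holds start_work_mutex while each thread posts
 * ready_for_work and then blocks taking the mutex; releasing it
 * starts a run. work_done_wait_mutex parks the threads the same way
 * after they have posted work_done_sem.
 */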
static void *thread_func(void *ctx)
{
#ifndef __APPLE__
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd;

	free(parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	fd = self_open_counters();
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
#else
	pr_err("thread_func not supported on MacOS\n");
	return NULL;
#endif
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
					 sched->parent_cpu_usage)/10;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

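/*
 * Run one replay round and print running statistics. The averages are
 * exponential moving averages with a 9/10 decay,
 * avg = (avg * 9 + sample) / 10, seeded with the first sample so the
 * warm-up run does not drag them down.
 */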
static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * 9 + delta) / 10;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);

	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, 1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

static int
replay_wakeup_event(struct perf_sched *sched,
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
{
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
	}

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRId64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
{
	struct thread *child, *parent;

	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		return 0;
	}

	if (verbose) {
		printf("fork event\n");
		printf("... parent: %s/%d\n", parent->comm, parent->tid);
		printf("...  child: %s/%d\n", child->comm, child->tid);
	}

	register_pid(sched, parent->tid, parent->comm);
	register_pid(sched, child->tid, child->comm);
	return 0;
}

struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

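/*
 * Per-thread atom lists live in an rbtree whose ordering comes from a
 * list of sort keys: atom_root is kept ordered by 'cmp_pid', and the
 * report tree is later rebuilt with the keys from '--sort'.
 */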
static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			 struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread;
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

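/*
 * Note: prev_state is really a task state bitmask, so only the low
 * states map one-to-one onto this string. That is sufficient here:
 * the latency logic only distinguishes 'R' (runnable, prev_state 0)
 * from everything else.
 */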
static char sched_out_state(u64 prev_state)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[prev_state];
}

static int
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
	return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

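/*
 * Scheduling latency is the time a runnable atom spent waiting for a
 * CPU: sched_in_time - wake_up_time. An atom scheduled in before its
 * recorded wakeup is marked THREAD_IGNORE rather than counted with a
 * bogus negative delay.
 */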
static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, 0, prev_pid);
	sched_in = machine__findnew_thread(machine, 0, next_pid);

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
	if (!out_events) {
		if (thread_atoms_insert(sched, sched_out))
			return -1;
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			return -1;
		}
	}
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
		return -1;

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
	if (!in_events) {
		if (thread_atoms_insert(sched, sched_in))
			return -1;
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
		/*
		 * Task came in that we have not heard about yet,
		 * add an initial atom in runnable state:
		 */
		if (add_sched_out_event(in_events, 'R', timestamp))
			return -1;
	}
	add_sched_in_event(in_events, timestamp);

	return 0;
}

static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
{
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
	struct thread *thread = machine__findnew_thread(machine, 0, pid);
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
	u64 timestamp = sample->time;
	int cpu = sample->cpu;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		if (thread_atoms_insert(sched, thread))
			return -1;
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
		if (!atoms) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
	}

	add_runtime_event(atoms, runtime, timestamp);
	return 0;
}

static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid"),
		  success = perf_evsel__intval(evsel, sample, "success");
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;
	u64 timestamp = sample->time;

	/* Note for later, it may be interesting to observe the failing cases */
	if (!success)
		return 0;

	wakee = machine__findnew_thread(machine, 0, pid);
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, wakee))
			return -1;
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
		if (!atoms) {
			pr_err("wakeup-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			return -1;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at one, so don't
	 * make useless noise.
	 */
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		sched->nr_state_machine_bugs++;

	sched->nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		sched->nr_unordered_timestamps++;
		return 0;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
	return 0;
}

static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	u64 timestamp = sample->time;
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (sched->profile_cpu == -1)
		return 0;

	migrant = machine__findnew_thread(machine, 0, pid);
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, migrant))
			return -1;
		register_pid(sched, migrant->tid, migrant->comm);
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
		if (!atoms) {
			pr_err("migration-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	sched->nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		sched->nr_unordered_timestamps++;

	return 0;
}

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(work_list->thread->comm, "swapper"))
		return;

	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;

	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->tid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
	      (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread->tid < r->thread->tid)
		return -1;
	if (l->thread->tid > r->thread->tid)
		return 1;

	return 0;
}

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

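/*
 * Re-key the tree for the report: pop every node out of the
 * pid-ordered atom_root and reinsert it into sorted_atom_root
 * ordered by the user's sort keys.
 */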
static void perf_sched__sort_lat(struct perf_sched *sched)
{
	struct rb_node *node;

	for (;;) {
		struct work_atoms *data;
		node = rb_first(&sched->atom_root);
		if (!node)
			break;

		rb_erase(node, &sched->atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
	}
}

static int process_sched_wakeup_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);

	return 0;
}

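/*
 * 'map' view: every thread gets a two-character shortname (A0, B0,
 * ... Z0, A1, ...) and each switch prints one column per CPU, with
 * '*' marking the CPU that just switched.
 */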
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_out __maybe_unused, *sched_in;
	int new_shortname;
	u64 timestamp0, timestamp = sample->time;
	s64 delta;
	int cpu, this_cpu = sample->cpu;

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, 0, prev_pid);
	sched_in = machine__findnew_thread(machine, 0, next_pid);

	sched->curr_thread[this_cpu] = sched_in;

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		sched_in->shortname[0] = sched->next_shortname1;
		sched_in->shortname[1] = sched->next_shortname2;

		if (sched->next_shortname1 < 'Z') {
			sched->next_shortname1++;
		} else {
			sched->next_shortname1 = 'A';
			if (sched->next_shortname2 < '9') {
				sched->next_shortname2++;
			} else {
				sched->next_shortname2 = '0';
			}
		}
		new_shortname = 1;
	}

	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

		if (sched->curr_thread[cpu]) {
			if (sched->curr_thread[cpu]->tid)
				printf("%2s ", sched->curr_thread[cpu]->shortname);
			else
				printf(".  ");
		} else
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp / 1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
			sched_in->shortname, sched_in->comm, sched_in->tid);
	} else {
		printf("\n");
	}

	return 0;
}

static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int this_cpu = sample->cpu, err = 0;
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");

	if (sched->curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (sched->curr_pid[this_cpu] != prev_pid)
			sched->nr_context_switch_bugs++;
	}

	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);

	sched->curr_pid[this_cpu] = next_pid;
	return err;
}

static int process_sched_runtime_event(struct perf_tool *tool,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);

	return 0;
}

static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	/* run the fork event through the perf machinery */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
	if (sched->tp_handler->fork_event)
		return sched->tp_handler->fork_event(sched, event, machine);

	return 0;
}

static int process_sched_migrate_task_event(struct perf_tool *tool,
					    struct perf_evsel *evsel,
					    struct perf_sample *sample,
					    struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);

	return 0;
}

typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);

static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	int err = 0;

	evsel->hists.stats.total_period += sample->period;
	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);

	if (evsel->handler.func != NULL) {
		tracepoint_handler f = evsel->handler.func;
		err = f(tool, evsel, sample, machine);
	}

	return err;
}

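/*
 * Common event-reading path for all subcommands: open the input file,
 * wire the sched tracepoints to the handlers above and let the
 * session layer feed us the recorded events in order.
 */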
static int perf_sched__read_events(struct perf_sched *sched,
				   struct perf_session **psession)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session;

	session = perf_session__new(input_name, O_RDONLY, 0, false, &sched->tool);
	if (session == NULL) {
		pr_debug("No memory for session\n");
		return -1;
	}

	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;

	if (perf_session__has_traces(session, "record -R")) {
		int err = perf_session__process_events(session, &sched->tool);
		if (err) {
			pr_err("Failed to process events, error %d\n", err);
			goto out_delete;
		}

		sched->nr_events      = session->stats.nr_events[0];
		sched->nr_lost_events = session->stats.total_lost;
		sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
	}

	if (psession)
		*psession = session;
	else
		perf_session__delete(session);

	return 0;

out_delete:
	perf_session__delete(session);
	return -1;
}

static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
	}
	if (sched->nr_lost_events && sched->nr_events) {
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
	}
	if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
			(double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_state_machine_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static int perf_sched__lat(struct perf_sched *sched)
{
	struct rb_node *next;
	struct perf_session *session;

	setup_pager();

	/* save session -- references to threads are held in work_list */
	if (perf_sched__read_events(sched, &session))
		return -1;

	perf_sched__sort_lat(sched);

	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sched->sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(sched, work_list);
		next = rb_next(next);
	}

	printf(" -----------------------------------------------------------------------------------------\n");
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
		(double)sched->all_runtime / 1e6, sched->all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
	printf("\n");

	perf_session__delete(session);
	return 0;
}

static int perf_sched__map(struct perf_sched *sched)
{
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	setup_pager();
	if (perf_sched__read_events(sched, NULL))
		return -1;
	print_bad_events(sched);
	return 0;
}

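/*
 * 'replay' re-executes the recorded workload: calibrate the timing
 * primitives, convert the trace into per-task event lists, spawn one
 * thread per recorded task and run the whole schedule replay_repeat
 * times.
 */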
static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched, NULL))
		return -1;

	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
{
	char *tmp, *tok, *str = strdup(sched->sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
			error("Unknown --sort key: `%s'", tok);
			usage_with_options(usage_msg, options);
		}
	}

	free(str);

	sort_dimension__add("pid", &sched->cmp_pid);
}

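/*
 * 'perf sched record' is plain 'perf record' with the sched
 * tracepoint set appended; any extra command-line arguments are
 * passed through.
 */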
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

static const char default_sort_order[] = "avg, max, switch, runtime";
static struct perf_sched sched = {
	.tool = {
		.sample		 = perf_sched__process_tracepoint_sample,
		.comm		 = perf_event__process_comm,
		.lost		 = perf_event__process_lost,
		.fork		 = perf_sched__process_fork_event,
		.ordered_samples = true,
	},
	.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
	.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
	.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
	.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
	.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },
	.sort_order	      = default_sort_order,
	.replay_repeat	      = 10,
	.profile_cpu	      = -1,
	.next_shortname1      = 'A',
	.next_shortname2      = '0',
};

int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char * const sched_usage[] = {
		"perf sched [<options>] {record|latency|map|replay|script}",
		NULL
	};
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};

	argc = parse_options(argc, argv, sched_options, sched_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	symbol__init();
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		sched.tp_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {
		sched.tp_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}