/*
 * block queue tracing parse application
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <errno.h>
#include <signal.h>
#include <locale.h>
#include <libgen.h>

#include "blktrace.h"
#include "rbtree.h"
#include "jhash.h"

static char blkparse_version[] = "1.0.1";

struct skip_info {
	unsigned long start, end;
	struct skip_info *prev, *next;
};

struct per_dev_info {
	dev_t dev;
	char *name;

	int backwards;
	unsigned long long events;
	unsigned long long first_reported_time;
	unsigned long long last_reported_time;
	unsigned long long last_read_time;
	struct io_stats io_stats;
	unsigned long skips;
	unsigned long long seq_skips;
	unsigned int max_depth[2];
	unsigned int cur_depth[2];

	struct rb_root rb_track;

	int nfiles;
	int ncpus;

	unsigned long *cpu_map;
	unsigned int cpu_map_max;

	struct per_cpu_info *cpus;
};

/*
 * some duplicated effort here, we can unify this hash and the ppi hash later
 */
struct process_pid_map {
	pid_t pid;
	char comm[16];
	struct process_pid_map *hash_next, *list_next;
};

#define PPM_HASH_SHIFT	(8)
#define PPM_HASH_SIZE	(1 << PPM_HASH_SHIFT)
#define PPM_HASH_MASK	(PPM_HASH_SIZE - 1)
static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE];

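/*
 * Per-process stats are kept per direction: throughout this file,
 * two-element arrays are indexed 0 for reads and 1 for writes.
 */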
struct per_process_info {
	struct process_pid_map *ppm;
	struct io_stats io_stats;
	struct per_process_info *hash_next, *list_next;
	int more_than_one;

	/*
	 * individual io stats
	 */
	unsigned long long longest_allocation_wait[2];
	unsigned long long longest_dispatch_wait[2];
	unsigned long long longest_completion_wait[2];
};

#define PPI_HASH_SHIFT	(8)
#define PPI_HASH_SIZE	(1 << PPI_HASH_SHIFT)
#define PPI_HASH_MASK	(PPI_HASH_SIZE - 1)
static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
static struct per_process_info *ppi_list;
static int ppi_list_entries;

static struct option l_opts[] = {
	{
		.name = "act-mask",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'a'
	},
	{
		.name = "set-mask",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'A'
	},
	{
		.name = "batch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'b'
	},
	{
		.name = "input-directory",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'D'
	},
	{
		.name = "dump-binary",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'd'
	},
	{
		.name = "format",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'f'
	},
	{
		.name = "format-spec",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'F'
	},
	{
		.name = "hash-by-name",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'h'
	},
	{
		.name = "input",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'i'
	},
	{
		.name = "no-msgs",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'M'
	},
	{
		.name = "output",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'o'
	},
	{
		.name = "no-text-output",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'O'
	},
	{
		.name = "quiet",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'q'
	},
	{
		.name = "per-program-stats",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 's'
	},
	{
		.name = "track-ios",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 't'
	},
	{
		.name = "stopwatch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'w'
	},
	{
		.name = "verbose",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'v'
	},
	{
		.name = "version",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'V'
	},
	{
		.name = NULL,
	}
};

/*
 * for sorting the displayed output
 */
struct trace {
	struct blk_io_trace *bit;
	struct rb_node rb_node;
	struct trace *next;
	unsigned long read_sequence;
};

static struct rb_root rb_sort_root;
static unsigned long rb_sort_entries;

static struct trace *trace_list;

/*
 * allocation cache
 */
static struct blk_io_trace *bit_alloc_list;
static struct trace *t_alloc_list;

/*
 * for tracking individual ios
 */
struct io_track {
	struct rb_node rb_node;

	struct process_pid_map *ppm;
	__u64 sector;
	unsigned long long allocation_time;
	unsigned long long queue_time;
	unsigned long long dispatch_time;
	unsigned long long completion_time;
};

static int ndevices;
static struct per_dev_info *devices;
static char *get_dev_name(struct per_dev_info *, char *, int);
static int trace_rb_insert_last(struct per_dev_info *, struct trace *);

FILE *ofp = NULL;
static char *output_name;
static char *input_dir;

static unsigned long long genesis_time;
static unsigned long long last_allowed_time;
static unsigned long long stopwatch_start;	/* start from zero by default */
static unsigned long long stopwatch_end = -1ULL;	/* "infinity" */
static unsigned long read_sequence;

static int per_process_stats;
static int per_device_and_cpu_stats = 1;
static int track_ios;
static int ppi_hash_by_pid = 1;
static int verbose;
static unsigned int act_mask = -1U;
static int stats_printed;
static int bin_output_msgs = 1;
int data_is_native = -1;

static FILE *dump_fp;
static char *dump_binary;

static unsigned int t_alloc_cache;
static unsigned int bit_alloc_cache;

#define RB_BATCH_DEFAULT	(512)
static unsigned int rb_batch = RB_BATCH_DEFAULT;

static int pipeline;
static char *pipename;

static int text_output = 1;

#define is_done()	(*(volatile int *)(&done))
static volatile int done;
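/*
 * Note: "done" is presumably flipped asynchronously by the signal handling
 * set up elsewhere in blkparse; is_done() goes through a volatile pointer
 * so every check rereads the flag instead of using a cached value.
 */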

struct timespec		abs_start_time;
static unsigned long long start_timestamp;

static int have_drv_data = 0;

#define JHASH_RANDOM	(0x3af5f2ee)

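/*
 * The per-device online-CPU set is a bitmap packed into unsigned longs:
 * CPU_IDX picks the word and CPU_BIT the bit within it. For example, with
 * 64-bit longs CPUS_PER_LONG is 64, so cpu 71 lands in word 1, bit 7.
 */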
#define CPUS_PER_LONG	(8 * sizeof(unsigned long))
#define CPU_IDX(cpu)	((cpu) / CPUS_PER_LONG)
#define CPU_BIT(cpu)	((cpu) & (CPUS_PER_LONG - 1))

static void output_binary(void *buf, int len)
{
	if (dump_binary) {
		size_t n = fwrite(buf, len, 1, dump_fp);
		if (n != 1) {
			perror(dump_binary);
			fclose(dump_fp);
			dump_binary = NULL;
		}
	}
}

static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
{
	struct per_cpu_info *cpus = pdi->cpus;
	int ncpus = pdi->ncpus;
	int new_count = cpu + 1;
	int new_space, size;
	char *new_start;

	size = new_count * sizeof(struct per_cpu_info);
	cpus = realloc(cpus, size);
	if (!cpus) {
		char name[20];
		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
			get_dev_name(pdi, name, sizeof(name)), size);
		exit(1);
	}

	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
	memset(new_start, 0, new_space);

	pdi->ncpus = new_count;
	pdi->cpus = cpus;

	for (new_count = 0; new_count < pdi->ncpus; new_count++) {
		struct per_cpu_info *pci = &pdi->cpus[new_count];

		if (!pci->fd) {
			pci->fd = -1;
			memset(&pci->rb_last, 0, sizeof(pci->rb_last));
			pci->rb_last_entries = 0;
			pci->last_sequence = -1;
		}
	}
}

static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
{
	struct per_cpu_info *pci;

	if (cpu >= pdi->ncpus)
		resize_cpu_info(pdi, cpu);

	pci = &pdi->cpus[cpu];
	pci->cpu = cpu;
	return pci;
}

static int resize_devices(char *name)
{
	int size = (ndevices + 1) * sizeof(struct per_dev_info);

	devices = realloc(devices, size);
	if (!devices) {
		fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
		return 1;
	}
	memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
	devices[ndevices].name = name;
	ndevices++;
	return 0;
}

static struct per_dev_info *get_dev_info(dev_t dev)
{
	struct per_dev_info *pdi;
	int i;

	for (i = 0; i < ndevices; i++) {
		if (!devices[i].dev)
			devices[i].dev = dev;
		if (devices[i].dev == dev)
			return &devices[i];
	}

	if (resize_devices(NULL))
		return NULL;

	pdi = &devices[ndevices - 1];
	pdi->dev = dev;
	pdi->first_reported_time = 0;
	pdi->last_read_time = 0;

	return pdi;
}

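/*
 * Skip bookkeeping: each per-CPU info keeps a doubly linked list of
 * [start, end] ranges of trace sequence numbers that were never seen.
 * insert_skip() first tries to extend an adjacent existing range before
 * allocating a new node, so contiguous gaps stay coalesced.
 */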
static void insert_skip(struct per_cpu_info *pci, unsigned long start,
			unsigned long end)
{
	struct skip_info *sip;

	for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
		if (end == (sip->start - 1)) {
			sip->start = start;
			return;
		} else if (start == (sip->end + 1)) {
			sip->end = end;
			return;
		}
	}

	sip = malloc(sizeof(struct skip_info));
	sip->start = start;
	sip->end = end;
	sip->prev = sip->next = NULL;
	if (pci->skips_tail == NULL)
		pci->skips_head = pci->skips_tail = sip;
	else {
		sip->prev = pci->skips_tail;
		pci->skips_tail->next = sip;
		pci->skips_tail = sip;
	}
}

static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip)
{
	if (sip->prev == NULL) {
		if (sip->next == NULL)
			pci->skips_head = pci->skips_tail = NULL;
		else {
			pci->skips_head = sip->next;
			sip->next->prev = NULL;
		}
	} else if (sip->next == NULL) {
		pci->skips_tail = sip->prev;
		sip->prev->next = NULL;
	} else {
		sip->prev->next = sip->next;
		sip->next->prev = sip->prev;
	}

	sip->prev = sip->next = NULL;
	free(sip);
}

#define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= (sip)->end))
static int check_current_skips(struct per_cpu_info *pci, unsigned long seq)
{
	struct skip_info *sip;

	for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
		if (IN_SKIP(sip, seq)) {
			if (sip->start == seq) {
				if (sip->end == seq)
					remove_sip(pci, sip);
				else
					sip->start += 1;
			} else if (sip->end == seq)
				sip->end -= 1;
			else {
				/*
				 * seq lies strictly inside the range: split
				 * it. Save the old end before shrinking the
				 * range, or the new skip gets a bogus end.
				 */
				unsigned long end = sip->end;

				sip->end = seq - 1;
				insert_skip(pci, seq + 1, end);
			}
			return 1;
		}
	}

	return 0;
}

static void collect_pdi_skips(struct per_dev_info *pdi)
{
	struct skip_info *sip;
	int cpu;

	pdi->skips = 0;
	pdi->seq_skips = 0;

	for (cpu = 0; cpu < pdi->ncpus; cpu++) {
		struct per_cpu_info *pci = &pdi->cpus[cpu];

		for (sip = pci->skips_head; sip != NULL; sip = sip->next) {
			pdi->skips++;
			pdi->seq_skips += (sip->end - sip->start + 1);
			if (verbose)
				fprintf(stderr, "(%d,%d): skipping %lu -> %lu\n",
					MAJOR(pdi->dev), MINOR(pdi->dev),
					sip->start, sip->end);
		}
	}
}

static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu)
{
	if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) {
		int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1);
		unsigned long *map = malloc(new_max / sizeof(long));

		memset(map, 0, new_max / sizeof(long));

		if (pdi->cpu_map) {
			memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long));
			free(pdi->cpu_map);
		}

		pdi->cpu_map = map;
		pdi->cpu_map_max = new_max;
	}

	pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
}

static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu)
{
	pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu));
}

static inline int cpu_is_online(struct per_dev_info *pdi, int cpu)
{
	return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0;
}

static inline int ppm_hash_pid(pid_t pid)
{
	return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK;
}

static struct process_pid_map *find_ppm(pid_t pid)
{
	const int hash_idx = ppm_hash_pid(pid);
	struct process_pid_map *ppm;

	ppm = ppm_hash_table[hash_idx];
	while (ppm) {
		if (ppm->pid == pid)
			return ppm;

		ppm = ppm->hash_next;
	}

	return NULL;
}

static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
{
	const int hash_idx = ppm_hash_pid(pid);
	struct process_pid_map *ppm;

	ppm = find_ppm(pid);
	if (!ppm) {
		ppm = malloc(sizeof(*ppm));
		memset(ppm, 0, sizeof(*ppm));
		ppm->pid = pid;
		/*
		 * bound the copy: comm is only 16 bytes, and the struct is
		 * already zeroed so this stays NUL-terminated
		 */
		strncpy(ppm->comm, name, sizeof(ppm->comm) - 1);
		ppm->hash_next = ppm_hash_table[hash_idx];
		ppm_hash_table[hash_idx] = ppm;
	}

	return ppm;
}

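/*
 * Notify traces carry out-of-band data rather than I/O events: the
 * pid/comm mapping (BLK_TN_PROCESS), the wallclock anchor for the trace
 * start (BLK_TN_TIMESTAMP, a sec/nsec pair), and free-form kernel
 * messages (BLK_TN_MESSAGE). The payload sits right behind the header.
 */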
static void handle_notify(struct blk_io_trace *bit)
{
	void	*payload = (caddr_t) bit + sizeof(*bit);
	__u32	two32[2];

	switch (bit->action) {
	case BLK_TN_PROCESS:
		add_ppm_hash(bit->pid, payload);
		break;

	case BLK_TN_TIMESTAMP:
		if (bit->pdu_len != sizeof(two32))
			return;
		memcpy(two32, payload, sizeof(two32));
		if (!data_is_native) {
			two32[0] = be32_to_cpu(two32[0]);
			two32[1] = be32_to_cpu(two32[1]);
		}
		start_timestamp = bit->time;
		abs_start_time.tv_sec  = two32[0];
		abs_start_time.tv_nsec = two32[1];
		if (abs_start_time.tv_nsec < 0) {
			abs_start_time.tv_sec--;
			abs_start_time.tv_nsec += 1000000000;
		}

		break;

	case BLK_TN_MESSAGE:
		if (bit->pdu_len > 0) {
			char msg[bit->pdu_len + 1];

			memcpy(msg, (char *)payload, bit->pdu_len);
			msg[bit->pdu_len] = '\0';

			fprintf(ofp,
				"%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
				MAJOR(bit->device), MINOR(bit->device),
				bit->cpu, "0", (int) SECONDS(bit->time),
				(unsigned long) NANO_SECONDS(bit->time),
				0, "m", "N", msg);
		}
		break;

	default:
		/* Ignore unknown notify events */
		;
	}
}

char *find_process_name(pid_t pid)
{
	struct process_pid_map *ppm = find_ppm(pid);

	if (ppm)
		return ppm->comm;

	return NULL;
}

static inline int ppi_hash_pid(pid_t pid)
{
	return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
}

static inline int ppi_hash_name(const char *name)
{
	return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
}

static inline int ppi_hash(struct per_process_info *ppi)
{
	struct process_pid_map *ppm = ppi->ppm;

	if (ppi_hash_by_pid)
		return ppi_hash_pid(ppm->pid);

	return ppi_hash_name(ppm->comm);
}

static inline void add_ppi_to_hash(struct per_process_info *ppi)
{
	const int hash_idx = ppi_hash(ppi);

	ppi->hash_next = ppi_hash_table[hash_idx];
	ppi_hash_table[hash_idx] = ppi;
}

static inline void add_ppi_to_list(struct per_process_info *ppi)
{
	ppi->list_next = ppi_list;
	ppi_list = ppi;
	ppi_list_entries++;
}

static struct per_process_info *find_ppi_by_name(char *name)
{
	const int hash_idx = ppi_hash_name(name);
	struct per_process_info *ppi;

	ppi = ppi_hash_table[hash_idx];
	while (ppi) {
		struct process_pid_map *ppm = ppi->ppm;

		if (!strcmp(ppm->comm, name))
			return ppi;

		ppi = ppi->hash_next;
	}

	return NULL;
}

static struct per_process_info *find_ppi_by_pid(pid_t pid)
{
	const int hash_idx = ppi_hash_pid(pid);
	struct per_process_info *ppi;

	ppi = ppi_hash_table[hash_idx];
	while (ppi) {
		struct process_pid_map *ppm = ppi->ppm;

		if (ppm->pid == pid)
			return ppi;

		ppi = ppi->hash_next;
	}

	return NULL;
}

static struct per_process_info *find_ppi(pid_t pid)
{
	struct per_process_info *ppi;
	char *name;

	if (ppi_hash_by_pid)
		return find_ppi_by_pid(pid);

	name = find_process_name(pid);
	if (!name)
		return NULL;

	ppi = find_ppi_by_name(name);
	if (ppi && ppi->ppm->pid != pid)
		ppi->more_than_one = 1;

	return ppi;
}

/*
 * struct trace and blktrace allocation cache. we do potentially
 * millions of mallocs for these structures while only using at most
 * a few thousand at a time, so recycle them through a small freelist
 */
static inline void t_free(struct trace *t)
{
	if (t_alloc_cache < 1024) {
		t->next = t_alloc_list;
		t_alloc_list = t;
		t_alloc_cache++;
	} else
		free(t);
}

static inline struct trace *t_alloc(void)
{
	struct trace *t = t_alloc_list;

	if (t) {
		t_alloc_list = t->next;
		t_alloc_cache--;
		return t;
	}

	return malloc(sizeof(*t));
}

static inline void bit_free(struct blk_io_trace *bit)
{
	if (bit_alloc_cache < 1024 && !bit->pdu_len) {
		/*
		 * abuse a 64-bit field for a next pointer for the free item
		 */
		bit->time = (__u64) (unsigned long) bit_alloc_list;
		bit_alloc_list = (struct blk_io_trace *) bit;
		bit_alloc_cache++;
	} else
		free(bit);
}

static inline struct blk_io_trace *bit_alloc(void)
{
	struct blk_io_trace *bit = bit_alloc_list;

	if (bit) {
		bit_alloc_list = (struct blk_io_trace *) (unsigned long)
				 bit->time;
		bit_alloc_cache--;
		return bit;
	}

	return malloc(sizeof(*bit));
}

static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
{
	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

	rb_erase(&t->rb_node, &pci->rb_last);
	pci->rb_last_entries--;

	bit_free(t->bit);
	t_free(t);
}

static void put_trace(struct per_dev_info *pdi, struct trace *t)
{
	rb_erase(&t->rb_node, &rb_sort_root);
	rb_sort_entries--;

	trace_rb_insert_last(pdi, t);
}

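/*
 * Two rbtrees are in play: rb_sort_root holds traces still waiting to be
 * displayed, keyed on (time, device, sequence), while each CPU's rb_last
 * keeps a bounded window of already-displayed traces so check_sequence()
 * can confirm that a missing sequence number was in fact already shown.
 */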
static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct trace *__t;

	while (*p) {
		parent = *p;

		__t = rb_entry(parent, struct trace, rb_node);

		if (t->bit->time < __t->bit->time)
			p = &(*p)->rb_left;
		else if (t->bit->time > __t->bit->time)
			p = &(*p)->rb_right;
		else if (t->bit->device < __t->bit->device)
			p = &(*p)->rb_left;
		else if (t->bit->device > __t->bit->device)
			p = &(*p)->rb_right;
		else if (t->bit->sequence < __t->bit->sequence)
			p = &(*p)->rb_left;
		else	/* >= sequence */
			p = &(*p)->rb_right;
	}

	rb_link_node(&t->rb_node, parent, p);
	rb_insert_color(&t->rb_node, root);
	return 0;
}

static inline int trace_rb_insert_sort(struct trace *t)
{
	if (!trace_rb_insert(t, &rb_sort_root)) {
		rb_sort_entries++;
		return 0;
	}

	return 1;
}

static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
{
	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

	if (trace_rb_insert(t, &pci->rb_last))
		return 1;

	pci->rb_last_entries++;

	if (pci->rb_last_entries > rb_batch * pdi->nfiles) {
		struct rb_node *n = rb_first(&pci->rb_last);

		t = rb_entry(n, struct trace, rb_node);
		__put_trace_last(pdi, t);
	}

	return 0;
}

static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
				   struct rb_root *root, int order)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct trace *__t;

	while (n) {
		__t = rb_entry(n, struct trace, rb_node);
		prev = n;

		if (device < __t->bit->device)
			n = n->rb_left;
		else if (device > __t->bit->device)
			n = n->rb_right;
		else if (sequence < __t->bit->sequence)
			n = n->rb_left;
		else if (sequence > __t->bit->sequence)
			n = n->rb_right;
		else
			return __t;
	}

	/*
	 * hack - the list may not be sequence ordered because some
	 * events don't have sequence and time matched. so we end up
	 * being a little off in the rb lookup here, because we don't
	 * know the time we are looking for. compensate by browsing
	 * a little ahead from the last entry to find the match
	 */
	if (order && prev) {
		int max = 5;

		while (((n = rb_next(prev)) != NULL) && max--) {
			__t = rb_entry(n, struct trace, rb_node);

			if (__t->bit->device == device &&
			    __t->bit->sequence == sequence)
				return __t;

			prev = n;
		}
	}

	return NULL;
}

static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
					       struct per_cpu_info *pci,
					       unsigned long seq)
{
	return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0);
}

static inline int track_rb_insert(struct per_dev_info *pdi, struct io_track *iot)
{
	struct rb_node **p = &pdi->rb_track.rb_node;
	struct rb_node *parent = NULL;
	struct io_track *__iot;

	while (*p) {
		parent = *p;
		__iot = rb_entry(parent, struct io_track, rb_node);

		if (iot->sector < __iot->sector)
			p = &(*p)->rb_left;
		else if (iot->sector > __iot->sector)
			p = &(*p)->rb_right;
		else {
			fprintf(stderr,
				"sector alias (%Lu) on device %d,%d!\n",
				(unsigned long long) iot->sector,
				MAJOR(pdi->dev), MINOR(pdi->dev));
			return 1;
		}
	}

	rb_link_node(&iot->rb_node, parent, p);
	rb_insert_color(&iot->rb_node, &pdi->rb_track);
	return 0;
}

static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
{
	struct rb_node *n = pdi->rb_track.rb_node;
	struct io_track *__iot;

	while (n) {
		__iot = rb_entry(n, struct io_track, rb_node);

		if (sector < __iot->sector)
			n = n->rb_left;
		else if (sector > __iot->sector)
			n = n->rb_right;
		else
			return __iot;
	}

	return NULL;
}

static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid,
				   __u64 sector)
{
	struct io_track *iot;

	iot = __find_track(pdi, sector);
	if (!iot) {
		iot = malloc(sizeof(*iot));
		/*
		 * clear the time stamps, the log_track_* helpers test
		 * them to tell which stages of this io they have seen
		 */
		memset(iot, 0, sizeof(*iot));
		iot->ppm = find_ppm(pid);
		if (!iot->ppm)
			iot->ppm = add_ppm_hash(pid, "unknown");
		iot->sector = sector;
		track_rb_insert(pdi, iot);
	}

	return iot;
}

static void log_track_frontmerge(struct per_dev_info *pdi,
				 struct blk_io_trace *t)
{
	struct io_track *iot;

	if (!track_ios)
		return;

	iot = __find_track(pdi, t->sector + t_sec(t));
	if (!iot) {
		if (verbose)
			fprintf(stderr, "merge not found for (%d,%d): %llu\n",
				MAJOR(pdi->dev), MINOR(pdi->dev),
				(unsigned long long) t->sector + t_sec(t));
		return;
	}

	rb_erase(&iot->rb_node, &pdi->rb_track);
	iot->sector -= t_sec(t);
	track_rb_insert(pdi, iot);
}

static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
{
	struct io_track *iot;

	if (!track_ios)
		return;

	iot = find_track(pdi, t->pid, t->sector);
	iot->allocation_time = t->time;
}

static inline int is_remapper(struct per_dev_info *pdi)
{
	int major = MAJOR(pdi->dev);

	return (major == 253 || major == 9);
}

/*
 * for md/dm setups, the interesting cycle is Q -> C. So track queueing
 * time here, as dispatch time
 */
static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t)
{
	struct io_track *iot;

	if (!track_ios)
		return;
	if (!is_remapper(pdi))
		return;

	iot = find_track(pdi, t->pid, t->sector);
	iot->dispatch_time = t->time;
}

/*
 * return time between rq allocation and insertion
 */
static unsigned long long log_track_insert(struct per_dev_info *pdi,
					   struct blk_io_trace *t)
{
	unsigned long long elapsed;
	struct io_track *iot;

	if (!track_ios)
		return -1;

	iot = find_track(pdi, t->pid, t->sector);
	iot->queue_time = t->time;

	if (!iot->allocation_time)
		return -1;

	elapsed = iot->queue_time - iot->allocation_time;

	if (per_process_stats) {
		struct per_process_info *ppi = find_ppi(iot->ppm->pid);
		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

		if (ppi && elapsed > ppi->longest_allocation_wait[w])
			ppi->longest_allocation_wait[w] = elapsed;
	}

	return elapsed;
}

/*
 * return time between queue and issue
 */
static unsigned long long log_track_issue(struct per_dev_info *pdi,
					  struct blk_io_trace *t)
{
	unsigned long long elapsed;
	struct io_track *iot;

	if (!track_ios)
		return -1;
	if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
		return -1;

	iot = __find_track(pdi, t->sector);
	if (!iot) {
		if (verbose)
			fprintf(stderr, "issue not found for (%d,%d): %llu\n",
				MAJOR(pdi->dev), MINOR(pdi->dev),
				(unsigned long long) t->sector);
		return -1;
	}

	iot->dispatch_time = t->time;
	elapsed = iot->dispatch_time - iot->queue_time;

	if (per_process_stats) {
		struct per_process_info *ppi = find_ppi(iot->ppm->pid);
		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

		if (ppi && elapsed > ppi->longest_dispatch_wait[w])
			ppi->longest_dispatch_wait[w] = elapsed;
	}

	return elapsed;
}

/*
 * return time between dispatch and complete
 */
static unsigned long long log_track_complete(struct per_dev_info *pdi,
					     struct blk_io_trace *t)
{
	unsigned long long elapsed;
	struct io_track *iot;

	if (!track_ios)
		return -1;

	iot = __find_track(pdi, t->sector);
	if (!iot) {
		if (verbose)
			fprintf(stderr, "complete not found for (%d,%d): %llu\n",
				MAJOR(pdi->dev), MINOR(pdi->dev),
				(unsigned long long) t->sector);
		return -1;
	}

	iot->completion_time = t->time;
	elapsed = iot->completion_time - iot->dispatch_time;

	if (per_process_stats) {
		struct per_process_info *ppi = find_ppi(iot->ppm->pid);
		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

		if (ppi && elapsed > ppi->longest_completion_wait[w])
			ppi->longest_completion_wait[w] = elapsed;
	}

	/*
	 * kill the trace, we don't need it after completion
	 */
	rb_erase(&iot->rb_node, &pdi->rb_track);
	free(iot);

	return elapsed;
}

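/*
 * Life cycle of a tracked io: getrq stamps allocation_time, insert stamps
 * queue_time, issue stamps dispatch_time and complete stamps
 * completion_time, each returning the delta to the previous stage (or -1
 * when it cannot be computed). For remapped devices the Q event doubles
 * as the dispatch stamp, see log_track_queue() above.
 */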

static struct io_stats *find_process_io_stats(pid_t pid)
{
	struct per_process_info *ppi = find_ppi(pid);

	if (!ppi) {
		ppi = malloc(sizeof(*ppi));
		memset(ppi, 0, sizeof(*ppi));
		ppi->ppm = find_ppm(pid);
		if (!ppi->ppm)
			ppi->ppm = add_ppm_hash(pid, "unknown");
		add_ppi_to_hash(ppi);
		add_ppi_to_list(ppi);
	}

	return &ppi->io_stats;
}

static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
{
	if (pdi->name)
		snprintf(buffer, size, "%s", pdi->name);
	else
		snprintf(buffer, size, "%d,%d", MAJOR(pdi->dev), MINOR(pdi->dev));
	return buffer;
}

static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
{
	unsigned long long this = bit->time;
	unsigned long long last = pdi->last_reported_time;

	pdi->backwards = (this < last) ? 'B' : ' ';
	pdi->last_reported_time = this;
}

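/*
 * Accounting is layered: each __account_* helper bumps one io_stats
 * block, and the account_* wrapper applies it to the per-CPU stats and,
 * with per-program stats enabled (the -s option), to the owning process
 * as well. The rw flag picks the write (1) or read (0) counters.
 */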
static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
			       int rw)
{
	if (rw) {
		ios->mwrites++;
		ios->mwrite_kb += t_kb(t);
	} else {
		ios->mreads++;
		ios->mread_kb += t_kb(t);
	}
}

static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
			     int rw)
{
	__account_m(&pci->io_stats, t, rw);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_m(ios, t, rw);
	}
}

static inline void __account_pc_queue(struct io_stats *ios,
				      struct blk_io_trace *t, int rw)
{
	if (rw) {
		ios->qwrites_pc++;
		ios->qwrite_kb_pc += t_kb(t);
	} else {
		ios->qreads_pc++;
		ios->qread_kb_pc += t_kb(t);
	}
}

static inline void account_pc_queue(struct blk_io_trace *t,
				    struct per_cpu_info *pci, int rw)
{
	__account_pc_queue(&pci->io_stats, t, rw);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_pc_queue(ios, t, rw);
	}
}

static inline void __account_pc_issue(struct io_stats *ios, int rw,
				      unsigned int bytes)
{
	if (rw) {
		ios->iwrites_pc++;
		ios->iwrite_kb_pc += bytes >> 10;
	} else {
		ios->ireads_pc++;
		ios->iread_kb_pc += bytes >> 10;
	}
}

static inline void account_pc_issue(struct blk_io_trace *t,
				    struct per_cpu_info *pci, int rw)
{
	__account_pc_issue(&pci->io_stats, rw, t->bytes);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_pc_issue(ios, rw, t->bytes);
	}
}

static inline void __account_pc_requeue(struct io_stats *ios,
					struct blk_io_trace *t, int rw)
{
	if (rw) {
		ios->wrqueue_pc++;
		ios->iwrite_kb_pc -= t_kb(t);
	} else {
		ios->rrqueue_pc++;
		ios->iread_kb_pc -= t_kb(t);
	}
}

static inline void account_pc_requeue(struct blk_io_trace *t,
				      struct per_cpu_info *pci, int rw)
{
	__account_pc_requeue(&pci->io_stats, t, rw);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_pc_requeue(ios, t, rw);
	}
}

static inline void __account_pc_c(struct io_stats *ios, int rw)
{
	if (rw)
		ios->cwrites_pc++;
	else
		ios->creads_pc++;
}

static inline void account_pc_c(struct blk_io_trace *t,
				struct per_cpu_info *pci, int rw)
{
	__account_pc_c(&pci->io_stats, rw);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_pc_c(ios, rw);
	}
}

static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
				   int rw)
{
	if (rw) {
		ios->qwrites++;
		ios->qwrite_kb += t_kb(t);
	} else {
		ios->qreads++;
		ios->qread_kb += t_kb(t);
	}
}

static inline void account_queue(struct blk_io_trace *t,
				 struct per_cpu_info *pci, int rw)
{
	__account_queue(&pci->io_stats, t, rw);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_queue(ios, t, rw);
	}
}

static inline void __account_c(struct io_stats *ios, int rw, int bytes)
{
	if (rw) {
		ios->cwrites++;
		ios->cwrite_kb += bytes >> 10;
	} else {
		ios->creads++;
		ios->cread_kb += bytes >> 10;
	}
}

static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
			     int rw, int bytes)
{
	__account_c(&pci->io_stats, rw, bytes);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_c(ios, rw, bytes);
	}
}

static inline void __account_issue(struct io_stats *ios, int rw,
				   unsigned int bytes)
{
	if (rw) {
		ios->iwrites++;
		ios->iwrite_kb += bytes >> 10;
	} else {
		ios->ireads++;
		ios->iread_kb += bytes >> 10;
	}
}

static inline void account_issue(struct blk_io_trace *t,
				 struct per_cpu_info *pci, int rw)
{
	__account_issue(&pci->io_stats, rw, t->bytes);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_issue(ios, rw, t->bytes);
	}
}

static inline void __account_unplug(struct io_stats *ios, int timer)
{
	if (timer)
		ios->timer_unplugs++;
	else
		ios->io_unplugs++;
}

static inline void account_unplug(struct blk_io_trace *t,
				  struct per_cpu_info *pci, int timer)
{
	__account_unplug(&pci->io_stats, timer);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_unplug(ios, timer);
	}
}

static inline void __account_requeue(struct io_stats *ios,
				     struct blk_io_trace *t, int rw)
{
	if (rw) {
		ios->wrqueue++;
		ios->iwrite_kb -= t_kb(t);
	} else {
		ios->rrqueue++;
		ios->iread_kb -= t_kb(t);
	}
}

static inline void account_requeue(struct blk_io_trace *t,
				   struct per_cpu_info *pci, int rw)
{
	__account_requeue(&pci->io_stats, t, rw);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid);

		__account_requeue(ios, t, rw);
	}
}

static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
			 struct blk_io_trace *t, char *act)
{
	process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
}

static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
		       struct blk_io_trace *t, char *act)
{
	process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
}

static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1, 0, NULL);
}

static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
		      struct blk_io_trace *t, char *act)
{
	process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
}

static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
		      struct blk_io_trace *t, char *act)
{
	if (act[0] == 'F')
		log_track_frontmerge(pdi, t);

	process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
			char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
			char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
{
	unsigned char *buf = (unsigned char *) t + sizeof(*t);

	process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
}

static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi,
			  struct per_cpu_info *pci)
{
	int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
	int act = t->action & 0xffff;

	switch (act) {
		case __BLK_TA_QUEUE:
			log_generic(pci, t, "Q");
			account_pc_queue(t, pci, w);
			break;
		case __BLK_TA_GETRQ:
			log_generic(pci, t, "G");
			break;
		case __BLK_TA_SLEEPRQ:
			log_generic(pci, t, "S");
			break;
		case __BLK_TA_REQUEUE:
			/*
			 * can happen if we miss traces, don't let it go
			 * below zero
			 */
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_pc_requeue(t, pci, w);
			log_generic(pci, t, "R");
			break;
		case __BLK_TA_ISSUE:
			account_pc_issue(t, pci, w);
			pdi->cur_depth[w]++;
			if (pdi->cur_depth[w] > pdi->max_depth[w])
				pdi->max_depth[w] = pdi->cur_depth[w];
			log_pc(pci, t, "D");
			break;
		case __BLK_TA_COMPLETE:
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			log_pc(pci, t, "C");
			account_pc_c(t, pci, w);
			break;
		case __BLK_TA_INSERT:
			log_pc(pci, t, "I");
			break;
		default:
			fprintf(stderr, "Bad pc action %x\n", act);
			break;
	}
}

static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
			  struct per_cpu_info *pci)
{
	int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
	int act = t->action & 0xffff;

	switch (act) {
		case __BLK_TA_QUEUE:
			log_track_queue(pdi, t);
			account_queue(t, pci, w);
			log_queue(pci, t, "Q");
			break;
		case __BLK_TA_INSERT:
			log_insert(pdi, pci, t, "I");
			break;
		case __BLK_TA_BACKMERGE:
			account_m(t, pci, w);
			log_merge(pdi, pci, t, "M");
			break;
		case __BLK_TA_FRONTMERGE:
			account_m(t, pci, w);
			log_merge(pdi, pci, t, "F");
			break;
		case __BLK_TA_GETRQ:
			log_track_getrq(pdi, t);
			log_generic(pci, t, "G");
			break;
		case __BLK_TA_SLEEPRQ:
			log_generic(pci, t, "S");
			break;
		case __BLK_TA_REQUEUE:
			/*
			 * can happen if we miss traces, don't let it go
			 * below zero
			 */
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_requeue(t, pci, w);
			log_queue(pci, t, "R");
			break;
		case __BLK_TA_ISSUE:
			account_issue(t, pci, w);
			pdi->cur_depth[w]++;
			if (pdi->cur_depth[w] > pdi->max_depth[w])
				pdi->max_depth[w] = pdi->cur_depth[w];
			log_issue(pdi, pci, t, "D");
			break;
		case __BLK_TA_COMPLETE:
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_c(t, pci, w, t->bytes);
			log_complete(pdi, pci, t, "C");
			break;
		case __BLK_TA_PLUG:
			log_action(pci, t, "P");
			break;
		case __BLK_TA_UNPLUG_IO:
			account_unplug(t, pci, 0);
			log_unplug(pci, t, "U");
			break;
		case __BLK_TA_UNPLUG_TIMER:
			account_unplug(t, pci, 1);
			log_unplug(pci, t, "UT");
			break;
		case __BLK_TA_SPLIT:
			log_split(pci, t, "X");
			break;
		case __BLK_TA_BOUNCE:
			log_generic(pci, t, "B");
			break;
		case __BLK_TA_REMAP:
			log_generic(pci, t, "A");
			break;
		case __BLK_TA_DRV_DATA:
			have_drv_data = 1;
			/* dump to binary file only */
			break;
		default:
			fprintf(stderr, "Bad fs action %x\n", t->action);
			break;
	}
}

static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
		       struct per_dev_info *pdi)
{
	if (text_output) {
		if (t->action == BLK_TN_MESSAGE)
			handle_notify(t);
		else if (t->action & BLK_TC_ACT(BLK_TC_PC))
			dump_trace_pc(t, pdi, pci);
		else
			dump_trace_fs(t, pdi, pci);
	}

	if (!pdi->events)
		pdi->first_reported_time = t->time;

	pdi->events++;

	if (bin_output_msgs ||
			    !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
			      t->action == BLK_TN_MESSAGE))
		output_binary(t, sizeof(*t) + t->pdu_len);
}

/*
 * print sizes in a human-readable way: once a value passes 1,000,000 in
 * its current unit, move up to the next suffix (K, M and so on)
 */
static char *size_cnv(char *dst, unsigned long long num, int in_kb)
{
	/* 'T' belongs between 'G' and 'P' in the suffix ladder */
	char suff[] = { '\0', 'K', 'M', 'G', 'T', 'P' };
	unsigned int i = 0;

	if (in_kb)
		i++;

	while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) {
		i++;
		num /= 1000;
	}

	sprintf(dst, "%'8Lu%c", num, suff[i]);
	return dst;
}

static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
			  char *msg)
{
	static char x[256], y[256];

	fprintf(ofp, "%s\n", msg);

	fprintf(ofp, " Reads Queued:    %s, %siB\t", size_cnv(x, ios->qreads, 0), size_cnv(y, ios->qread_kb, 1));
	fprintf(ofp, " Writes Queued:    %s, %siB\n", size_cnv(x, ios->qwrites, 0), size_cnv(y, ios->qwrite_kb, 1));
	fprintf(ofp, " Read Dispatches: %s, %siB\t", size_cnv(x, ios->ireads, 0), size_cnv(y, ios->iread_kb, 1));
	fprintf(ofp, " Write Dispatches: %s, %siB\n", size_cnv(x, ios->iwrites, 0), size_cnv(y, ios->iwrite_kb, 1));
	fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
	fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
	fprintf(ofp, " Reads Completed: %s, %siB\t", size_cnv(x, ios->creads, 0), size_cnv(y, ios->cread_kb, 1));
	fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1));
	fprintf(ofp, " Read Merges:     %s, %siB\t", size_cnv(x, ios->mreads, 0), size_cnv(y, ios->mread_kb, 1));
	fprintf(ofp, " Write Merges:     %s, %siB\n", size_cnv(x, ios->mwrites, 0), size_cnv(y, ios->mwrite_kb, 1));
	if (pdi) {
		fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
		fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
	}
	if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc ||
	    ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) {
		fprintf(ofp, " PC Reads Queued: %s, %siB\t", size_cnv(x, ios->qreads_pc, 0), size_cnv(y, ios->qread_kb_pc, 1));
		fprintf(ofp, " PC Writes Queued: %s, %siB\n", size_cnv(x, ios->qwrites_pc, 0), size_cnv(y, ios->qwrite_kb_pc, 1));
		fprintf(ofp, " PC Read Disp.:   %s, %siB\t", size_cnv(x, ios->ireads_pc, 0), size_cnv(y, ios->iread_kb_pc, 1));
		fprintf(ofp, " PC Write Disp.:   %s, %siB\n", size_cnv(x, ios->iwrites_pc, 0), size_cnv(y, ios->iwrite_kb_pc, 1));
		fprintf(ofp, " PC Reads Req.:   %s\t\t", size_cnv(x, ios->rrqueue_pc, 0));
		fprintf(ofp, " PC Writes Req.:   %s\n", size_cnv(x, ios->wrqueue_pc, 0));
		fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0));
		fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0));
	}
	fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
	fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
}

static void dump_wait_stats(struct per_process_info *ppi)
{
	unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
	unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
	unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
	unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
	unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
	unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;

	fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
	fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
	fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
	fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
	fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
	fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
}

static int ppi_name_compare(const void *p1, const void *p2)
{
	struct per_process_info *ppi1 = *((struct per_process_info **) p1);
	struct per_process_info *ppi2 = *((struct per_process_info **) p2);
	int res;

	res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm);
	if (!res)
		/* make this a proper three-way comparison for qsort */
		res = (ppi1->ppm->pid > ppi2->ppm->pid) -
		      (ppi1->ppm->pid < ppi2->ppm->pid);

	return res;
}

static void sort_process_list(void)
{
	struct per_process_info **ppis;
	struct per_process_info *ppi;
	int i = 0;

	ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));

	ppi = ppi_list;
	while (ppi) {
		ppis[i++] = ppi;
		ppi = ppi->list_next;
	}

	qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);

	i = ppi_list_entries - 1;
	ppi_list = NULL;
	while (i >= 0) {
		ppi = ppis[i];

		ppi->list_next = ppi_list;
		ppi_list = ppi;
		i--;
	}

	free(ppis);
}

static void show_process_stats(void)
{
	struct per_process_info *ppi;

	sort_process_list();

	ppi = ppi_list;
	while (ppi) {
		struct process_pid_map *ppm = ppi->ppm;
		char name[64];

		if (ppi->more_than_one)
			sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid);
		else
			sprintf(name, "%s (%u)", ppm->comm, ppm->pid);

		dump_io_stats(NULL, &ppi->io_stats, name);
		dump_wait_stats(ppi);
		ppi = ppi->list_next;
	}

	fprintf(ofp, "\n");
}

static void show_device_and_cpu_stats(void)
{
	struct per_dev_info *pdi;
	struct per_cpu_info *pci;
	struct io_stats total, *ios;
	unsigned long long rrate, wrate, msec;
	int i, j, pci_events;
	char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
	char name[32];
	double ratio;

	for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {

		memset(&total, 0, sizeof(total));
		pci_events = 0;

		if (i > 0)
			fprintf(ofp, "\n");

		for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
			if (!pci->nelems)
				continue;

			ios = &pci->io_stats;
			total.qreads += ios->qreads;
			total.qwrites += ios->qwrites;
			total.creads += ios->creads;
			total.cwrites += ios->cwrites;
			total.mreads += ios->mreads;
			total.mwrites += ios->mwrites;
			total.ireads += ios->ireads;
			total.iwrites += ios->iwrites;
			total.rrqueue += ios->rrqueue;
			total.wrqueue += ios->wrqueue;
			total.qread_kb += ios->qread_kb;
			total.qwrite_kb += ios->qwrite_kb;
			total.cread_kb += ios->cread_kb;
			total.cwrite_kb += ios->cwrite_kb;
			total.iread_kb += ios->iread_kb;
			total.iwrite_kb += ios->iwrite_kb;
			total.mread_kb += ios->mread_kb;
			total.mwrite_kb += ios->mwrite_kb;

			total.qreads_pc += ios->qreads_pc;
			total.qwrites_pc += ios->qwrites_pc;
			total.creads_pc += ios->creads_pc;
			total.cwrites_pc += ios->cwrites_pc;
			total.ireads_pc += ios->ireads_pc;
			total.iwrites_pc += ios->iwrites_pc;
			total.rrqueue_pc += ios->rrqueue_pc;
			total.wrqueue_pc += ios->wrqueue_pc;
			total.qread_kb_pc += ios->qread_kb_pc;
			total.qwrite_kb_pc += ios->qwrite_kb_pc;
			total.iread_kb_pc += ios->iread_kb_pc;
			total.iwrite_kb_pc += ios->iwrite_kb_pc;

			total.timer_unplugs += ios->timer_unplugs;
			total.io_unplugs += ios->io_unplugs;

			snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
				 j, get_dev_name(pdi, name, sizeof(name)));
			dump_io_stats(pdi, ios, line);
			pci_events++;
		}

		if (pci_events > 1) {
			fprintf(ofp, "\n");
			snprintf(line, sizeof(line) - 1, "Total (%s):",
				 get_dev_name(pdi, name, sizeof(name)));
			dump_io_stats(NULL, &total, line);
		}

		wrate = rrate = 0;
		msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
		if (msec) {
			rrate = 1000 * total.cread_kb / msec;
			wrate = 1000 * total.cwrite_kb / msec;
		}

		fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
			rrate, wrate);
		fprintf(ofp, "Events (%s): %'Lu entries\n",
			get_dev_name(pdi, line, sizeof(line)), pdi->events);

		collect_pdi_skips(pdi);
		if (!pdi->skips && !pdi->events)
			ratio = 0.0;
		else
			ratio = 100.0 * ((double)pdi->seq_skips /
					(double)(pdi->events + pdi->seq_skips));
		fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
			pdi->skips, pdi->seq_skips, ratio);
	}
}

static void find_genesis(void)
{
	struct trace *t = trace_list;

	genesis_time = -1ULL;
	while (t != NULL) {
		if (t->bit->time < genesis_time)
			genesis_time = t->bit->time;

		t = t->next;
	}

	/* The time stamp record will usually be the first
	 * record in the trace, but not always.
	 */
	if (start_timestamp && start_timestamp != genesis_time) {
		long delta = genesis_time - start_timestamp;

		abs_start_time.tv_sec  += SECONDS(delta);
		abs_start_time.tv_nsec += NANO_SECONDS(delta);
		if (abs_start_time.tv_nsec < 0) {
			abs_start_time.tv_nsec += 1000000000;
			abs_start_time.tv_sec -= 1;
		} else if (abs_start_time.tv_nsec > 1000000000) {
			abs_start_time.tv_nsec -= 1000000000;
			abs_start_time.tv_sec += 1;
		}
	}
}

static inline int check_stopwatch(struct blk_io_trace *bit)
{
	if (bit->time < stopwatch_end &&
	    bit->time >= stopwatch_start)
		return 0;

	return 1;
}

/*
 * return youngest entry read
 */
static int sort_entries(unsigned long long *youngest)
{
	struct per_dev_info *pdi = NULL;
	struct per_cpu_info *pci = NULL;
	struct trace *t;

	if (!genesis_time)
		find_genesis();

	*youngest = 0;
	while ((t = trace_list) != NULL) {
		struct blk_io_trace *bit = t->bit;

		trace_list = t->next;

		bit->time -= genesis_time;

		if (bit->time < *youngest || !*youngest)
			*youngest = bit->time;

		if (!pdi || pdi->dev != bit->device) {
			pdi = get_dev_info(bit->device);
			pci = NULL;
		}

		if (!pci || pci->cpu != bit->cpu)
			pci = get_cpu_info(pdi, bit->cpu);

		if (bit->sequence < pci->smallest_seq_read)
			pci->smallest_seq_read = bit->sequence;

		if (check_stopwatch(bit)) {
			bit_free(bit);
			t_free(t);
			continue;
		}

		if (trace_rb_insert_sort(t))
			return -1;
	}

	return 0;
}

/*
 * to continue, we must have traces from all online cpus in the tree
 */
static int check_cpu_map(struct per_dev_info *pdi)
{
	unsigned long *cpu_map;
	struct rb_node *n;
	struct trace *__t;
	unsigned int i;
	int ret, cpu;

	/*
	 * create a map of the cpus we have traces for
	 */
	cpu_map = malloc(pdi->cpu_map_max / sizeof(long));
	/* zero it before OR-ing bits in below */
	memset(cpu_map, 0, pdi->cpu_map_max / sizeof(long));
	n = rb_first(&rb_sort_root);
	while (n) {
		__t = rb_entry(n, struct trace, rb_node);
		cpu = __t->bit->cpu;

		cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
		n = rb_next(n);
	}

	/*
	 * we can't continue if pdi->cpu_map has entries set that we don't
	 * have in the sort rbtree. the opposite is not a problem, though
	 */
	ret = 0;
	for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) {
		if (pdi->cpu_map[i] & ~(cpu_map[i])) {
			ret = 1;
			break;
		}
	}

	free(cpu_map);
	return ret;
}

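/*
 * Per-CPU sequence checking: traces from one CPU must be displayed in
 * sequence order. If the next expected sequence hasn't arrived yet we
 * either wait (more input may come), check whether it was already shown
 * (rb_last), or, on the final pass, record the gap as a skip.
 */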
static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
{
	struct blk_io_trace *bit = t->bit;
	unsigned long expected_sequence;
	struct per_cpu_info *pci;
	struct trace *__t;

	pci = get_cpu_info(pdi, bit->cpu);
	expected_sequence = pci->last_sequence + 1;

	if (!expected_sequence) {
		/*
		 * 1 should be the first entry, just allow it
		 */
		if (bit->sequence == 1)
			return 0;
		if (bit->sequence == pci->smallest_seq_read)
			return 0;

		return check_cpu_map(pdi);
	}

	if (bit->sequence == expected_sequence)
		return 0;

	/*
	 * we may not have seen that sequence yet. if we are not doing
	 * the final run, break and wait for more entries.
	 */
	if (expected_sequence < pci->smallest_seq_read) {
		__t = trace_rb_find_last(pdi, pci, expected_sequence);
		if (!__t)
			goto skip;

		__put_trace_last(pdi, __t);
		return 0;
	} else if (!force) {
		return 1;
	} else {
skip:
		if (check_current_skips(pci, bit->sequence))
			return 0;

		if (expected_sequence < bit->sequence)
			insert_skip(pci, expected_sequence, bit->sequence - 1);
		return 0;
	}
}

static void show_entries_rb(int force)
{
	struct per_dev_info *pdi = NULL;
	struct per_cpu_info *pci = NULL;
	struct blk_io_trace *bit;
	struct rb_node *n;
	struct trace *t;

	while ((n = rb_first(&rb_sort_root)) != NULL) {
		if (is_done() && !force && !pipeline)
			break;

		t = rb_entry(n, struct trace, rb_node);
		bit = t->bit;

		if (read_sequence - t->read_sequence < 1 && !force)
			break;

		if (!pdi || pdi->dev != bit->device) {
			pdi = get_dev_info(bit->device);
			pci = NULL;
		}

		if (!pdi) {
			fprintf(stderr, "Unknown device ID? (%d,%d)\n",
				MAJOR(bit->device), MINOR(bit->device));
			break;
		}

		if (check_sequence(pdi, t, force))
			break;

		if (!force && bit->time > last_allowed_time)
			break;

		check_time(pdi, bit);

		if (!pci || pci->cpu != bit->cpu)
			pci = get_cpu_info(pdi, bit->cpu);

		pci->last_sequence = bit->sequence;

		pci->nelems++;

		if (bit->action & (act_mask << BLK_TC_SHIFT))
			dump_trace(bit, pci, pdi);

		put_trace(pdi, t);
	}
}

static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock)
{
	int ret, bytes_left, fl;
	void *p;

	if (block != *fdblock) {
		fl = fcntl(fd, F_GETFL);

		if (!block) {
			*fdblock = 0;
			fcntl(fd, F_SETFL, fl | O_NONBLOCK);
		} else {
			*fdblock = 1;
			fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
		}
	}

	bytes_left = bytes;
	p = buffer;
	while (bytes_left > 0) {
		ret = read(fd, p, bytes_left);
		if (!ret)
			return 1;
		else if (ret < 0) {
			if (errno != EAGAIN) {
				perror("read");
				return -1;
			}

			/*
			 * never do partial reads. we can return if we
			 * didn't read anything and we should not block,
			 * otherwise wait for data
			 */
			if ((bytes_left == bytes) && !block)
				return 1;

			usleep(10);
			continue;
		} else {
			p += ret;
			bytes_left -= ret;
		}
	}

	return 0;
}
2137
2138static inline __u16 get_pdulen(struct blk_io_trace *bit)
2139{
2140	if (data_is_native)
2141		return bit->pdu_len;
2142
2143	return __bswap_16(bit->pdu_len);
2144}
2145
2146static inline __u32 get_magic(struct blk_io_trace *bit)
2147{
2148	if (data_is_native)
2149		return bit->magic;
2150
2151	return __bswap_32(bit->magic);
2152}
2153
2154static int read_events(int fd, int always_block, int *fdblock)
2155{
2156	struct per_dev_info *pdi = NULL;
2157	unsigned int events = 0;
2158
2159	while (!is_done() && events < rb_batch) {
2160		struct blk_io_trace *bit;
2161		struct trace *t;
2162		int pdu_len, should_block, ret;
2163		__u32 magic;
2164
2165		bit = bit_alloc();
2166
2167		should_block = !events || always_block;
2168
2169		ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock);
2170		if (ret) {
2171			bit_free(bit);
2172			if (!events && ret < 0)
2173				events = ret;
2174			break;
2175		}
2176
2177		/*
2178		 * look at first trace to check whether we need to convert
2179		 * data in the future
2180		 */
2181		if (data_is_native == -1 && check_data_endianness(bit->magic))
2182			break;
2183
2184		magic = get_magic(bit);
2185		if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2186			fprintf(stderr, "Bad magic %x\n", magic);
2187			break;
2188		}
2189
		pdu_len = get_pdulen(bit);
		if (pdu_len) {
			void *ptr = realloc(bit, sizeof(*bit) + pdu_len);

			if (!ptr) {
				bit_free(bit);
				break;
			}

			if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) {
				bit_free(ptr);
				break;
			}

			bit = ptr;
		}

		trace_to_cpu(bit);

		if (verify_trace(bit)) {
			bit_free(bit);
			continue;
		}

		/*
		 * not a real trace, so grab and handle it here
		 */
		if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
			handle_notify(bit);
			output_binary(bit, sizeof(*bit) + bit->pdu_len);
			bit_free(bit);
			continue;
		}

		t = t_alloc();
		memset(t, 0, sizeof(*t));
		t->bit = bit;
		t->read_sequence = read_sequence;

		t->next = trace_list;
		trace_list = t;

		if (!pdi || pdi->dev != bit->device)
			pdi = get_dev_info(bit->device);

		if (bit->time > pdi->last_read_time)
			pdi->last_read_time = bit->time;

		events++;
	}

	return events;
}

/*
 * Managing input streams
 */

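/*
 * One ms_stream per (device, cpu) input file. The streams form a singly
 * linked list (ms_head) sorted by the timestamp of each stream's oldest
 * queued trace, so the globally oldest trace is always found at the
 * head: effectively a k-way merge of the already time-ordered per-cpu
 * files.
 */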
struct ms_stream {
	struct ms_stream *next;
	struct trace *first, *last;
	struct per_dev_info *pdi;
	unsigned int cpu;
};

#define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ ((c) & 0xff))

struct ms_stream *ms_head;
struct ms_stream *ms_hash[256];

static void ms_sort(struct ms_stream *msp);
static int ms_prime(struct ms_stream *msp);

static inline struct trace *ms_peek(struct ms_stream *msp)
{
	return (msp == NULL) ? NULL : msp->first;
}

static inline __u64 ms_peek_time(struct ms_stream *msp)
{
	return ms_peek(msp)->bit->time;
}

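/*
 * Called on the list head after its oldest trace changed: push it back
 * down the sorted list if it no longer holds the oldest trace.
 */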
static inline void ms_resort(struct ms_stream *msp)
{
	if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) {
		ms_head = msp->next;
		msp->next = NULL;
		ms_sort(msp);
	}
}

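/*
 * Dequeue the head stream's oldest trace; when the queue drains, refill
 * it from disk, unlinking the stream entirely once its file is
 * exhausted.
 */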
static inline void ms_deq(struct ms_stream *msp)
{
	msp->first = msp->first->next;
	if (!msp->first) {
		msp->last = NULL;
		if (!ms_prime(msp)) {
			ms_head = msp->next;
			msp->next = NULL;
			return;
		}
	}

	ms_resort(msp);
}

static void ms_sort(struct ms_stream *msp)
{
	__u64 msp_t = ms_peek_time(msp);
	struct ms_stream *this_msp = ms_head;

	if (this_msp == NULL)
		ms_head = msp;
	else if (msp_t < ms_peek_time(this_msp)) {
		msp->next = this_msp;
		ms_head = msp;
	} else {
		while (this_msp->next && ms_peek_time(this_msp->next) < msp_t)
			this_msp = this_msp->next;

		msp->next = this_msp->next;
		this_msp->next = msp;
	}
}

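/*
 * Refill a stream with up to rb_batch traces from its per-cpu file.
 * Returns the number of traces queued; on EOF or a read/parse error the
 * CPU is marked offline and its fd closed, ending the stream.
 */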
static int ms_prime(struct ms_stream *msp)
{
	__u32 magic;
	unsigned int i;
	struct trace *t;
	struct per_dev_info *pdi = msp->pdi;
	struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu);
	struct blk_io_trace *bit = NULL;
	int ret, pdu_len, ndone = 0;

	for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) {
		bit = bit_alloc();
		ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock);
		if (ret)
			goto err;

		if (data_is_native == -1 && check_data_endianness(bit->magic))
			goto err;

		magic = get_magic(bit);
		if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			fprintf(stderr, "Bad magic %x\n", magic);
			goto err;
		}

		pdu_len = get_pdulen(bit);
		if (pdu_len) {
			void *ptr = realloc(bit, sizeof(*bit) + pdu_len);

			if (!ptr)
				goto err;

			ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len,
					1, &pci->fdblock);
			if (ret) {
				free(ptr);
				bit = NULL;
				goto err;
			}

			bit = ptr;
		}

		trace_to_cpu(bit);
		if (verify_trace(bit))
			goto err;

		if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
			handle_notify(bit);
			output_binary(bit, sizeof(*bit) + bit->pdu_len);
			bit_free(bit);

			i -= 1;
			continue;
		}

		if (bit->time > pdi->last_read_time)
			pdi->last_read_time = bit->time;

		t = t_alloc();
		memset(t, 0, sizeof(*t));
		t->bit = bit;

		if (msp->first == NULL)
			msp->first = msp->last = t;
		else {
			msp->last->next = t;
			msp->last = t;
		}

		ndone++;
	}

	return ndone;

err:
	if (bit)
		bit_free(bit);

	cpu_mark_offline(pdi, pci->cpu);
	close(pci->fd);
	pci->fd = -1;

	return ndone;
}

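/*
 * Create a stream for one per-cpu input file and, if it yields any
 * traces, merge it into the sorted stream list.
 */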
static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
{
	struct ms_stream *msp = malloc(sizeof(*msp));

	msp->next = NULL;
	msp->first = msp->last = NULL;
	msp->pdi = pdi;
	msp->cpu = cpu;

	if (ms_prime(msp))
		ms_sort(msp);

	return msp;
}

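/*
 * Open <name>.blktrace.<cpu> for reading. Returns 1 if the file exists
 * (a stream is set up unless the file is empty), 0 when it is missing
 * or cannot be opened, which ends the per-cpu scan.
 */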
static int setup_file(struct per_dev_info *pdi, int cpu)
{
	int len = 0;
	struct stat st;
	char *p, *dname;
	struct per_cpu_info *pci = get_cpu_info(pdi, cpu);

	pci->cpu = cpu;
	pci->fdblock = -1;

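	/*
	 * If the trace name carries a directory component, split it off:
	 * dirname() may point into the first strdup()'d buffer, which
	 * must therefore stay live for as long as input_dir is used.
	 */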
	p = strdup(pdi->name);
	dname = dirname(p);
	if (strcmp(dname, ".")) {
		input_dir = dname;
		p = strdup(pdi->name);
		strcpy(pdi->name, basename(p));
	}
	free(p);

	if (input_dir)
		len = sprintf(pci->fname, "%s/", input_dir);

	snprintf(pci->fname + len, sizeof(pci->fname) - 1 - len,
		 "%s.blktrace.%d", pdi->name, pci->cpu);
	if (stat(pci->fname, &st) < 0)
		return 0;
	if (!st.st_size)
		return 1;

	pci->fd = open(pci->fname, O_RDONLY);
	if (pci->fd < 0) {
		perror(pci->fname);
		return 0;
	}

	printf("Input file %s added\n", pci->fname);
	cpu_mark_online(pdi, pci->cpu);

	pdi->nfiles++;
	ms_alloc(pdi, pci->cpu);

	return 1;
}

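/*
 * Emit the oldest trace of the given stream. Returns 0 once the
 * stopwatch end has been passed, 1 to keep going.
 */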
static int handle(struct ms_stream *msp)
{
	struct trace *t;
	struct per_dev_info *pdi;
	struct per_cpu_info *pci;
	struct blk_io_trace *bit;

	t = ms_peek(msp);

	bit = t->bit;
	pdi = msp->pdi;
	pci = get_cpu_info(pdi, msp->cpu);
	pci->nelems++;
	bit->time -= genesis_time;

	if (t->bit->time > stopwatch_end)
		return 0;

	pdi->last_reported_time = bit->time;
	if ((bit->action & (act_mask << BLK_TC_SHIFT)) &&
	    t->bit->time >= stopwatch_start)
		dump_trace(bit, pci, pdi);

	ms_deq(msp);

	if (text_output)
		trace_rb_insert_last(pdi, t);
	else {
		bit_free(t->bit);
		t_free(t);
	}

	return 1;
}

/*
 * Check if we need to sanitize the name: accept 'foo' as-is, but if
 * 'foo.blktrace.X' is given, strip it back down to 'foo' so no per-cpu
 * files are missed.
 */
static int name_fixup(char *name)
{
	char *b;

	if (!name)
		return 1;

	b = strstr(name, ".blktrace.");
	if (b)
		*b = '\0';

	return 0;
}

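/*
 * File mode: open every per-cpu file of every device, then repeatedly
 * hand the globally oldest queued trace to handle().
 */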
static int do_file(void)
{
	int i, cpu, ret;
	struct per_dev_info *pdi;

	/*
	 * first prepare all files for reading
	 */
	for (i = 0; i < ndevices; i++) {
		pdi = &devices[i];
		ret = name_fixup(pdi->name);
		if (ret)
			return ret;

		for (cpu = 0; setup_file(pdi, cpu); cpu++)
			;
	}

	/*
	 * Get the initial time stamp
	 */
	if (ms_head)
		genesis_time = ms_peek_time(ms_head);

	/*
	 * Keep processing traces while any are left
	 */
	while (!is_done() && ms_head && handle(ms_head))
		;

	return 0;
}

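/*
 * Pipe mode: read batches of traces, time-sort them in rb_sort_root,
 * and flush incrementally; once the input dries up, force out whatever
 * is still queued.
 */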
static void do_pipe(int fd)
{
	unsigned long long youngest;
	int events, fdblock;

	last_allowed_time = -1ULL;
	fdblock = -1;
	while ((events = read_events(fd, 0, &fdblock)) > 0) {
		read_sequence++;

#if 0
		smallest_seq_read = -1U;
#endif

		if (sort_entries(&youngest))
			break;

		if (youngest > stopwatch_end)
			break;

		show_entries_rb(0);
	}

	if (rb_sort_entries)
		show_entries_rb(1);
}

static int do_fifo(void)
{
	int fd;

	if (!strcmp(pipename, "-"))
		fd = dup(STDIN_FILENO);
	else
		fd = open(pipename, O_RDONLY);

	if (fd == -1) {
		perror(!strcmp(pipename, "-") ? "dup stdin" : pipename);
		return -1;
	}

	do_pipe(fd);
	close(fd);
	return 0;
}

static void show_stats(void)
{
	if (!ofp)
		return;
	if (stats_printed)
		return;

	stats_printed = 1;

	if (per_process_stats)
		show_process_stats();

	if (per_device_and_cpu_stats)
		show_device_and_cpu_stats();

	fflush(ofp);
}

static void handle_sigint(__attribute__((__unused__)) int sig)
{
	done = 1;
}

/*
 * Extract start and end times from a string, allowing us to specify
 * a time interval of interest within a trace.
 * Format: "end" (start is zero) or "start:end".
 */
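/*
 * For example, "-w 10" keeps traces from 0 through 10 seconds, and
 * "-w 5:10" keeps traces from 5 through 10 seconds.
 */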
static int find_stopwatch_interval(char *string)
{
	double value;
	char *sp;

	value = strtod(string, &sp);
	if (sp == string) {
		fprintf(stderr, "Invalid stopwatch timer: %s\n", string);
		return 1;
	}
	if (*sp == ':') {
		stopwatch_start = DOUBLE_TO_NANO_ULL(value);
		string = sp + 1;
		value = strtod(string, &sp);
		if (sp == string || *sp != '\0') {
			fprintf(stderr, "Invalid stopwatch end time: %s\n",
				string);
			return 1;
		}
	} else if (*sp != '\0') {
		fprintf(stderr, "Invalid stopwatch start timer: %s\n", string);
		return 1;
	}
	stopwatch_end = DOUBLE_TO_NANO_ULL(value);
	if (stopwatch_end <= stopwatch_start) {
		fprintf(stderr, "Invalid stopwatch interval: %llu -> %llu\n",
			stopwatch_start, stopwatch_end);
		return 1;
	}

	return 0;
}

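/* Treat "-" (stdin) and FIFOs as pipe input. */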
static int is_pipe(const char *str)
{
	struct stat st;

	if (!strcmp(str, "-"))
		return 1;
	if (!stat(str, &st) && S_ISFIFO(st.st_mode))
		return 1;

	return 0;
}

#define S_OPTS  "a:A:b:D:d:f:F:hi:o:Oqstw:vVM"
static char usage_str[] =    "\n\n" \
	"-i <file>           | --input=<file>\n" \
	"[ -a <action field> | --act-mask=<action field> ]\n" \
	"[ -A <action mask>  | --set-mask=<action mask> ]\n" \
	"[ -b <traces>       | --batch=<traces> ]\n" \
	"[ -d <file>         | --dump-binary=<file> ]\n" \
	"[ -D <dir>          | --input-directory=<dir> ]\n" \
	"[ -f <format>       | --format=<format> ]\n" \
	"[ -F <spec>         | --format-spec=<spec> ]\n" \
	"[ -h                | --hash-by-name ]\n" \
	"[ -o <file>         | --output=<file> ]\n" \
	"[ -O                | --no-text-output ]\n" \
	"[ -q                | --quiet ]\n" \
	"[ -s                | --per-program-stats ]\n" \
	"[ -t                | --track-ios ]\n" \
	"[ -w <time>         | --stopwatch=<time> ]\n" \
	"[ -M                | --no-msgs ]\n" \
	"[ -v                | --verbose ]\n" \
	"[ -V                | --version ]\n\n" \
	"\t-b stdin read batching\n" \
	"\t-d Output file. If specified, binary data is written to file\n" \
	"\t-D Directory to prepend to input file names\n" \
	"\t-f Output format. Customize the output format. The format field\n" \
	"\t   identifiers can be found in the documentation\n" \
	"\t-F Format specification. Can be found in the documentation\n" \
	"\t-h Hash processes by name, not pid\n" \
	"\t-i Input file containing trace data, or '-' for stdin\n" \
	"\t-o Output file. If not given, output is stdout\n" \
	"\t-O Do NOT output text data\n" \
	"\t-q Quiet. Don't display any stats at the end of the trace\n" \
	"\t-s Show per-program io statistics\n" \
	"\t-t Track individual ios. Will tell you the time a request took\n" \
	"\t   to get queued, to get dispatched, and to get completed\n" \
	"\t-w Only parse data within the given time interval in seconds\n" \
	"\t   ('end' or 'start:end'). If 'start' isn't given, it defaults to 0\n" \
	"\t-M Do not output messages to binary file\n" \
	"\t-v More verbose for marginal errors\n" \
	"\t-V Print program version info\n\n";

static void usage(char *prog)
{
	fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
}


int main(int argc, char *argv[])
{
	int i, c, ret, mode;
	int act_mask_tmp = 0;
	char *ofp_buffer = NULL;
	char *bin_ofp_buffer = NULL;

	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
		switch (c) {
		case 'a':
			i = find_mask_map(optarg);
			if (i < 0) {
				fprintf(stderr, "Invalid action mask %s\n",
					optarg);
				return 1;
			}
			act_mask_tmp |= i;
			break;

		case 'A':
			if ((sscanf(optarg, "%x", &i) != 1) ||
			    !valid_act_opt(i)) {
				fprintf(stderr,
					"Invalid set action mask %s/0x%x\n",
					optarg, i);
				return 1;
			}
			act_mask_tmp = i;
			break;
		case 'i':
			if (is_pipe(optarg) && !pipeline) {
				pipeline = 1;
				pipename = strdup(optarg);
			} else if (resize_devices(optarg) != 0)
				return 1;
			break;
		case 'D':
			input_dir = optarg;
			break;
		case 'o':
			output_name = optarg;
			break;
		case 'O':
			text_output = 0;
			break;
		case 'b':
			rb_batch = atoi(optarg);
			if (rb_batch <= 0)
				rb_batch = RB_BATCH_DEFAULT;
			break;
		case 's':
			per_process_stats = 1;
			break;
		case 't':
			track_ios = 1;
			break;
		case 'q':
			per_device_and_cpu_stats = 0;
			break;
		case 'w':
			if (find_stopwatch_interval(optarg) != 0)
				return 1;
			break;
		case 'f':
			set_all_format_specs(optarg);
			break;
		case 'F':
			if (add_format_spec(optarg) != 0)
				return 1;
			break;
		case 'h':
			ppi_hash_by_pid = 0;
			break;
		case 'v':
			verbose++;
			break;
		case 'V':
			printf("%s version %s\n", argv[0], blkparse_version);
			return 0;
		case 'd':
			dump_binary = optarg;
			break;
		case 'M':
			bin_output_msgs = 0;
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	while (optind < argc) {
		if (is_pipe(argv[optind]) && !pipeline) {
			pipeline = 1;
			pipename = strdup(argv[optind]);
		} else if (resize_devices(argv[optind]) != 0)
			return 1;
		optind++;
	}

	if (!pipeline && !ndevices) {
		usage(argv[0]);
		return 1;
	}

	if (act_mask_tmp != 0)
		act_mask = act_mask_tmp;

	memset(&rb_sort_root, 0, sizeof(rb_sort_root));

	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);

	setlocale(LC_NUMERIC, "en_US");

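	/*
	 * Text output is line buffered on stdout so pipes see traces as
	 * they are parsed, and fully buffered when writing to a file.
	 */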
	if (text_output) {
		if (!output_name) {
			ofp = fdopen(STDOUT_FILENO, "w");
			mode = _IOLBF;
		} else {
			char ofname[128];

			snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
			ofp = fopen(ofname, "w");
			mode = _IOFBF;
		}

		if (!ofp) {
			perror("fopen");
			return 1;
		}

		ofp_buffer = malloc(4096);
		if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
			perror("setvbuf");
			return 1;
		}
	}

	if (dump_binary) {
		dump_fp = fopen(dump_binary, "w");
		if (!dump_fp) {
			perror(dump_binary);
			dump_binary = NULL;
			return 1;
		}
		bin_ofp_buffer = malloc(128 * 1024);
		if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) {
			perror("setvbuf binary");
			return 1;
		}
	}

	if (pipeline)
		ret = do_fifo();
	else
		ret = do_file();

	if (!ret)
		show_stats();

	if (have_drv_data && !dump_binary)
		printf("\ndiscarded traces containing low-level device driver "
		       "specific data (only available in binary output)\n");

	if (ofp_buffer) {
		fflush(ofp);
		free(ofp_buffer);
	}
	if (bin_ofp_buffer) {
		fflush(dump_fp);
		free(bin_ofp_buffer);
	}
	return ret;
}