1/*
2 * I/O monitor based on block queue trace data
3 *
4 * Copyright IBM Corp. 2008
5 *
6 * Author(s): Martin Peschke <mp3@de.ibm.com>
7 *
8 *  This program is free software; you can redistribute it and/or modify
9 *  it under the terms of the GNU General Public License as published by
10 *  the Free Software Foundation; either version 2 of the License, or
11 *  (at your option) any later version.
12 *
13 *  This program is distributed in the hope that it will be useful,
14 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 *  GNU General Public License for more details.
17 *
18 *  You should have received a copy of the GNU General Public License
19 *  along with this program; if not, write to the Free Software
20 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21 */
22
23#include <sys/types.h>
24#include <sys/stat.h>
25#include <fcntl.h>
26#include <unistd.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <signal.h>
31#include <getopt.h>
32#include <errno.h>
33#include <locale.h>
34#include <libgen.h>
35#include <sys/msg.h>
36#include <pthread.h>
37#include <time.h>
38
39#include "blktrace.h"
40#include "rbtree.h"
41#include "jhash.h"
42#include "blkiomon.h"
43
44struct trace {
45	struct blk_io_trace bit;
46	struct rb_node node;
47	struct trace *next;
48	long sequence;
49};
50
/*
 * Result of an unsuccessful red-black tree lookup: the place where a node
 * for the missing key has to be linked in.
 */
struct rb_search {
	/* address of the child pointer that would reference the new node */
	struct rb_node **node_ptr;

	/* node that owns that child pointer; NULL if the tree was empty */
	struct rb_node *parent;
};
55
56struct dstat_msg {
57	long mtype;
58	struct blkiomon_stat stat;
59};
60
61struct dstat {
62	struct dstat_msg msg;
63	struct rb_node node;
64	struct dstat *next;
65};
66
/*
 * State of one optional output channel.
 */
struct output {
	/* file name from the command line; NULL when the channel is off */
	char *fn;

	/* stream opened for fn */
	FILE *fp;

	/* buffer handed to setvbuf() for fp */
	char *buf;

	/* non-zero when fn is "-" (stdout); writers then flush every record */
	int pipe;
};
73
74static char blkiomon_version[] = "0.3";
75
76static FILE *ifp;
77static int interval = -1;
78
79static struct trace *vacant_traces_list = NULL;
80static int vacant_traces = 0;
81
82#define TRACE_HASH_SIZE 128
83struct trace *thash[TRACE_HASH_SIZE] = {};
84
85static struct dstat *vacant_dstats_list = NULL;
86static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT };
87static struct dstat *dstat_list[2] = {};
88static int dstat_curr = 0;
89
90static struct output drvdata, human, binary, debug;
91
92static char *msg_q_name = NULL;
93static int msg_q_id = -1, msg_q = -1;
94static long msg_id = -1;
95
96static pthread_t interval_thread;
97static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER;
98
99int data_is_native = -1;
100
101static int up = 1;
102
103/* debugging */
104static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0;
105
106static void dump_bit(struct trace *t, const char *descr)
107{
108	struct blk_io_trace *bit = &t->bit;
109
110	if (!debug.fn)
111		return;
112
113	fprintf(debug.fp, "--- %s ---\n", descr);
114	fprintf(debug.fp, "magic    %16d\n", bit->magic);
115	fprintf(debug.fp, "sequence %16d\n", bit->sequence);
116	fprintf(debug.fp, "time     %16ld\n", (unsigned long)bit->time);
117	fprintf(debug.fp, "sector   %16ld\n", (unsigned long)bit->sector);
118	fprintf(debug.fp, "bytes    %16d\n", bit->bytes);
119	fprintf(debug.fp, "action   %16x\n", bit->action);
120	fprintf(debug.fp, "pid      %16d\n", bit->pid);
121	fprintf(debug.fp, "device   %16d\n", bit->device);
122	fprintf(debug.fp, "cpu      %16d\n", bit->cpu);
123	fprintf(debug.fp, "error    %16d\n", bit->error);
124	fprintf(debug.fp, "pdu_len  %16d\n", bit->pdu_len);
125
126	fprintf(debug.fp, "order    %16ld\n", t->sequence);
127}
128
129static void dump_bits(struct trace *t1, struct trace *t2, const char *descr)
130{
131	struct blk_io_trace *bit1 = &t1->bit;
132	struct blk_io_trace *bit2 = &t2->bit;
133
134	if (!debug.fn)
135		return;
136
137	fprintf(debug.fp, "--- %s ---\n", descr);
138	fprintf(debug.fp, "magic    %16d %16d\n", bit1->magic, bit2->magic);
139	fprintf(debug.fp, "sequence %16d %16d\n",
140		bit1->sequence, bit2->sequence);
141	fprintf(debug.fp, "time     %16ld %16ld\n",
142		(unsigned long)bit1->time, (unsigned long)bit2->time);
143	fprintf(debug.fp, "sector   %16ld %16ld\n",
144		(unsigned long)bit1->sector, (unsigned long)bit2->sector);
145	fprintf(debug.fp, "bytes    %16d %16d\n", bit1->bytes, bit2->bytes);
146	fprintf(debug.fp, "action   %16x %16x\n", bit1->action, bit2->action);
147	fprintf(debug.fp, "pid      %16d %16d\n", bit1->pid, bit2->pid);
148	fprintf(debug.fp, "device   %16d %16d\n", bit1->device, bit2->device);
149	fprintf(debug.fp, "cpu      %16d %16d\n", bit1->cpu, bit2->cpu);
150	fprintf(debug.fp, "error    %16d %16d\n", bit1->error, bit2->error);
151	fprintf(debug.fp, "pdu_len  %16d %16d\n", bit1->pdu_len, bit2->pdu_len);
152
153	fprintf(debug.fp, "order    %16ld %16ld\n", t1->sequence, t2->sequence);
154}
155
156static struct dstat *blkiomon_alloc_dstat(void)
157{
158	struct dstat *dstat;
159
160	if (vacant_dstats_list) {
161		dstat = vacant_dstats_list;
162		vacant_dstats_list = dstat->next;
163	} else
164		dstat = malloc(sizeof(*dstat));
165	if (!dstat) {
166		fprintf(stderr,
167			"blkiomon: could not allocate device statistic");
168		return NULL;
169	}
170
171	blkiomon_stat_init(&dstat->msg.stat);
172	return dstat;
173}
174
175static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device)
176{
177	struct rb_node **p = &(dstat_tree[dstat_curr].rb_node);
178	struct rb_node *parent = NULL;
179	struct dstat *dstat;
180
181	while (*p) {
182		parent = *p;
183
184		dstat = rb_entry(parent, struct dstat, node);
185
186		if (dstat->msg.stat.device < device)
187			p = &(*p)->rb_left;
188		else if (dstat->msg.stat.device > device)
189			p = &(*p)->rb_right;
190		else
191			return dstat;
192	}
193	search->node_ptr = p;
194	search->parent = parent;
195	return NULL;
196}
197
198static struct dstat *blkiomon_get_dstat(__u32 device)
199{
200	struct dstat *dstat;
201	struct rb_search search;
202
203	pthread_mutex_lock(&dstat_mutex);
204
205	dstat = blkiomon_find_dstat(&search, device);
206	if (dstat)
207		goto out;
208
209	dstat = blkiomon_alloc_dstat();
210	if (!dstat)
211		goto out;
212
213	dstat->msg.stat.device = device;
214
215	rb_link_node(&dstat->node, search.parent, search.node_ptr);
216	rb_insert_color(&dstat->node, &dstat_tree[dstat_curr]);
217
218	dstat->next = dstat_list[dstat_curr];
219	dstat_list[dstat_curr] = dstat;
220
221out:
222	pthread_mutex_unlock(&dstat_mutex);
223	return dstat;
224}
225
226static int blkiomon_output_msg_q(struct dstat *dstat)
227{
228	if (!msg_q_name)
229		return 0;
230
231	dstat->msg.mtype = msg_id;
232	return msgsnd(msg_q, &dstat->msg, sizeof(struct blkiomon_stat), 0);
233}
234
235static int blkiomon_output_binary(struct dstat *dstat)
236{
237	struct blkiomon_stat *p = &dstat->msg.stat;
238
239	if (!binary.fn)
240		return 0;
241
242	if (fwrite(p, sizeof(*p), 1, binary.fp) != 1)
243		goto failed;
244	if (binary.pipe && fflush(binary.fp))
245		goto failed;
246	return 0;
247
248failed:
249	fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn);
250	fclose(binary.fp);
251	binary.fn = NULL;
252	return 1;
253}
254
255static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts)
256{
257	struct dstat *dstat, *tail = NULL;
258
259	for (dstat = head; dstat; dstat = dstat->next) {
260		dstat->msg.stat.time = ts->tv_sec;
261		blkiomon_stat_print(human.fp, &dstat->msg.stat);
262		blkiomon_stat_to_be(&dstat->msg.stat);
263		blkiomon_output_binary(dstat);
264		blkiomon_output_msg_q(dstat);
265		tail = dstat;
266	}
267	return tail;
268}
269
270static void *blkiomon_interval(void *data)
271{
272	struct timespec wake, r;
273	struct dstat *head, *tail;
274	int finished;
275
276	clock_gettime(CLOCK_REALTIME, &wake);
277
278	while (1) {
279		wake.tv_sec += interval;
280		if (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wake, &r)) {
281			fprintf(stderr, "blkiomon: interrupted sleep");
282			continue;
283		}
284
285		/* grab tree and make data gatherer build up another tree */
286		pthread_mutex_lock(&dstat_mutex);
287		finished = dstat_curr;
288		dstat_curr = dstat_curr ? 0 : 1;
289		pthread_mutex_unlock(&dstat_mutex);
290
291		head = dstat_list[finished];
292		if (!head)
293			continue;
294		dstat_list[finished] = NULL;
295		dstat_tree[finished] = RB_ROOT;
296		tail = blkiomon_output(head, &wake);
297
298		pthread_mutex_lock(&dstat_mutex);
299		tail->next = vacant_dstats_list;
300		vacant_dstats_list = head;
301		pthread_mutex_unlock(&dstat_mutex);
302	}
303	return data;
304}
305
306#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
307
308static int blkiomon_account(struct blk_io_trace *bit_d,
309			    struct blk_io_trace *bit_c)
310{
311	struct dstat *dstat;
312	struct blkiomon_stat *p;
313	__u64 d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */
314	__u32 size = bit_d->bytes;
315	__u64 thrput = size * 1000 / d2c;
316
317	dstat = blkiomon_get_dstat(bit_d->device);
318	if (!dstat)
319		return 1;
320	p = &dstat->msg.stat;
321
322	if (BLK_DATADIR(bit_c->action) & BLK_TC_READ) {
323		minmax_account(&p->thrput_r, thrput);
324		minmax_account(&p->size_r, size);
325		minmax_account(&p->d2c_r, d2c);
326	} else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE) {
327		minmax_account(&p->thrput_w, thrput);
328		minmax_account(&p->size_w, size);
329		minmax_account(&p->d2c_w, d2c);
330	} else
331		p->bidir++;
332
333	histlog2_account(p->size_hist, size, &size_hist);
334	histlog2_account(p->d2c_hist, d2c, &d2c_hist);
335	return 0;
336}
337
338static struct trace *blkiomon_alloc_trace(void)
339{
340	struct trace *t = vacant_traces_list;
341	if (t) {
342		vacant_traces_list = t->next;
343		vacant_traces--;
344	} else
345		t = malloc(sizeof(*t));
346	memset(t, 0, sizeof(*t));
347	return t;
348}
349
350static void blkiomon_free_trace(struct trace *t)
351{
352	if (vacant_traces < 256) {
353		t->next = vacant_traces_list;
354		vacant_traces_list = t;
355		vacant_traces++;
356	} else
357		free(t);
358}
359
360static int action(int a)
361{
362	int bits = BLK_TC_WRITE | BLK_TC_READ | BLK_TC_FS | BLK_TC_PC;
363	return a & (BLK_TC_ACT(bits));
364}
365
366static void blkiomon_store_trace(struct trace *t)
367{
368	int i = t->bit.sector % TRACE_HASH_SIZE;
369
370	t->next = thash[i];
371	thash[i] = t;
372}
373
374static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit)
375{
376	int i = bit->sector % TRACE_HASH_SIZE;
377	struct trace *t, *prev = NULL;
378
379	for (t = thash[i]; t; t = t->next) {
380		if (t->bit.device == bit->device &&
381		    t->bit.sector == bit->sector &&
382		    action(t->bit.action) == action(bit->action)) {
383			if (prev)
384				prev->next = t->next;
385			else
386				thash[i] = t->next;
387			return t;
388		}
389		prev = t;
390	}
391	return NULL;
392}
393
394static struct trace *blkiomon_do_trace(struct trace *t)
395{
396	struct trace *t_stored, *t_old, *t_young;
397
398	/* store trace if there is no match yet */
399	t_stored = blkiomon_fetch_trace(&t->bit);
400	if (!t_stored) {
401		blkiomon_store_trace(t);
402		return blkiomon_alloc_trace();
403	}
404
405	/* figure out older trace and younger trace */
406	if (t_stored->bit.time < t->bit.time) {
407		t_old = t_stored;
408		t_young = t;
409	} else {
410		t_old = t;
411		t_young = t_stored;
412	}
413
414	/* we need an older D trace and a younger C trace */
415	if (t_old->bit.action & BLK_TC_ACT(BLK_TC_ISSUE) &&
416	    t_young->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE)) {
417		/* matching D and C traces - update statistics */
418		match++;
419		blkiomon_account(&t_old->bit, &t_young->bit);
420		blkiomon_free_trace(t_stored);
421		return t;
422	}
423
424	/* no matching D and C traces - keep more recent trace */
425	dump_bits(t_old, t_young, "mismatch");
426	mismatch++;
427	blkiomon_store_trace(t_young);
428	return t_old;
429}
430
431static int blkiomon_dump_drvdata(struct blk_io_trace *bit, void *pdu_buf)
432{
433	if (!drvdata.fn)
434		return 0;
435
436	if (fwrite(bit, sizeof(*bit), 1, drvdata.fp) != 1)
437		goto failed;
438	if (fwrite(pdu_buf, bit->pdu_len, 1, drvdata.fp) != 1)
439		goto failed;
440	if (drvdata.pipe && fflush(drvdata.fp))
441		goto failed;
442	return 0;
443
444failed:
445	fprintf(stderr, "blkiomon: could not write to %s\n", drvdata.fn);
446	fclose(drvdata.fp);
447	drvdata.fn = NULL;
448	return 1;
449}
450
451static int blkiomon_do_fifo(void)
452{
453	struct trace *t;
454	struct blk_io_trace *bit;
455	void *pdu_buf = NULL;
456
457	t = blkiomon_alloc_trace();
458	if (!t)
459		return 1;
460	bit = &t->bit;
461
462	while (up) {
463		if (fread(bit, sizeof(*bit), 1, ifp) != 1) {
464			if (!feof(ifp))
465				fprintf(stderr,
466					"blkiomon: could not read trace");
467			break;
468		}
469		if (ferror(ifp)) {
470			clearerr(ifp);
471			fprintf(stderr, "blkiomon: error while reading trace");
472			break;
473		}
474
475		if (data_is_native == -1 && check_data_endianness(bit->magic)) {
476			fprintf(stderr, "blkiomon: endianess problem\n");
477			break;
478		}
479
480		/* endianess */
481		trace_to_cpu(bit);
482		if (verify_trace(bit)) {
483			fprintf(stderr, "blkiomon: bad trace\n");
484			break;
485		}
486
487		/* read additional trace payload */
488		if (bit->pdu_len) {
489			pdu_buf = realloc(pdu_buf, bit->pdu_len);
490			if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) {
491				clearerr(ifp);
492				fprintf(stderr, "blkiomon: could not read payload\n");
493				break;
494			}
495		}
496
497		t->sequence = sequence++;
498
499		/* forward low-level device driver trace to other tool */
500		if (bit->action & BLK_TC_ACT(BLK_TC_DRV_DATA)) {
501			driverdata++;
502			if (blkiomon_dump_drvdata(bit, pdu_buf)) {
503				fprintf(stderr, "blkiomon: could not send trace\n");
504				break;
505			}
506			continue;
507		}
508
509		if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE)))
510			continue;
511
512		/* try to find matching trace and update statistics */
513		t = blkiomon_do_trace(t);
514		if (!t) {
515			fprintf(stderr, "blkiomon: could not alloc trace\n");
516			break;
517		}
518		bit = &t->bit;
519		/* t and bit will be recycled for next incoming trace */
520	}
521	blkiomon_free_trace(t);
522	free(pdu_buf);
523	return 0;
524}
525
526static int blkiomon_open_output(struct output *out)
527{
528	int mode, vbuf_size;
529
530	if (!out->fn)
531		return 0;
532
533	if (!strcmp(out->fn, "-")) {
534		out->fp = fdopen(STDOUT_FILENO, "w");
535		mode = _IOLBF;
536		vbuf_size = 4096;
537		out->pipe = 1;
538	} else {
539		out->fp = fopen(out->fn, "w");
540		mode = _IOFBF;
541		vbuf_size = 128 * 1024;
542		out->pipe = 0;
543	}
544	if (!out->fp)
545		goto failed;
546	out->buf = malloc(128 * 1024);
547	if (setvbuf(out->fp, out->buf, mode, vbuf_size))
548		goto failed;
549	return 0;
550
551failed:
552	fprintf(stderr, "blkiomon: could not write to %s\n", out->fn);
553	out->fn = NULL;
554	free(out->buf);
555	return 1;
556}
557
558static int blkiomon_open_msg_q(void)
559{
560	key_t key;
561
562	if (!msg_q_name)
563		return 0;
564	if (!msg_q_id || msg_id <= 0)
565		return 1;
566	key = ftok(msg_q_name, msg_q_id);
567	if (key == -1)
568		return 1;
569	while (up) {
570		msg_q = msgget(key, S_IRWXU);
571		if (msg_q >= 0)
572			break;
573	}
574	return (msg_q >= 0 ? 0 : -1);
575}
576
577static void blkiomon_debug(void)
578{
579	int i;
580	struct trace *t;
581
582	if (!debug.fn)
583		return;
584
585	for (i = 0; i < TRACE_HASH_SIZE; i++)
586		for (t = thash[i]; t; t = t->next) {
587			dump_bit(t, "leftover");
588			leftover++;
589		}
590
591	fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, "
592		"%ld driverdata, %ld overall\n",
593		leftover, match, mismatch, driverdata, sequence);
594}
595
/* short options accepted by getopt_long(); all but -V take an argument */
#define S_OPTS "b:d:D:h:I:Q:q:m:V"

/*
 * Usage text, printed after a "Usage: " prefix.  The synopsis lists every
 * option accepted in S_OPTS, including -d.
 */
static char usage_str[] = "\n\nblkiomon "
	"-I <interval>       | --interval=<interval>\n"
	"[ -h <file>         | --human-readable=<file> ]\n"
	"[ -b <file>         | --binary=<file> ]\n"
	"[ -d <file>         | --dump-lldd=<file> ]\n"
	"[ -D <file>         | --debug=<file> ]\n"
	"[ -Q <path name>    | --msg-queue=<path name>]\n"
	"[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n"
	"[ -m <msg id>       | --msg-id=<msg id>]\n"
	"[ -V                | --version ]\n\n"
	"\t-I   Sample interval.\n"
	"\t-h   Human-readable output file.\n"
	"\t-b   Binary output file.\n"
	"\t-d   Output file for data emitted by low level device driver.\n"
	"\t-D   Output file for debugging data.\n"
	"\t-Qqm Output to message queue using given ID for messages.\n"
	"\t-V   Print program version.\n\n";
614
/*
 * Long options for getopt_long().  Every entry maps onto the short option
 * of the same meaning, so both forms share one switch in main().
 * Fields are, in order: name, has_arg, flag, val.
 */
static struct option l_opts[] = {
	{ "human-readable", required_argument, NULL, 'h' },
	{ "binary",         required_argument, NULL, 'b' },
	{ "dump-lldd",      required_argument, NULL, 'd' },
	{ "debug",          required_argument, NULL, 'D' },
	{ "interval",       required_argument, NULL, 'I' },
	{ "msg-queue",      required_argument, NULL, 'Q' },
	{ "msg-queue-id",   required_argument, NULL, 'q' },
	{ "msg-id",         required_argument, NULL, 'm' },
	{ "version",        no_argument,       NULL, 'V' },
	{ NULL,             0,                 NULL, 0   }	/* end marker */
};
674
675static void blkiomon_signal(int signal)
676{
677	fprintf(stderr, "blkiomon: terminated by signal\n");
678	up = signal & 0;
679}
680
681int main(int argc, char *argv[])
682{
683	int c;
684
685	signal(SIGALRM, blkiomon_signal);
686	signal(SIGINT, blkiomon_signal);
687	signal(SIGTERM, blkiomon_signal);
688	signal(SIGQUIT, blkiomon_signal);
689
690	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
691		switch (c) {
692		case 'h':
693			human.fn = optarg;
694			break;
695		case 'b':
696			binary.fn = optarg;
697			break;
698		case 'd':
699			drvdata.fn = optarg;
700			break;
701		case 'D':
702			debug.fn = optarg;
703			break;
704		case 'I':
705			interval = atoi(optarg);
706			break;
707		case 'Q':
708			msg_q_name = optarg;
709			break;
710		case 'q':
711			msg_q_id = atoi(optarg);
712			break;
713		case 'm':
714			msg_id = atoi(optarg);
715			break;
716		case 'V':
717			printf("%s version %s\n", argv[0], blkiomon_version);
718			return 0;
719		default:
720			fprintf(stderr, "Usage: %s", usage_str);
721			return 1;
722		}
723	}
724
725	if (interval <= 0) {
726		fprintf(stderr, "Usage: %s", usage_str);
727		return 1;
728	}
729
730	ifp = fdopen(STDIN_FILENO, "r");
731	if (!ifp) {
732		perror("blkiomon: could not open stdin for reading");
733		return 1;
734	}
735
736	if (blkiomon_open_output(&human))
737		return 1;
738	if (blkiomon_open_output(&binary))
739		return 1;
740	if (blkiomon_open_output(&drvdata))
741		return 1;
742	if (blkiomon_open_output(&debug))
743		return 1;
744	if (blkiomon_open_msg_q())
745		return 1;
746
747	if (pthread_create(&interval_thread, NULL, blkiomon_interval, NULL)) {
748		fprintf(stderr, "blkiomon: could not create thread");
749		return 1;
750	}
751
752	blkiomon_do_fifo();
753
754	blkiomon_debug();
755	return 0;
756}
757