1/*
2 * Code related to writing an iolog of what a thread is doing, and to
3 * later read that back and replay
4 */
5#include <stdio.h>
6#include <stdlib.h>
7#include <libgen.h>
8#include <assert.h>
9#include "flist.h"
10#include "fio.h"
11#include "verify.h"
12#include "trim.h"
13#include "filelock.h"
14
15static const char iolog_ver2[] = "fio version 2 iolog";
16
17void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
18{
19	flist_add_tail(&ipo->list, &td->io_log_list);
20	td->total_io_size += ipo->len;
21}
22
23void log_io_u(struct thread_data *td, struct io_u *io_u)
24{
25	const char *act[] = { "read", "write", "sync", "datasync",
26				"sync_file_range", "wait", "trim" };
27
28	assert(io_u->ddir <= 6);
29
30	if (!td->o.write_iolog_file)
31		return;
32
33	fprintf(td->iolog_f, "%s %s %llu %lu\n", io_u->file->file_name,
34						act[io_u->ddir], io_u->offset,
35						io_u->buflen);
36}
37
38void log_file(struct thread_data *td, struct fio_file *f,
39	      enum file_log_act what)
40{
41	const char *act[] = { "add", "open", "close" };
42
43	assert(what < 3);
44
45	if (!td->o.write_iolog_file)
46		return;
47
48
49	/*
50	 * this happens on the pre-open/close done before the job starts
51	 */
52	if (!td->iolog_f)
53		return;
54
55	fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]);
56}
57
58static void iolog_delay(struct thread_data *td, unsigned long delay)
59{
60	unsigned long usec = utime_since_now(&td->last_issue);
61	unsigned long this_delay;
62
63	if (delay < usec)
64		return;
65
66	delay -= usec;
67
68	/*
69	 * less than 100 usec delay, just regard it as noise
70	 */
71	if (delay < 100)
72		return;
73
74	while (delay && !td->terminate) {
75		this_delay = delay;
76		if (this_delay > 500000)
77			this_delay = 500000;
78
79		usec_sleep(td, this_delay);
80		delay -= this_delay;
81	}
82}
83
84static int ipo_special(struct thread_data *td, struct io_piece *ipo)
85{
86	struct fio_file *f;
87	int ret;
88
89	/*
90	 * Not a special ipo
91	 */
92	if (ipo->ddir != DDIR_INVAL)
93		return 0;
94
95	f = td->files[ipo->fileno];
96
97	switch (ipo->file_action) {
98	case FIO_LOG_OPEN_FILE:
99		ret = td_io_open_file(td, f);
100		if (!ret)
101			break;
102		td_verror(td, ret, "iolog open file");
103		return -1;
104	case FIO_LOG_CLOSE_FILE:
105		td_io_close_file(td, f);
106		break;
107	case FIO_LOG_UNLINK_FILE:
108		unlink(f->file_name);
109		break;
110	default:
111		log_err("fio: bad file action %d\n", ipo->file_action);
112		break;
113	}
114
115	return 1;
116}
117
118int read_iolog_get(struct thread_data *td, struct io_u *io_u)
119{
120	struct io_piece *ipo;
121	unsigned long elapsed;
122
123	while (!flist_empty(&td->io_log_list)) {
124		int ret;
125
126		ipo = flist_entry(td->io_log_list.next, struct io_piece, list);
127		flist_del(&ipo->list);
128		remove_trim_entry(td, ipo);
129
130		ret = ipo_special(td, ipo);
131		if (ret < 0) {
132			free(ipo);
133			break;
134		} else if (ret > 0) {
135			free(ipo);
136			continue;
137		}
138
139		io_u->ddir = ipo->ddir;
140		if (ipo->ddir != DDIR_WAIT) {
141			io_u->offset = ipo->offset;
142			io_u->buflen = ipo->len;
143			io_u->file = td->files[ipo->fileno];
144			get_file(io_u->file);
145			dprint(FD_IO, "iolog: get %llu/%lu/%s\n", io_u->offset,
146						io_u->buflen, io_u->file->file_name);
147			if (ipo->delay)
148				iolog_delay(td, ipo->delay);
149		} else {
150			elapsed = mtime_since_genesis();
151			if (ipo->delay > elapsed)
152				usec_sleep(td, (ipo->delay - elapsed) * 1000);
153		}
154
155		free(ipo);
156
157		if (io_u->ddir != DDIR_WAIT)
158			return 0;
159	}
160
161	td->done = 1;
162	return 1;
163}
164
165void prune_io_piece_log(struct thread_data *td)
166{
167	struct io_piece *ipo;
168	struct rb_node *n;
169
170	while ((n = rb_first(&td->io_hist_tree)) != NULL) {
171		ipo = rb_entry(n, struct io_piece, rb_node);
172		rb_erase(n, &td->io_hist_tree);
173		remove_trim_entry(td, ipo);
174		td->io_hist_len--;
175		free(ipo);
176	}
177
178	while (!flist_empty(&td->io_hist_list)) {
179		ipo = flist_entry(td->io_hist_list.next, struct io_piece, list);
180		flist_del(&ipo->list);
181		remove_trim_entry(td, ipo);
182		td->io_hist_len--;
183		free(ipo);
184	}
185}
186
187/*
188 * log a successful write, so we can unwind the log for verify
189 */
190void log_io_piece(struct thread_data *td, struct io_u *io_u)
191{
192	struct rb_node **p, *parent;
193	struct io_piece *ipo, *__ipo;
194
195	ipo = malloc(sizeof(struct io_piece));
196	init_ipo(ipo);
197	ipo->file = io_u->file;
198	ipo->offset = io_u->offset;
199	ipo->len = io_u->buflen;
200	ipo->numberio = io_u->numberio;
201	ipo->flags = IP_F_IN_FLIGHT;
202
203	io_u->ipo = ipo;
204
205	if (io_u_should_trim(td, io_u)) {
206		flist_add_tail(&ipo->trim_list, &td->trim_list);
207		td->trim_entries++;
208	}
209
210	/*
211	 * We don't need to sort the entries, if:
212	 *
213	 *	Sequential writes, or
214	 *	Random writes that lay out the file as it goes along
215	 *
216	 * For both these cases, just reading back data in the order we
217	 * wrote it out is the fastest.
218	 *
219	 * One exception is if we don't have a random map AND we are doing
220	 * verifies, in that case we need to check for duplicate blocks and
221	 * drop the old one, which we rely on the rb insert/lookup for
222	 * handling.
223	 */
224	if (((!td->o.verifysort) || !td_random(td) || !td->o.overwrite) &&
225	      (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) {
226		INIT_FLIST_HEAD(&ipo->list);
227		flist_add_tail(&ipo->list, &td->io_hist_list);
228		ipo->flags |= IP_F_ONLIST;
229		td->io_hist_len++;
230		return;
231	}
232
233	RB_CLEAR_NODE(&ipo->rb_node);
234
235	/*
236	 * Sort the entry into the verification list
237	 */
238restart:
239	p = &td->io_hist_tree.rb_node;
240	parent = NULL;
241	while (*p) {
242		parent = *p;
243
244		__ipo = rb_entry(parent, struct io_piece, rb_node);
245		if (ipo->file < __ipo->file)
246			p = &(*p)->rb_left;
247		else if (ipo->file > __ipo->file)
248			p = &(*p)->rb_right;
249		else if (ipo->offset < __ipo->offset)
250			p = &(*p)->rb_left;
251		else if (ipo->offset > __ipo->offset)
252			p = &(*p)->rb_right;
253		else {
254			dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu",
255				__ipo->offset, __ipo->len,
256				ipo->offset, ipo->len);
257			td->io_hist_len--;
258			rb_erase(parent, &td->io_hist_tree);
259			remove_trim_entry(td, __ipo);
260			free(__ipo);
261			goto restart;
262		}
263	}
264
265	rb_link_node(&ipo->rb_node, parent, p);
266	rb_insert_color(&ipo->rb_node, &td->io_hist_tree);
267	ipo->flags |= IP_F_ONRB;
268	td->io_hist_len++;
269}
270
271void unlog_io_piece(struct thread_data *td, struct io_u *io_u)
272{
273	struct io_piece *ipo = io_u->ipo;
274
275	if (!ipo)
276		return;
277
278	if (ipo->flags & IP_F_ONRB)
279		rb_erase(&ipo->rb_node, &td->io_hist_tree);
280	else if (ipo->flags & IP_F_ONLIST)
281		flist_del(&ipo->list);
282
283	free(ipo);
284	io_u->ipo = NULL;
285	td->io_hist_len--;
286}
287
288void trim_io_piece(struct thread_data *td, struct io_u *io_u)
289{
290	struct io_piece *ipo = io_u->ipo;
291
292	if (!ipo)
293		return;
294
295	ipo->len = io_u->xfer_buflen - io_u->resid;
296}
297
298void write_iolog_close(struct thread_data *td)
299{
300	fflush(td->iolog_f);
301	fclose(td->iolog_f);
302	free(td->iolog_buf);
303	td->iolog_f = NULL;
304	td->iolog_buf = NULL;
305}
306
307/*
308 * Read version 2 iolog data. It is enhanced to include per-file logging,
309 * syncs, etc.
310 */
311static int read_iolog2(struct thread_data *td, FILE *f)
312{
313	unsigned long long offset;
314	unsigned int bytes;
315	int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */
316	char *fname, *act;
317	char *str, *p;
318	enum fio_ddir rw;
319
320	free_release_files(td);
321
322	/*
323	 * Read in the read iolog and store it, reuse the infrastructure
324	 * for doing verifications.
325	 */
326	str = malloc(4096);
327	fname = malloc(256+16);
328	act = malloc(256+16);
329
330	reads = writes = waits = 0;
331	while ((p = fgets(str, 4096, f)) != NULL) {
332		struct io_piece *ipo;
333		int r;
334
335		r = sscanf(p, "%256s %256s %llu %u", fname, act, &offset,
336									&bytes);
337		if (r == 4) {
338			/*
339			 * Check action first
340			 */
341			if (!strcmp(act, "wait"))
342				rw = DDIR_WAIT;
343			else if (!strcmp(act, "read"))
344				rw = DDIR_READ;
345			else if (!strcmp(act, "write"))
346				rw = DDIR_WRITE;
347			else if (!strcmp(act, "sync"))
348				rw = DDIR_SYNC;
349			else if (!strcmp(act, "datasync"))
350				rw = DDIR_DATASYNC;
351			else if (!strcmp(act, "trim"))
352				rw = DDIR_TRIM;
353			else {
354				log_err("fio: bad iolog file action: %s\n",
355									act);
356				continue;
357			}
358			fileno = get_fileno(td, fname);
359		} else if (r == 2) {
360			rw = DDIR_INVAL;
361			if (!strcmp(act, "add")) {
362				fileno = add_file(td, fname, 0, 1);
363				file_action = FIO_LOG_ADD_FILE;
364				continue;
365			} else if (!strcmp(act, "open")) {
366				fileno = get_fileno(td, fname);
367				file_action = FIO_LOG_OPEN_FILE;
368			} else if (!strcmp(act, "close")) {
369				fileno = get_fileno(td, fname);
370				file_action = FIO_LOG_CLOSE_FILE;
371			} else {
372				log_err("fio: bad iolog file action: %s\n",
373									act);
374				continue;
375			}
376		} else {
377			log_err("bad iolog2: %s", p);
378			continue;
379		}
380
381		if (rw == DDIR_READ)
382			reads++;
383		else if (rw == DDIR_WRITE) {
384			/*
385			 * Don't add a write for ro mode
386			 */
387			if (read_only)
388				continue;
389			writes++;
390		} else if (rw == DDIR_WAIT) {
391			waits++;
392		} else if (rw == DDIR_INVAL) {
393		} else if (!ddir_sync(rw)) {
394			log_err("bad ddir: %d\n", rw);
395			continue;
396		}
397
398		/*
399		 * Make note of file
400		 */
401		ipo = malloc(sizeof(*ipo));
402		init_ipo(ipo);
403		ipo->ddir = rw;
404		if (rw == DDIR_WAIT) {
405			ipo->delay = offset;
406		} else {
407			ipo->offset = offset;
408			ipo->len = bytes;
409			if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw])
410				td->o.max_bs[rw] = bytes;
411			ipo->fileno = fileno;
412			ipo->file_action = file_action;
413		}
414
415		queue_io_piece(td, ipo);
416	}
417
418	free(str);
419	free(act);
420	free(fname);
421
422	if (writes && read_only) {
423		log_err("fio: <%s> skips replay of %d writes due to"
424			" read-only\n", td->o.name, writes);
425		writes = 0;
426	}
427
428	if (!reads && !writes && !waits)
429		return 1;
430	else if (reads && !writes)
431		td->o.td_ddir = TD_DDIR_READ;
432	else if (!reads && writes)
433		td->o.td_ddir = TD_DDIR_WRITE;
434	else
435		td->o.td_ddir = TD_DDIR_RW;
436
437	return 0;
438}
439
440/*
441 * open iolog, check version, and call appropriate parser
442 */
443static int init_iolog_read(struct thread_data *td)
444{
445	char buffer[256], *p;
446	FILE *f;
447	int ret;
448
449	f = fopen(td->o.read_iolog_file, "r");
450	if (!f) {
451		perror("fopen read iolog");
452		return 1;
453	}
454
455	p = fgets(buffer, sizeof(buffer), f);
456	if (!p) {
457		td_verror(td, errno, "iolog read");
458		log_err("fio: unable to read iolog\n");
459		fclose(f);
460		return 1;
461	}
462
463	/*
464	 * version 2 of the iolog stores a specific string as the
465	 * first line, check for that
466	 */
467	if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2)))
468		ret = read_iolog2(td, f);
469	else {
470		log_err("fio: iolog version 1 is no longer supported\n");
471		ret = 1;
472	}
473
474	fclose(f);
475	return ret;
476}
477
478/*
479 * Set up a log for storing io patterns.
480 */
481static int init_iolog_write(struct thread_data *td)
482{
483	struct fio_file *ff;
484	FILE *f;
485	unsigned int i;
486
487	f = fopen(td->o.write_iolog_file, "a");
488	if (!f) {
489		perror("fopen write iolog");
490		return 1;
491	}
492
493	/*
494	 * That's it for writing, setup a log buffer and we're done.
495	  */
496	td->iolog_f = f;
497	td->iolog_buf = malloc(8192);
498	setvbuf(f, td->iolog_buf, _IOFBF, 8192);
499
500	/*
501	 * write our version line
502	 */
503	if (fprintf(f, "%s\n", iolog_ver2) < 0) {
504		perror("iolog init\n");
505		return 1;
506	}
507
508	/*
509	 * add all known files
510	 */
511	for_each_file(td, ff, i)
512		log_file(td, ff, FIO_LOG_ADD_FILE);
513
514	return 0;
515}
516
517int init_iolog(struct thread_data *td)
518{
519	int ret = 0;
520
521	if (td->o.read_iolog_file) {
522		int need_swap;
523
524		/*
525		 * Check if it's a blktrace file and load that if possible.
526		 * Otherwise assume it's a normal log file and load that.
527		 */
528		if (is_blktrace(td->o.read_iolog_file, &need_swap))
529			ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
530		else
531			ret = init_iolog_read(td);
532	} else if (td->o.write_iolog_file)
533		ret = init_iolog_write(td);
534
535	if (ret)
536		td_verror(td, EINVAL, "failed initializing iolog");
537
538	return ret;
539}
540
541void setup_log(struct io_log **log, unsigned long avg_msec, int log_type)
542{
543	struct io_log *l = malloc(sizeof(*l));
544
545	memset(l, 0, sizeof(*l));
546	l->nr_samples = 0;
547	l->max_samples = 1024;
548	l->log_type = log_type;
549	l->log = malloc(l->max_samples * sizeof(struct io_sample));
550	l->avg_msec = avg_msec;
551	*log = l;
552}
553
554#ifdef CONFIG_SETVBUF
555static void *set_file_buffer(FILE *f)
556{
557	size_t size = 1048576;
558	void *buf;
559
560	buf = malloc(size);
561	setvbuf(f, buf, _IOFBF, size);
562	return buf;
563}
564
565static void clear_file_buffer(void *buf)
566{
567	free(buf);
568}
569#else
570static void *set_file_buffer(FILE *f)
571{
572	return NULL;
573}
574
575static void clear_file_buffer(void *buf)
576{
577}
578#endif
579
580void __finish_log(struct io_log *log, const char *name)
581{
582	unsigned int i;
583	void *buf;
584	FILE *f;
585
586	f = fopen(name, "a");
587	if (!f) {
588		perror("fopen log");
589		return;
590	}
591
592	buf = set_file_buffer(f);
593
594	for (i = 0; i < log->nr_samples; i++) {
595		fprintf(f, "%lu, %lu, %u, %u\n",
596				(unsigned long) log->log[i].time,
597				(unsigned long) log->log[i].val,
598				log->log[i].ddir, log->log[i].bs);
599	}
600
601	fclose(f);
602	clear_file_buffer(buf);
603	free(log->log);
604	free(log);
605}
606
607static int finish_log_named(struct thread_data *td, struct io_log *log,
608			    const char *prefix, const char *postfix,
609			    int trylock)
610{
611	char file_name[256];
612
613	snprintf(file_name, sizeof(file_name), "%s_%s.log", prefix, postfix);
614
615	if (trylock) {
616		if (fio_trylock_file(file_name))
617			return 1;
618	} else
619		fio_lock_file(file_name);
620
621	if (td->client_type == FIO_CLIENT_TYPE_GUI) {
622		fio_send_iolog(td, log, file_name);
623		free(log->log);
624		free(log);
625	} else
626		__finish_log(log, file_name);
627
628	fio_unlock_file(file_name);
629	return 0;
630}
631
632static int finish_log(struct thread_data *td, struct io_log *log,
633		      const char *name, int trylock)
634{
635	return finish_log_named(td, log, td->o.name, name, trylock);
636}
637
638static int write_this_log(struct thread_data *td, struct io_log *log,
639			  const char *log_file, const char *name, int try)
640{
641	int ret;
642
643	if (!log)
644		return 0;
645
646	if (log_file)
647		ret = finish_log_named(td, log, log_file, name, try);
648	else
649		ret = finish_log(td, log, name, try);
650
651	return ret;
652}
653
654static int write_iops_log(struct thread_data *td, int try)
655{
656	struct thread_options *o = &td->o;
657
658	return write_this_log(td, td->iops_log, o->iops_log_file, "iops", try);
659}
660
661static int write_slat_log(struct thread_data *td, int try)
662{
663	struct thread_options *o = &td->o;
664
665	return write_this_log(td, td->slat_log, o->lat_log_file, "slat", try);
666}
667
668static int write_clat_log(struct thread_data *td, int try)
669{
670	struct thread_options *o = &td->o;
671
672	return write_this_log(td, td->clat_log, o->lat_log_file, "clat" , try);
673}
674
675static int write_lat_log(struct thread_data *td, int try)
676{
677	struct thread_options *o = &td->o;
678
679	return write_this_log(td, td->lat_log, o->lat_log_file, "lat", try);
680}
681
682static int write_bandw_log(struct thread_data *td, int try)
683{
684	struct thread_options *o = &td->o;
685
686	return write_this_log(td, td->bw_log, o->bw_log_file, "bw", try);
687}
688
689enum {
690	BW_LOG_MASK	= 1,
691	LAT_LOG_MASK	= 2,
692	SLAT_LOG_MASK	= 4,
693	CLAT_LOG_MASK	= 8,
694	IOPS_LOG_MASK	= 16,
695
696	ALL_LOG_NR	= 5,
697};
698
699struct log_type {
700	unsigned int mask;
701	int (*fn)(struct thread_data *, int);
702};
703
704static struct log_type log_types[] = {
705	{
706		.mask	= BW_LOG_MASK,
707		.fn	= write_bandw_log,
708	},
709	{
710		.mask	= LAT_LOG_MASK,
711		.fn	= write_lat_log,
712	},
713	{
714		.mask	= SLAT_LOG_MASK,
715		.fn	= write_slat_log,
716	},
717	{
718		.mask	= CLAT_LOG_MASK,
719		.fn	= write_clat_log,
720	},
721	{
722		.mask	= IOPS_LOG_MASK,
723		.fn	= write_iops_log,
724	},
725};
726
727void fio_writeout_logs(struct thread_data *td)
728{
729	unsigned int log_mask = 0;
730	unsigned int log_left = ALL_LOG_NR;
731	int old_state, i;
732
733	old_state = td_bump_runstate(td, TD_FINISHING);
734
735	finalize_logs(td);
736
737	while (log_left) {
738		int prev_log_left = log_left;
739
740		for (i = 0; i < ALL_LOG_NR && log_left; i++) {
741			struct log_type *lt = &log_types[i];
742			int ret;
743
744			if (!(log_mask & lt->mask)) {
745				ret = lt->fn(td, log_left != 1);
746				if (!ret) {
747					log_left--;
748					log_mask |= lt->mask;
749				}
750			}
751		}
752
753		if (prev_log_left == log_left)
754			usleep(5000);
755	}
756
757	td_restore_runstate(td, old_state);
758}
759