1/* 2 * blktrace support code for fio 3 */ 4#include <stdio.h> 5#include <stdlib.h> 6#include <sys/stat.h> 7#include <dirent.h> 8 9#include "flist.h" 10#include "fio.h" 11#include "blktrace_api.h" 12 13#define TRACE_FIFO_SIZE 8192 14 15/* 16 * fifo refill frontend, to avoid reading data in trace sized bites 17 */ 18static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd) 19{ 20 char buf[TRACE_FIFO_SIZE]; 21 unsigned int total; 22 int ret; 23 24 total = sizeof(buf); 25 if (total > fifo_room(fifo)) 26 total = fifo_room(fifo); 27 28 ret = read(fd, buf, total); 29 if (ret < 0) { 30 td_verror(td, errno, "read blktrace file"); 31 return -1; 32 } 33 34 if (ret > 0) 35 ret = fifo_put(fifo, buf, ret); 36 37 dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret); 38 return ret; 39} 40 41/* 42 * Retrieve 'len' bytes from the fifo, refilling if necessary. 43 */ 44static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd, 45 void *buf, unsigned int len) 46{ 47 if (fifo_len(fifo) < len) { 48 int ret = refill_fifo(td, fifo, fd); 49 50 if (ret < 0) 51 return ret; 52 } 53 54 return fifo_get(fifo, buf, len); 55} 56 57/* 58 * Just discard the pdu by seeking past it. 59 */ 60static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd, 61 struct blk_io_trace *t) 62{ 63 if (t->pdu_len == 0) 64 return 0; 65 66 dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len); 67 return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len); 68} 69 70/* 71 * Check if this is a blktrace binary data file. We read a single trace 72 * into memory and check for the magic signature. 
73 */ 74int is_blktrace(const char *filename, int *need_swap) 75{ 76 struct blk_io_trace t; 77 int fd, ret; 78 79 fd = open(filename, O_RDONLY); 80 if (fd < 0) 81 return 0; 82 83 ret = read(fd, &t, sizeof(t)); 84 close(fd); 85 86 if (ret < 0) { 87 perror("read blktrace"); 88 return 0; 89 } else if (ret != sizeof(t)) { 90 log_err("fio: short read on blktrace file\n"); 91 return 0; 92 } 93 94 if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) { 95 *need_swap = 0; 96 return 1; 97 } 98 99 /* 100 * Maybe it needs to be endian swapped... 101 */ 102 t.magic = fio_swap32(t.magic); 103 if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) { 104 *need_swap = 1; 105 return 1; 106 } 107 108 return 0; 109} 110 111static int lookup_device(struct thread_data *td, char *path, unsigned int maj, 112 unsigned int min) 113{ 114 struct dirent *dir; 115 struct stat st; 116 int found = 0; 117 DIR *D; 118 119 D = opendir(path); 120 if (!D) 121 return 0; 122 123 while ((dir = readdir(D)) != NULL) { 124 char full_path[256]; 125 126 if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) 127 continue; 128 129 sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name); 130 if (lstat(full_path, &st) == -1) { 131 perror("lstat"); 132 break; 133 } 134 135 if (S_ISDIR(st.st_mode)) { 136 found = lookup_device(td, full_path, maj, min); 137 if (found) { 138 strcpy(path, full_path); 139 break; 140 } 141 } 142 143 if (!S_ISBLK(st.st_mode)) 144 continue; 145 146 /* 147 * If replay_redirect is set then always return this device 148 * upon lookup which overrides the device lookup based on 149 * major minor in the actual blktrace 150 */ 151 if (td->o.replay_redirect) { 152 dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden" 153 " with: %s\n", maj, min, 154 td->o.replay_redirect); 155 strcpy(path, td->o.replay_redirect); 156 found = 1; 157 break; 158 } 159 160 if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) { 161 dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min); 162 
strcpy(path, full_path); 163 found = 1; 164 break; 165 } 166 } 167 168 closedir(D); 169 return found; 170} 171 172#define FMINORBITS 20 173#define FMINORMASK ((1U << FMINORBITS) - 1) 174#define FMAJOR(dev) ((unsigned int) ((dev) >> FMINORBITS)) 175#define FMINOR(dev) ((unsigned int) ((dev) & FMINORMASK)) 176 177static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action) 178{ 179 struct io_piece *ipo; 180 181 ipo = calloc(1, sizeof(*ipo)); 182 init_ipo(ipo); 183 184 ipo->ddir = DDIR_INVAL; 185 ipo->fileno = fileno; 186 ipo->file_action = action; 187 flist_add_tail(&ipo->list, &td->io_log_list); 188} 189 190static int trace_add_file(struct thread_data *td, __u32 device) 191{ 192 static unsigned int last_maj, last_min, last_fileno; 193 unsigned int maj = FMAJOR(device); 194 unsigned int min = FMINOR(device); 195 struct fio_file *f; 196 char dev[256]; 197 unsigned int i; 198 199 if (last_maj == maj && last_min == min) 200 return last_fileno; 201 202 last_maj = maj; 203 last_min = min; 204 205 /* 206 * check for this file in our list 207 */ 208 for_each_file(td, f, i) 209 if (f->major == maj && f->minor == min) { 210 last_fileno = f->fileno; 211 return last_fileno; 212 } 213 214 strcpy(dev, "/dev"); 215 if (lookup_device(td, dev, maj, min)) { 216 int fileno; 217 218 dprint(FD_BLKTRACE, "add devices %s\n", dev); 219 fileno = add_file_exclusive(td, dev); 220 td->o.open_files++; 221 td->files[fileno]->major = maj; 222 td->files[fileno]->minor = min; 223 trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE); 224 last_fileno = fileno; 225 } 226 227 return last_fileno; 228} 229 230/* 231 * Store blk_io_trace data in an ipo for later retrieval. 
232 */ 233static void store_ipo(struct thread_data *td, unsigned long long offset, 234 unsigned int bytes, int rw, unsigned long long ttime, 235 int fileno) 236{ 237 struct io_piece *ipo = malloc(sizeof(*ipo)); 238 239 init_ipo(ipo); 240 241 /* 242 * the 512 is wrong here, it should be the hardware sector size... 243 */ 244 ipo->offset = offset * 512; 245 ipo->len = bytes; 246 ipo->delay = ttime / 1000; 247 if (rw) 248 ipo->ddir = DDIR_WRITE; 249 else 250 ipo->ddir = DDIR_READ; 251 ipo->fileno = fileno; 252 253 dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n", 254 ipo->ddir, ipo->offset, 255 ipo->len, ipo->delay); 256 queue_io_piece(td, ipo); 257} 258 259static void handle_trace_notify(struct blk_io_trace *t) 260{ 261 switch (t->action) { 262 case BLK_TN_PROCESS: 263 log_info("blktrace: got process notify: %x, %d\n", 264 t->action, t->pid); 265 break; 266 case BLK_TN_TIMESTAMP: 267 log_info("blktrace: got timestamp notify: %x, %d\n", 268 t->action, t->pid); 269 break; 270 case BLK_TN_MESSAGE: 271 break; 272 default: 273 dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action); 274 break; 275 } 276} 277 278static void handle_trace_discard(struct thread_data *td, struct blk_io_trace *t, 279 unsigned long long ttime, unsigned long *ios) 280{ 281 struct io_piece *ipo = malloc(sizeof(*ipo)); 282 int fileno; 283 284 init_ipo(ipo); 285 fileno = trace_add_file(td, t->device); 286 287 ios[DDIR_WRITE]++; 288 td->o.size += t->bytes; 289 290 memset(ipo, 0, sizeof(*ipo)); 291 INIT_FLIST_HEAD(&ipo->list); 292 293 /* 294 * the 512 is wrong here, it should be the hardware sector size... 
295 */ 296 ipo->offset = t->sector * 512; 297 ipo->len = t->bytes; 298 ipo->delay = ttime / 1000; 299 ipo->ddir = DDIR_TRIM; 300 ipo->fileno = fileno; 301 302 dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n", 303 ipo->offset, ipo->len, 304 ipo->delay); 305 queue_io_piece(td, ipo); 306} 307 308static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, 309 unsigned long long ttime, unsigned long *ios, 310 unsigned int *bs) 311{ 312 int rw; 313 int fileno; 314 315 fileno = trace_add_file(td, t->device); 316 317 rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 318 319 if (t->bytes > bs[rw]) 320 bs[rw] = t->bytes; 321 322 ios[rw]++; 323 td->o.size += t->bytes; 324 store_ipo(td, t->sector, t->bytes, rw, ttime, fileno); 325} 326 327/* 328 * We only care for queue traces, most of the others are side effects 329 * due to internal workings of the block layer. 330 */ 331static void handle_trace(struct thread_data *td, struct blk_io_trace *t, 332 unsigned long long ttime, unsigned long *ios, 333 unsigned int *bs) 334{ 335 if ((t->action & 0xffff) != __BLK_TA_QUEUE) 336 return; 337 if (t->action & BLK_TC_ACT(BLK_TC_PC)) 338 return; 339 340 if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY)) 341 handle_trace_notify(t); 342 else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) 343 handle_trace_discard(td, t, ttime, ios); 344 else 345 handle_trace_fs(td, t, ttime, ios, bs); 346} 347 348static void byteswap_trace(struct blk_io_trace *t) 349{ 350 t->magic = fio_swap32(t->magic); 351 t->sequence = fio_swap32(t->sequence); 352 t->time = fio_swap64(t->time); 353 t->sector = fio_swap64(t->sector); 354 t->bytes = fio_swap32(t->bytes); 355 t->action = fio_swap32(t->action); 356 t->pid = fio_swap32(t->pid); 357 t->device = fio_swap32(t->device); 358 t->cpu = fio_swap32(t->cpu); 359 t->error = fio_swap16(t->error); 360 t->pdu_len = fio_swap16(t->pdu_len); 361} 362 363/* 364 * Load a blktrace file by reading all the blk_io_trace entries, and storing 365 * them as 
io_pieces like the fio text version would do.
 *
 * Returns 0 on success, 1 on error. On success td->o is updated with
 * the derived data direction, max block sizes, size and iodepth.
 */
int load_blktrace(struct thread_data *td, const char *filename, int need_swap)
{
	unsigned long long ttime, delay;
	struct blk_io_trace t;
	unsigned long ios[2], skipped_writes;
	unsigned int cpu;
	unsigned int rw_bs[2];
	struct fifo *fifo;
	int fd, i, old_state;
	struct fio_file *f;
	int this_depth, depth;

	fd = open(filename, O_RDONLY);
	if (fd < 0) {
		td_verror(td, errno, "open blktrace file");
		return 1;
	}

	fifo = fifo_alloc(TRACE_FIFO_SIZE);

	old_state = td_bump_runstate(td, TD_SETTING_UP);

	td->o.size = 0;

	cpu = 0;
	ttime = 0;
	ios[0] = ios[1] = 0;
	rw_bs[0] = rw_bs[1] = 0;
	skipped_writes = 0;
	this_depth = depth = 0;
	do {
		int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));

		if (ret < 0)
			goto err;
		else if (!ret)
			break;
		else if (ret < (int) sizeof(t)) {
			log_err("fio: short fifo get\n");
			break;
		}

		if (need_swap)
			byteswap_trace(&t);

		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			log_err("fio: bad magic in blktrace data: %x\n",
								t.magic);
			goto err;
		}
		if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
			log_err("fio: bad blktrace version %d\n",
								t.magic & 0xff);
			goto err;
		}
		ret = discard_pdu(td, fifo, fd, &t);
		if (ret < 0) {
			td_verror(td, ret, "blktrace lseek");
			goto err;
		} else if (t.pdu_len != ret) {
			log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
			goto err;
		}
		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
			/*
			 * Track queue depth as outstanding QUEUE events
			 * between COMPLETE events.
			 */
			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
				this_depth++;
			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) {
				depth = max(depth, this_depth);
				this_depth = 0;
			}
			if (!ttime) {
				ttime = t.time;
				cpu = t.cpu;
			}

			/*
			 * Only derive an inter-io delay from timestamps on
			 * the same cpu; cross-cpu clocks aren't comparable.
			 */
			delay = 0;
			if (cpu == t.cpu)
				delay = t.time - ttime;
			if ((t.action & BLK_TC_ACT(BLK_TC_WRITE)) && read_only)
				skipped_writes++;
			else {
				/*
				 * set delay to zero if no_stall enabled for
				 * fast replay
				 */
				if (td->o.no_stall)
					delay = 0;

				handle_trace(td, &t, delay, ios, rw_bs);
			}

			ttime = t.time;
			cpu = t.cpu;
		} else {
			delay = 0;
			handle_trace(td, &t, delay, ios, rw_bs);
		}
	} while (1);

	for (i = 0; i < td->files_index; i++) {
		f = td->files[i];
		trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
	}

	fifo_free(fifo);
	close(fd);

	td_restore_runstate(td, old_state);

	if (!td->files_index) {
		log_err("fio: did not find replay device(s)\n");
		return 1;
	}

	/*
	 * For stacked devices, we don't always get a COMPLETE event so
	 * the depth grows to insane values. Limit it to something sane(r).
	 */
	if (!depth || depth > 1024)
		depth = 1024;

	if (skipped_writes)
		log_err("fio: %s skips replay of %lu writes due to read-only\n",
			td->o.name, skipped_writes);

	if (!ios[DDIR_READ] && !ios[DDIR_WRITE]) {
		log_err("fio: found no ios in blktrace data\n");
		return 1;
	} else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) {
		/*
		 * BUG FIX: this used to test "ios[DDIR_READ] &&
		 * !ios[DDIR_READ]", which is always false, so read-only
		 * traces incorrectly fell through to the read/write case.
		 */
		td->o.td_ddir = TD_DDIR_READ;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
	} else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) {
		td->o.td_ddir = TD_DDIR_WRITE;
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
	} else {
		td->o.td_ddir = TD_DDIR_RW;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
	}

	/*
	 * We need to do direct/raw ios to the device, to avoid getting
	 * read-ahead in our way.
	 */
	td->o.odirect = 1;

	/*
	 * we don't know if this option was set or not. it defaults to 1,
	 * so we'll just guess that we should override it if it's still 1.
	 * (BUG FIX: the old test was "!= 1", which did the opposite of
	 * what this comment intends — it clobbered a user-set depth and
	 * never applied the trace depth over the default.)
	 */
	if (td->o.iodepth == 1)
		td->o.iodepth = depth;

	return 0;
err:
	close(fd);
	fifo_free(fifo);
	return 1;
}