io_u.c revision 1ef2b6be973eded12827990ae1a9eb28b7b20be7
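/*
 * io_u handling: selecting the offset, length and data direction for each
 * io_u, maintaining the random block map, picking the next file to service,
 * and accounting for completed io (latency, bandwidth and rate bookkeeping).
 */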
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <assert.h>

#include "fio.h"
#include "hash.h"
#include "verify.h"
#include "trim.h"
#include "lib/rand.h"

struct io_completion_data {
	int nr;				/* input */

	int error;			/* output */
	unsigned long bytes_done[2];	/* output */
	struct timeval time;		/* output */
};

/*
 * The ->file_map[] contains a map of blocks we have or have not done io
 * to yet. Used to make sure we cover the entire range in a fair fashion.
 */
static int random_map_free(struct fio_file *f, const unsigned long long block)
{
	unsigned int idx = RAND_MAP_IDX(f, block);
	unsigned int bit = RAND_MAP_BIT(f, block);

	dprint(FD_RANDOM, "free: b=%llu, idx=%u, bit=%u\n", block, idx, bit);

	return (f->file_map[idx] & (1 << bit)) == 0;
}

/*
 * Mark a given offset as used in the map.
 */
static void mark_random_map(struct thread_data *td, struct io_u *io_u)
{
	unsigned int min_bs = td->o.rw_min_bs;
	struct fio_file *f = io_u->file;
	unsigned long long block;
	unsigned int blocks, nr_blocks;
	int busy_check;

	block = (io_u->offset - f->file_offset) / (unsigned long long) min_bs;
	nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;
	blocks = 0;
	busy_check = !(io_u->flags & IO_U_F_BUSY_OK);

	while (nr_blocks) {
		unsigned int this_blocks, mask;
		unsigned int idx, bit;

		/*
		 * If we have a mixed random workload, we may
		 * encounter blocks we already did IO to.
		 */
		if (!busy_check) {
			blocks = nr_blocks;
			break;
		}
		if ((td->o.ddir_seq_nr == 1) && !random_map_free(f, block))
			break;

		idx = RAND_MAP_IDX(f, block);
		bit = RAND_MAP_BIT(f, block);

		fio_assert(td, idx < f->num_maps);

		this_blocks = nr_blocks;
		if (this_blocks + bit > BLOCKS_PER_MAP)
			this_blocks = BLOCKS_PER_MAP - bit;

		do {
			if (this_blocks == BLOCKS_PER_MAP)
				mask = -1U;
			else
				mask = ((1U << this_blocks) - 1) << bit;

			if (!(f->file_map[idx] & mask))
				break;

			this_blocks--;
		} while (this_blocks);

		if (!this_blocks)
			break;

		f->file_map[idx] |= mask;
		nr_blocks -= this_blocks;
		blocks += this_blocks;
		block += this_blocks;
	}

	if ((blocks * min_bs) < io_u->buflen)
		io_u->buflen = blocks * min_bs;
}

static unsigned long long last_block(struct thread_data *td, struct fio_file *f,
				     enum fio_ddir ddir)
{
	unsigned long long max_blocks;
	unsigned long long max_size;

	assert(ddir_rw(ddir));

	/*
	 * Hmm, should we make sure that ->io_size <= ->real_file_size?
	 */
	max_size = f->io_size;
	if (max_size > f->real_file_size)
		max_size = f->real_file_size;

	max_blocks = max_size / (unsigned long long) td->o.ba[ddir];
	if (!max_blocks)
		return 0;

	return max_blocks;
}

/*
 * Return the next free block in the map.
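 * The lookup below starts at f->last_free_lookup and walks ->file_map[]
 * one word at a time; ffz() then picks the first zero bit out of the first
 * word that is not completely full.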
 */
static int get_next_free_block(struct thread_data *td, struct fio_file *f,
			       enum fio_ddir ddir, unsigned long long *b)
{
	unsigned long long min_bs = td->o.rw_min_bs;
	int i;

	i = f->last_free_lookup;
	*b = (i * BLOCKS_PER_MAP);
	while ((*b) * min_bs < f->real_file_size &&
	       (*b) * min_bs < f->io_size) {
		if (f->file_map[i] != (unsigned int) -1) {
			*b += ffz(f->file_map[i]);
			if (*b > last_block(td, f, ddir))
				break;
			f->last_free_lookup = i;
			return 0;
		}

		*b += BLOCKS_PER_MAP;
		i++;
	}

	dprint(FD_IO, "failed finding a free block\n");
	return 1;
}

static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
				enum fio_ddir ddir, unsigned long long *b)
{
	unsigned long long r;
	int loops = 5;

	do {
		r = os_random_long(&td->random_state);
		dprint(FD_RANDOM, "off rand %llu\n", r);
		*b = (last_block(td, f, ddir) - 1)
			* (r / ((unsigned long long) OS_RAND_MAX + 1.0));

		/*
		 * if we are not maintaining a random map, we are done.
		 */
		if (!file_randommap(td, f))
			return 0;

		/*
		 * calculate map offset and check if it's free
		 */
		if (random_map_free(f, *b))
			return 0;

		dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n",
									*b);
	} while (--loops);

	/*
	 * we get here, if we didn't succeed in looking up a block. generate
	 * a random start offset into the filemap, and find the first free
	 * block from there.
	 */
	loops = 10;
	do {
		f->last_free_lookup = (f->num_maps - 1) *
					(r / (OS_RAND_MAX + 1.0));
		if (!get_next_free_block(td, f, ddir, b))
			return 0;

		r = os_random_long(&td->random_state);
	} while (--loops);

	/*
	 * that didn't work either, try exhaustive search from the start
	 */
	f->last_free_lookup = 0;
	return get_next_free_block(td, f, ddir, b);
}

static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
			       enum fio_ddir ddir, unsigned long long *b)
{
	if (get_next_rand_offset(td, f, ddir, b)) {
		dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n",
				f->file_name, f->last_pos, f->real_file_size);
		return 1;
	}

	return 0;
}

static int get_next_seq_block(struct thread_data *td, struct fio_file *f,
			      enum fio_ddir ddir, unsigned long long *b)
{
	assert(ddir_rw(ddir));

	if (f->last_pos < f->real_file_size) {
		*b = (f->last_pos - f->file_offset) / td->o.min_bs[ddir];
		return 0;
	}

	return 1;
}

static int get_next_block(struct thread_data *td, struct io_u *io_u,
			  enum fio_ddir ddir, int rw_seq, unsigned long long *b)
{
	struct fio_file *f = io_u->file;
	int ret;

	assert(ddir_rw(ddir));

	if (rw_seq) {
		if (td_random(td))
			ret = get_next_rand_block(td, f, ddir, b);
		else
			ret = get_next_seq_block(td, f, ddir, b);
	} else {
		io_u->flags |= IO_U_F_BUSY_OK;

		if (td->o.rw_seq == RW_SEQ_SEQ) {
			ret = get_next_seq_block(td, f, ddir, b);
			if (ret)
				ret = get_next_rand_block(td, f, ddir, b);
		} else if (td->o.rw_seq == RW_SEQ_IDENT) {
			if (f->last_start != -1ULL)
				*b = (f->last_start - f->file_offset)
					/ td->o.min_bs[ddir];
			else
				*b = 0;
			ret = 0;
		} else {
			log_err("fio: unknown rw_seq=%d\n", td->o.rw_seq);
			ret = 1;
		}
	}

	return ret;
}

/*
 * For random io, generate a random new block and see if it's used. Repeat
 * until we find a free one. For sequential io, just return the end of
 * the last io issued.
 */
static int __get_next_offset(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	unsigned long long b;
	enum fio_ddir ddir = io_u->ddir;
	int rw_seq_hit = 0;

	assert(ddir_rw(ddir));

	if (td->o.ddir_seq_nr && !--td->ddir_seq_nr) {
		rw_seq_hit = 1;
		td->ddir_seq_nr = td->o.ddir_seq_nr;
	}

	if (get_next_block(td, io_u, ddir, rw_seq_hit, &b))
		return 1;

	io_u->offset = b * td->o.ba[ddir];
	if (io_u->offset >= f->io_size) {
		dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n",
					io_u->offset, f->io_size);
		return 1;
	}

	io_u->offset += f->file_offset;
	if (io_u->offset >= f->real_file_size) {
		dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n",
					io_u->offset, f->real_file_size);
		return 1;
	}

	return 0;
}

static int get_next_offset(struct thread_data *td, struct io_u *io_u)
{
	struct prof_io_ops *ops = &td->prof_io_ops;

	if (ops->fill_io_u_off)
		return ops->fill_io_u_off(td, io_u);

	return __get_next_offset(td, io_u);
}

static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u)
{
	const int ddir = io_u->ddir;
	unsigned int uninitialized_var(buflen);
	unsigned int minbs, maxbs;
	long r;

	assert(ddir_rw(ddir));

	minbs = td->o.min_bs[ddir];
	maxbs = td->o.max_bs[ddir];

	if (minbs == maxbs)
		buflen = minbs;
	else {
		r = os_random_long(&td->bsrange_state);
		if (!td->o.bssplit_nr[ddir]) {
			buflen = 1 + (unsigned int) ((double) maxbs *
					(r / (OS_RAND_MAX + 1.0)));
			if (buflen < minbs)
				buflen = minbs;
		} else {
			long perc = 0;
			unsigned int i;

			for (i = 0; i < td->o.bssplit_nr[ddir]; i++) {
				struct bssplit *bsp = &td->o.bssplit[ddir][i];

				buflen = bsp->bs;
				perc += bsp->perc;
				if (r <= ((OS_RAND_MAX / 100L) * perc))
					break;
			}
		}
		if (!td->o.bs_unaligned && is_power_of_2(minbs))
			buflen = (buflen + minbs - 1) & ~(minbs - 1);
	}

	if (io_u->offset + buflen > io_u->file->real_file_size) {
		dprint(FD_IO, "lower buflen %u -> %u (ddir=%d)\n", buflen,
						minbs, ddir);
		buflen = minbs;
	}

	return buflen;
}

static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u)
{
	struct prof_io_ops *ops = &td->prof_io_ops;

	if (ops->fill_io_u_size)
		return ops->fill_io_u_size(td, io_u);

	return __get_next_buflen(td, io_u);
}

static void set_rwmix_bytes(struct thread_data *td)
{
	unsigned int diff;

	/*
	 * we do time or byte based switch. this is needed because
	 * buffered writes may issue a lot quicker than they complete,
	 * whereas reads do not.
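	 *
	 * The switch point computed below is the number of ios already
	 * issued in the current direction, scaled by the mix percentage of
	 * the opposite direction.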
	 */
	diff = td->o.rwmix[td->rwmix_ddir ^ 1];
	td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100;
}

static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
{
	unsigned int v;
	long r;

	r = os_random_long(&td->rwmix_state);
	v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0)));
	if (v <= td->o.rwmix[DDIR_READ])
		return DDIR_READ;

	return DDIR_WRITE;
}

static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
{
	enum fio_ddir odir = ddir ^ 1;
	struct timeval t;
	long usec;

	assert(ddir_rw(ddir));

	if (td->rate_pending_usleep[ddir] <= 0)
		return ddir;

	/*
	 * We have too much pending sleep in this direction. See if we
	 * should switch.
	 */
	if (td_rw(td)) {
		/*
		 * Other direction does not have too much pending, switch
		 */
		if (td->rate_pending_usleep[odir] < 100000)
			return odir;

		/*
		 * Both directions have pending sleep. Sleep the minimum time
		 * and deduct from both.
		 */
		if (td->rate_pending_usleep[ddir] <=
			td->rate_pending_usleep[odir]) {
			usec = td->rate_pending_usleep[ddir];
		} else {
			usec = td->rate_pending_usleep[odir];
			ddir = odir;
		}
	} else
		usec = td->rate_pending_usleep[ddir];

	fio_gettime(&t, NULL);
	usec_sleep(td, usec);
	usec = utime_since_now(&t);

	td->rate_pending_usleep[ddir] -= usec;

	odir = ddir ^ 1;
	if (td_rw(td) && __should_check_rate(td, odir))
		td->rate_pending_usleep[odir] -= usec;

	return ddir;
}

/*
 * Return the data direction for the next io_u. If the job is a
 * mixed read/write workload, check the rwmix cycle and switch if
 * necessary.
 */
static enum fio_ddir get_rw_ddir(struct thread_data *td)
{
	enum fio_ddir ddir;

	/*
	 * see if it's time to fsync
	 */
	if (td->o.fsync_blocks &&
	   !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
	     td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_SYNC;

	/*
	 * see if it's time to fdatasync
	 */
	if (td->o.fdatasync_blocks &&
	   !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
	     td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_DATASYNC;

	/*
	 * see if it's time to sync_file_range
	 */
	if (td->sync_file_range_nr &&
	   !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
	     td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_SYNC_FILE_RANGE;

	if (td_rw(td)) {
		/*
		 * Check if it's time to seed a new data direction.
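		 * The rwmix_issues threshold is refreshed by
		 * set_rwmix_bytes() whenever the direction actually flips.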
		 */
		if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) {
			/*
			 * Put a top limit on how many bytes we do for
			 * one data direction, to avoid overflowing the
			 * ranges too much
			 */
			ddir = get_rand_ddir(td);

			if (ddir != td->rwmix_ddir)
				set_rwmix_bytes(td);

			td->rwmix_ddir = ddir;
		}
		ddir = td->rwmix_ddir;
	} else if (td_read(td))
		ddir = DDIR_READ;
	else
		ddir = DDIR_WRITE;

	td->rwmix_ddir = rate_ddir(td, ddir);
	return td->rwmix_ddir;
}

static void set_rw_ddir(struct thread_data *td, struct io_u *io_u)
{
	io_u->ddir = get_rw_ddir(td);

	if (io_u->ddir == DDIR_WRITE && (td->io_ops->flags & FIO_BARRIER) &&
	    td->o.barrier_blocks &&
	   !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) &&
	     td->io_issues[DDIR_WRITE])
		io_u->flags |= IO_U_F_BARRIER;
}

void put_file_log(struct thread_data *td, struct fio_file *f)
{
	int ret = put_file(td, f);

	if (ret)
		td_verror(td, ret, "file close");
}

void put_io_u(struct thread_data *td, struct io_u *io_u)
{
	td_io_u_lock(td);

	io_u->flags |= IO_U_F_FREE;
	io_u->flags &= ~IO_U_F_FREE_DEF;

	if (io_u->file)
		put_file_log(td, io_u->file);

	io_u->file = NULL;
	if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
		td->cur_depth--;
	flist_del_init(&io_u->list);
	flist_add(&io_u->list, &td->io_u_freelist);
	td_io_u_unlock(td);
	td_io_u_free_notify(td);
}

void clear_io_u(struct thread_data *td, struct io_u *io_u)
{
	io_u->flags &= ~IO_U_F_FLIGHT;
	put_io_u(td, io_u);
}

void requeue_io_u(struct thread_data *td, struct io_u **io_u)
{
	struct io_u *__io_u = *io_u;

	dprint(FD_IO, "requeue %p\n", __io_u);

	td_io_u_lock(td);

	__io_u->flags |= IO_U_F_FREE;
	if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(__io_u->ddir))
		td->io_issues[__io_u->ddir]--;

	__io_u->flags &= ~IO_U_F_FLIGHT;
	if (__io_u->flags & IO_U_F_IN_CUR_DEPTH)
		td->cur_depth--;
	flist_del(&__io_u->list);
	flist_add_tail(&__io_u->list, &td->io_u_requeues);
	td_io_u_unlock(td);
	*io_u = NULL;
}

static int fill_io_u(struct thread_data *td, struct io_u *io_u)
{
	if (td->io_ops->flags & FIO_NOIO)
		goto out;

	set_rw_ddir(td, io_u);

	/*
	 * fsync() or fdatasync() or trim etc, we are done
	 */
	if (!ddir_rw(io_u->ddir))
		goto out;

	/*
	 * See if it's time to switch to a new zone
	 */
	if (td->zone_bytes >= td->o.zone_size) {
		td->zone_bytes = 0;
		io_u->file->last_pos += td->o.zone_skip;
		td->io_skip_bytes += td->o.zone_skip;
	}

	/*
	 * No log, let the seq/rand engine retrieve the next buflen and
	 * position.
	 */
	if (get_next_offset(td, io_u)) {
		dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
		return 1;
	}

	io_u->buflen = get_next_buflen(td, io_u);
	if (!io_u->buflen) {
		dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
		return 1;
	}

	if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
		dprint(FD_IO, "io_u %p, offset too large\n", io_u);
		dprint(FD_IO, "  off=%llu/%lu > %llu\n", io_u->offset,
				io_u->buflen, io_u->file->real_file_size);
		return 1;
	}

	/*
	 * mark entry before potentially trimming io_u
	 */
	if (td_random(td) && file_randommap(td, io_u->file))
		mark_random_map(td, io_u);

	/*
	 * If using a write iolog, store this entry.
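	 * That is handled by the log_io_u() call after the out label below.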
	 */
out:
	dprint_io_u(io_u, "fill_io_u");
	td->zone_bytes += io_u->buflen;
	log_io_u(td, io_u);
	return 0;
}

static void __io_u_mark_map(unsigned int *map, unsigned int nr)
{
	int index = 0;

	switch (nr) {
	default:
		index = 6;
		break;
	case 33 ... 64:
		index = 5;
		break;
	case 17 ... 32:
		index = 4;
		break;
	case 9 ... 16:
		index = 3;
		break;
	case 5 ... 8:
		index = 2;
		break;
	case 1 ... 4:
		index = 1;
	case 0:
		break;
	}

	map[index]++;
}

void io_u_mark_submit(struct thread_data *td, unsigned int nr)
{
	__io_u_mark_map(td->ts.io_u_submit, nr);
	td->ts.total_submit++;
}

void io_u_mark_complete(struct thread_data *td, unsigned int nr)
{
	__io_u_mark_map(td->ts.io_u_complete, nr);
	td->ts.total_complete++;
}

void io_u_mark_depth(struct thread_data *td, unsigned int nr)
{
	int index = 0;

	switch (td->cur_depth) {
	default:
		index = 6;
		break;
	case 32 ... 63:
		index = 5;
		break;
	case 16 ... 31:
		index = 4;
		break;
	case 8 ... 15:
		index = 3;
		break;
	case 4 ... 7:
		index = 2;
		break;
	case 2 ... 3:
		index = 1;
	case 1:
		break;
	}

	td->ts.io_u_map[index] += nr;
}

static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
{
	int index = 0;

	assert(usec < 1000);

	switch (usec) {
	case 750 ... 999:
		index = 9;
		break;
	case 500 ... 749:
		index = 8;
		break;
	case 250 ... 499:
		index = 7;
		break;
	case 100 ... 249:
		index = 6;
		break;
	case 50 ... 99:
		index = 5;
		break;
	case 20 ... 49:
		index = 4;
		break;
	case 10 ... 19:
		index = 3;
		break;
	case 4 ... 9:
		index = 2;
		break;
	case 2 ... 3:
		index = 1;
	case 0 ... 1:
		break;
	}

	assert(index < FIO_IO_U_LAT_U_NR);
	td->ts.io_u_lat_u[index]++;
}

static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
{
	int index = 0;

	switch (msec) {
	default:
		index = 11;
		break;
	case 1000 ... 1999:
		index = 10;
		break;
	case 750 ... 999:
		index = 9;
		break;
	case 500 ... 749:
		index = 8;
		break;
	case 250 ... 499:
		index = 7;
		break;
	case 100 ... 249:
		index = 6;
		break;
	case 50 ... 99:
		index = 5;
		break;
	case 20 ... 49:
		index = 4;
		break;
	case 10 ... 19:
		index = 3;
		break;
	case 4 ... 9:
		index = 2;
		break;
	case 2 ... 3:
		index = 1;
	case 0 ... 1:
		break;
	}

	assert(index < FIO_IO_U_LAT_M_NR);
	td->ts.io_u_lat_m[index]++;
}

static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
{
	if (usec < 1000)
		io_u_mark_lat_usec(td, usec);
	else
		io_u_mark_lat_msec(td, usec / 1000);
}

/*
 * Get next file to service by choosing one at random
 */
static struct fio_file *get_next_file_rand(struct thread_data *td,
					   enum fio_file_flags goodf,
					   enum fio_file_flags badf)
{
	struct fio_file *f;
	int fno;

	do {
		long r = os_random_long(&td->next_file_state);
		int opened = 0;

		fno = (unsigned int) ((double) td->o.nr_files
			* (r / (OS_RAND_MAX + 1.0)));
		f = td->files[fno];
		if (fio_file_done(f))
			continue;

		if (!fio_file_open(f)) {
			int err;

			err = td_io_open_file(td, f);
			if (err)
				continue;
			opened = 1;
		}

		if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) {
			dprint(FD_FILE, "get_next_file_rand: %p\n", f);
			return f;
		}
		if (opened)
			td_io_close_file(td, f);
	} while (1);
}

/*
 * Get next file to service by doing round robin between all available ones
 */
static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf,
					 int badf)
{
	unsigned int old_next_file = td->next_file;
	struct fio_file *f;

	do {
		int opened = 0;

		f = td->files[td->next_file];

		td->next_file++;
		if (td->next_file >= td->o.nr_files)
			td->next_file = 0;

		dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags);
		if (fio_file_done(f)) {
			f = NULL;
			continue;
		}

		if (!fio_file_open(f)) {
			int err;

			err = td_io_open_file(td, f);
			if (err) {
				dprint(FD_FILE, "error %d on open of %s\n",
					err, f->file_name);
				f = NULL;
				continue;
			}
			opened = 1;
		}

		dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf,
								f->flags);
		if ((!goodf || (f->flags & goodf)) && !(f->flags & badf))
			break;

		if (opened)
			td_io_close_file(td, f);

		f = NULL;
	} while (td->next_file != old_next_file);

	dprint(FD_FILE, "get_next_file_rr: %p\n", f);
	return f;
}

static struct fio_file *__get_next_file(struct thread_data *td)
{
	struct fio_file *f;

	assert(td->o.nr_files <= td->files_index);

	if (td->nr_done_files >= td->o.nr_files) {
		dprint(FD_FILE, "get_next_file: nr_open=%d, nr_done=%d,"
				" nr_files=%d\n", td->nr_open_files,
						  td->nr_done_files,
						  td->o.nr_files);
		return NULL;
	}

	f = td->file_service_file;
	if (f && fio_file_open(f) && !fio_file_closing(f)) {
		if (td->o.file_service_type == FIO_FSERVICE_SEQ)
			goto out;
		if (td->file_service_left--)
			goto out;
	}

	if (td->o.file_service_type == FIO_FSERVICE_RR ||
	    td->o.file_service_type == FIO_FSERVICE_SEQ)
		f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing);
	else
		f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing);

	td->file_service_file = f;
	td->file_service_left = td->file_service_nr - 1;
out:
	dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name);
	return f;
}

static struct fio_file *get_next_file(struct thread_data *td)
{
	struct prof_io_ops *ops = &td->prof_io_ops;

	if (ops->get_next_file)
		return ops->get_next_file(td);

	return __get_next_file(td);
}

static int set_io_u_file(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f;

	do {
		f = get_next_file(td);
		if (!f)
			return 1;

		io_u->file = f;
		get_file(f);

		if (!fill_io_u(td, io_u))
			break;

		put_file_log(td, f);
		td_io_close_file(td, f);
		io_u->file = NULL;
		fio_file_set_done(f);
		td->nr_done_files++;
		dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
					td->nr_done_files, td->o.nr_files);
	} while (1);

	return 0;
}

struct io_u *__get_io_u(struct thread_data *td)
{
	struct io_u *io_u = NULL;

	td_io_u_lock(td);

again:
	if (!flist_empty(&td->io_u_requeues))
		io_u = flist_entry(td->io_u_requeues.next, struct io_u, list);
	else if (!queue_full(td)) {
		io_u = flist_entry(td->io_u_freelist.next, struct io_u, list);

		io_u->buflen = 0;
		io_u->resid = 0;
		io_u->file = NULL;
		io_u->end_io = NULL;
	}

	if (io_u) {
		assert(io_u->flags & IO_U_F_FREE);
		io_u->flags &= ~(IO_U_F_FREE | IO_U_F_FREE_DEF);
		io_u->flags &= ~(IO_U_F_TRIMMED | IO_U_F_BARRIER);

		io_u->error = 0;
		flist_del(&io_u->list);
		flist_add(&io_u->list, &td->io_u_busylist);
		td->cur_depth++;
		io_u->flags |= IO_U_F_IN_CUR_DEPTH;
	} else if (td->o.verify_async) {
		/*
		 * We ran out, wait for async verify threads to finish and
		 * return one
		 */
		pthread_cond_wait(&td->free_cond, &td->io_u_lock);
		goto again;
	}

	td_io_u_unlock(td);
	return io_u;
}

static int check_get_trim(struct thread_data *td, struct io_u *io_u)
{
	if (td->o.trim_backlog && td->trim_entries) {
		int get_trim = 0;

		if (td->trim_batch) {
			td->trim_batch--;
			get_trim = 1;
		} else if (!(td->io_hist_len % td->o.trim_backlog) &&
			   td->last_ddir != DDIR_READ) {
			td->trim_batch = td->o.trim_batch;
			if (!td->trim_batch)
				td->trim_batch = td->o.trim_backlog;
			get_trim = 1;
		}

		if (get_trim && !get_next_trim(td, io_u))
			return 1;
	}

	return 0;
}

static int check_get_verify(struct thread_data *td, struct io_u *io_u)
{
	if (td->o.verify_backlog && td->io_hist_len) {
		int get_verify = 0;

		if (td->verify_batch) {
			td->verify_batch--;
			get_verify = 1;
		} else if (!(td->io_hist_len % td->o.verify_backlog) &&
			   td->last_ddir != DDIR_READ) {
			td->verify_batch = td->o.verify_batch;
			if (!td->verify_batch)
				td->verify_batch = td->o.verify_backlog;
			get_verify = 1;
		}

		if (get_verify && !get_next_verify(td, io_u))
			return 1;
	}

	return 0;
}

/*
 * Return an io_u to be processed. Gets a buflen and offset, sets direction,
 * etc. The returned io_u is fully ready to be prepped and submitted.
 */
struct io_u *get_io_u(struct thread_data *td)
{
	struct fio_file *f;
	struct io_u *io_u;

	io_u = __get_io_u(td);
	if (!io_u) {
		dprint(FD_IO, "__get_io_u failed\n");
		return NULL;
	}

	if (check_get_verify(td, io_u))
		goto out;
	if (check_get_trim(td, io_u))
		goto out;

	/*
	 * from a requeue, io_u already setup
	 */
	if (io_u->file)
		goto out;

	/*
	 * If using an iolog, grab next piece if any available.
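	 * Otherwise fall back to set_io_u_file() to pick the next file and
	 * fill in the offset and length.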
	 */
	if (td->o.read_iolog_file) {
		if (read_iolog_get(td, io_u))
			goto err_put;
	} else if (set_io_u_file(td, io_u)) {
		dprint(FD_IO, "io_u %p, setting file failed\n", io_u);
		goto err_put;
	}

	f = io_u->file;
	assert(fio_file_open(f));

	if (ddir_rw(io_u->ddir)) {
		if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) {
			dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
			goto err_put;
		}

		f->last_start = io_u->offset;
		f->last_pos = io_u->offset + io_u->buflen;

		if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_WRITE)
			populate_verify_io_u(td, io_u);
		else if (td->o.refill_buffers && io_u->ddir == DDIR_WRITE)
			io_u_fill_buffer(td, io_u, io_u->xfer_buflen);
		else if (io_u->ddir == DDIR_READ) {
			/*
			 * Reset the buf_filled parameters so next time if the
			 * buffer is used for writes it is refilled.
			 */
			io_u->buf_filled_len = 0;
		}
	}

	/*
	 * Set io data pointers.
	 */
	io_u->xfer_buf = io_u->buf;
	io_u->xfer_buflen = io_u->buflen;

out:
	assert(io_u->file);
	if (!td_io_prep(td, io_u)) {
		if (!td->o.disable_slat)
			fio_gettime(&io_u->start_time, NULL);
		return io_u;
	}
err_put:
	dprint(FD_IO, "get_io_u failed\n");
	put_io_u(td, io_u);
	return NULL;
}

void io_u_log_error(struct thread_data *td, struct io_u *io_u)
{
	const char *msg[] = { "read", "write", "sync", "datasync",
				"sync_file_range", "wait", "trim" };

	log_err("fio: io_u error");

	if (io_u->file)
		log_err(" on file %s", io_u->file->file_name);

	log_err(": %s\n", strerror(io_u->error));

	log_err("     %s offset=%llu, buflen=%lu\n", msg[io_u->ddir],
					io_u->offset, io_u->xfer_buflen);

	if (!td->error)
		td_verror(td, io_u->error, "io_u error");
}

static void io_completed(struct thread_data *td, struct io_u *io_u,
			 struct io_completion_data *icd)
{
	/*
	 * Older gcc's are too dumb to realize that usec is always used
	 * initialized, silence that warning.
	 */
	unsigned long uninitialized_var(usec);
	struct fio_file *f;

	dprint_io_u(io_u, "io complete");

	td_io_u_lock(td);
	assert(io_u->flags & IO_U_F_FLIGHT);
	io_u->flags &= ~(IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
	td_io_u_unlock(td);

	if (ddir_sync(io_u->ddir)) {
		td->last_was_sync = 1;
		f = io_u->file;
		if (f) {
			f->first_write = -1ULL;
			f->last_write = -1ULL;
		}
		return;
	}

	td->last_was_sync = 0;
	td->last_ddir = io_u->ddir;

	if (!io_u->error && ddir_rw(io_u->ddir)) {
		unsigned int bytes = io_u->buflen - io_u->resid;
		const enum fio_ddir idx = io_u->ddir;
		const enum fio_ddir odx = io_u->ddir ^ 1;
		int ret;

		td->io_blocks[idx]++;
		td->io_bytes[idx] += bytes;
		td->this_io_bytes[idx] += bytes;

		if (idx == DDIR_WRITE) {
			f = io_u->file;
			if (f) {
				if (f->first_write == -1ULL ||
				    io_u->offset < f->first_write)
					f->first_write = io_u->offset;
				if (f->last_write == -1ULL ||
				    ((io_u->offset + bytes) > f->last_write))
					f->last_write = io_u->offset + bytes;
			}
		}

		if (ramp_time_over(td)) {
			unsigned long uninitialized_var(lusec);

			if (!td->o.disable_clat || !td->o.disable_bw)
				lusec = utime_since(&io_u->issue_time,
							&icd->time);
			if (!td->o.disable_lat) {
				unsigned long tusec;

				tusec = utime_since(&io_u->start_time,
							&icd->time);
				add_lat_sample(td, idx, tusec, bytes);
			}
			if (!td->o.disable_clat) {
				add_clat_sample(td, idx, lusec, bytes);
				io_u_mark_latency(td, lusec);
			}
			if (!td->o.disable_bw)
				add_bw_sample(td, idx, bytes, &icd->time);
			if (__should_check_rate(td, idx)) {
				td->rate_pending_usleep[idx] =
					((td->this_io_bytes[idx] *
					  td->rate_nsec_cycle[idx]) / 1000 -
					 utime_since_now(&td->start));
			}
			if (__should_check_rate(td, idx ^ 1))
				td->rate_pending_usleep[odx] =
					((td->this_io_bytes[odx] *
					  td->rate_nsec_cycle[odx]) / 1000 -
					 utime_since_now(&td->start));
		}

		if (td_write(td) && idx == DDIR_WRITE &&
		    td->o.do_verify &&
		    td->o.verify != VERIFY_NONE)
			log_io_piece(td, io_u);

		icd->bytes_done[idx] += bytes;

		if (io_u->end_io) {
			ret = io_u->end_io(td, io_u);
			if (ret && !icd->error)
				icd->error = ret;
		}
	} else if (io_u->error) {
		icd->error = io_u->error;
		io_u_log_error(td, io_u);
	}
	if (td->o.continue_on_error && icd->error &&
	    td_non_fatal_error(icd->error)) {
		/*
		 * If there is a non_fatal error, then add to the error count
		 * and clear all the errors.
		 */
		update_error_count(td, icd->error);
		td_clear_error(td);
		icd->error = 0;
		io_u->error = 0;
	}
}

static void init_icd(struct thread_data *td, struct io_completion_data *icd,
		     int nr)
{
	if (!td->o.disable_clat || !td->o.disable_bw)
		fio_gettime(&icd->time, NULL);

	icd->nr = nr;

	icd->error = 0;
	icd->bytes_done[0] = icd->bytes_done[1] = 0;
}

static void ios_completed(struct thread_data *td,
			  struct io_completion_data *icd)
{
	struct io_u *io_u;
	int i;

	for (i = 0; i < icd->nr; i++) {
		io_u = td->io_ops->event(td, i);

		io_completed(td, io_u, icd);

		if (!(io_u->flags & IO_U_F_FREE_DEF))
			put_io_u(td, io_u);
	}
}

/*
 * Complete a single io_u for the sync engines.
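 * Returns -1 if the completion carried an error, otherwise 0. When a bytes
 * array is passed in, the per-direction byte counts are accumulated into it.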
 */
int io_u_sync_complete(struct thread_data *td, struct io_u *io_u,
		       unsigned long *bytes)
{
	struct io_completion_data icd;

	init_icd(td, &icd, 1);
	io_completed(td, io_u, &icd);

	if (!(io_u->flags & IO_U_F_FREE_DEF))
		put_io_u(td, io_u);

	if (icd.error) {
		td_verror(td, icd.error, "io_u_sync_complete");
		return -1;
	}

	if (bytes) {
		bytes[0] += icd.bytes_done[0];
		bytes[1] += icd.bytes_done[1];
	}

	return 0;
}

/*
 * Called to complete min_events number of io for the async engines.
 */
int io_u_queued_complete(struct thread_data *td, int min_evts,
			 unsigned long *bytes)
{
	struct io_completion_data icd;
	struct timespec *tvp = NULL;
	int ret;
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };

	dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts);

	if (!min_evts)
		tvp = &ts;

	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
	if (ret < 0) {
		td_verror(td, -ret, "td_io_getevents");
		return ret;
	} else if (!ret)
		return ret;

	init_icd(td, &icd, ret);
	ios_completed(td, &icd);
	if (icd.error) {
		td_verror(td, icd.error, "io_u_queued_complete");
		return -1;
	}

	if (bytes) {
		bytes[0] += icd.bytes_done[0];
		bytes[1] += icd.bytes_done[1];
	}

	return 0;
}

/*
 * Call when io_u is really queued, to update the submission latency.
 */
void io_u_queued(struct thread_data *td, struct io_u *io_u)
{
	if (!td->o.disable_slat) {
		unsigned long slat_time;

		slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
		add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen);
	}
}

/*
 * "randomly" fill the buffer contents
 */
void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u,
		      unsigned int max_bs)
{
	if (!td->o.zero_buffers)
		fill_random_buf(io_u->buf, max_bs);
	else
		memset(io_u->buf, 0, max_bs);
}
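
/*
 * Rough sketch of the io_u lifecycle as seen from this file (queueing itself
 * happens in the ioengine layer, outside this file): get_io_u() hands out a
 * fully prepared unit with file, offset, buflen and ddir set; io_u_queued()
 * records submission latency once the unit is really queued; completions are
 * accounted through io_u_sync_complete() or io_u_queued_complete(); and
 * requeue_io_u()/put_io_u() return units to the requeue and free lists.
 */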