io_u.c revision 2615cc4b28e7d0e436a625dff92e6a71ccc6c49b
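/*
 * io_u handling: offset/length selection, data direction and file
 * choice for each io unit, plus allocation, requeue and completion
 * accounting.
 */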
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <assert.h>

#include "fio.h"
#include "hash.h"
#include "verify.h"
#include "trim.h"
#include "lib/rand.h"

struct io_completion_data {
	int nr;				/* input */

	int error;			/* output */
	unsigned long bytes_done[2];	/* output */
	struct timeval time;		/* output */
};

/*
 * The ->file_map[] contains a map of blocks we have or have not done io
 * to yet. Used to make sure we cover the entire range in a fair fashion.
 */
static int random_map_free(struct fio_file *f, const unsigned long long block)
{
	unsigned int idx = RAND_MAP_IDX(f, block);
	unsigned int bit = RAND_MAP_BIT(f, block);

	dprint(FD_RANDOM, "free: b=%llu, idx=%u, bit=%u\n", block, idx, bit);

	return (f->file_map[idx] & (1UL << bit)) == 0;
}

/*
 * Mark a given offset as used in the map.
 */
static void mark_random_map(struct thread_data *td, struct io_u *io_u)
{
	unsigned int min_bs = td->o.rw_min_bs;
	struct fio_file *f = io_u->file;
	unsigned long long block;
	unsigned int blocks, nr_blocks;
	int busy_check;

	block = (io_u->offset - f->file_offset) / (unsigned long long) min_bs;
	nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;
	blocks = 0;
	busy_check = !(io_u->flags & IO_U_F_BUSY_OK);

	while (nr_blocks) {
		unsigned int idx, bit;
		unsigned long mask, this_blocks;

		/*
		 * If we have a mixed random workload, we may
		 * encounter blocks we already did IO to.
		 */
		if (!busy_check) {
			blocks = nr_blocks;
			break;
		}
		if ((td->o.ddir_seq_nr == 1) && !random_map_free(f, block))
			break;

		idx = RAND_MAP_IDX(f, block);
		bit = RAND_MAP_BIT(f, block);

		fio_assert(td, idx < f->num_maps);

		this_blocks = nr_blocks;
		if (this_blocks + bit > BLOCKS_PER_MAP)
			this_blocks = BLOCKS_PER_MAP - bit;

		do {
			if (this_blocks == BLOCKS_PER_MAP)
				mask = -1UL;
			else
				mask = ((1UL << this_blocks) - 1) << bit;

			if (!(f->file_map[idx] & mask))
				break;

			this_blocks--;
		} while (this_blocks);

		if (!this_blocks)
			break;

		f->file_map[idx] |= mask;
		nr_blocks -= this_blocks;
		blocks += this_blocks;
		block += this_blocks;
	}

	if ((blocks * min_bs) < io_u->buflen)
		io_u->buflen = blocks * min_bs;
}

static unsigned long long last_block(struct thread_data *td, struct fio_file *f,
				     enum fio_ddir ddir)
{
	unsigned long long max_blocks;
	unsigned long long max_size;

	assert(ddir_rw(ddir));

	/*
	 * Hmm, should we make sure that ->io_size <= ->real_file_size?
	 */
	max_size = f->io_size;
	if (max_size > f->real_file_size)
		max_size = f->real_file_size;

	max_blocks = max_size / (unsigned long long) td->o.ba[ddir];
	if (!max_blocks)
		return 0;

	return max_blocks;
}

/*
 * Return the next free block in the map.
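 * The lookup starts from f->last_free_lookup and walks the map one word
 * at a time; ffz() picks the first zero (unused) bit in the first word
 * that is not completely full.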
 */
static int get_next_free_block(struct thread_data *td, struct fio_file *f,
			       enum fio_ddir ddir, unsigned long long *b)
{
	unsigned long long block, min_bs = td->o.rw_min_bs, lastb;
	int i;

	lastb = last_block(td, f, ddir);
	if (!lastb)
		return 1;

	i = f->last_free_lookup;
	block = i * BLOCKS_PER_MAP;
	while (block * min_bs < f->real_file_size &&
	       block * min_bs < f->io_size) {
		if (f->file_map[i] != -1UL) {
			block += ffz(f->file_map[i]);
			if (block > lastb)
				break;
			f->last_free_lookup = i;
			*b = block;
			return 0;
		}

		block += BLOCKS_PER_MAP;
		i++;
	}

	dprint(FD_IO, "failed finding a free block\n");
	return 1;
}

static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
				enum fio_ddir ddir, unsigned long long *b)
{
	unsigned long long r, lastb;
	int loops = 5;

	lastb = last_block(td, f, ddir);
	if (!lastb)
		return 1;

	if (f->failed_rands >= 200)
		goto ffz;

	do {
		if (td->o.use_os_rand) {
			r = os_random_long(&td->random_state);
			*b = (lastb - 1) * (r / ((unsigned long long) OS_RAND_MAX + 1.0));
		} else {
			r = __rand(&td->__random_state);
			*b = (lastb - 1) * (r / ((unsigned long long) FRAND_MAX + 1.0));
		}

		dprint(FD_RANDOM, "off rand %llu\n", r);

		/*
		 * if we are not maintaining a random map, we are done.
		 */
		if (!file_randommap(td, f))
			goto ret_good;

		/*
		 * calculate map offset and check if it's free
		 */
		if (random_map_free(f, *b))
			goto ret_good;

		dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n",
									*b);
	} while (--loops);

	if (!f->failed_rands++)
		f->last_free_lookup = 0;

	/*
	 * we get here, if we didn't succeed in looking up a block. generate
	 * a random start offset into the filemap, and find the first free
	 * block from there.
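	 * The random restart below is tried a limited number of times; if
	 * that also fails, we fall back to an exhaustive scan from the
	 * start of the map.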
	 */
	loops = 10;
	do {
		f->last_free_lookup = (f->num_maps - 1) *
					(r / (OS_RAND_MAX + 1.0));
		if (!get_next_free_block(td, f, ddir, b))
			goto ret;

		r = os_random_long(&td->random_state);
	} while (--loops);

	/*
	 * that didn't work either, try exhaustive search from the start
	 */
	f->last_free_lookup = 0;
ffz:
	if (!get_next_free_block(td, f, ddir, b))
		return 0;
	f->last_free_lookup = 0;
	return get_next_free_block(td, f, ddir, b);
ret_good:
	f->failed_rands = 0;
ret:
	return 0;
}

static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
			       enum fio_ddir ddir, unsigned long long *b)
{
	if (get_next_rand_offset(td, f, ddir, b)) {
		dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n",
				f->file_name, f->last_pos, f->real_file_size);
		return 1;
	}

	return 0;
}

static int get_next_seq_block(struct thread_data *td, struct fio_file *f,
			      enum fio_ddir ddir, unsigned long long *b)
{
	assert(ddir_rw(ddir));

	if (f->last_pos < f->real_file_size) {
		*b = (f->last_pos - f->file_offset) / td->o.min_bs[ddir];
		return 0;
	}

	return 1;
}

static int get_next_block(struct thread_data *td, struct io_u *io_u,
			  enum fio_ddir ddir, int rw_seq, unsigned long long *b)
{
	struct fio_file *f = io_u->file;
	int ret;

	assert(ddir_rw(ddir));

	if (rw_seq) {
		if (td_random(td))
			ret = get_next_rand_block(td, f, ddir, b);
		else
			ret = get_next_seq_block(td, f, ddir, b);
	} else {
		io_u->flags |= IO_U_F_BUSY_OK;

		if (td->o.rw_seq == RW_SEQ_SEQ) {
			ret = get_next_seq_block(td, f, ddir, b);
			if (ret)
				ret = get_next_rand_block(td, f, ddir, b);
		} else if (td->o.rw_seq == RW_SEQ_IDENT) {
			if (f->last_start != -1ULL)
				*b = (f->last_start - f->file_offset)
					/ td->o.min_bs[ddir];
			else
				*b = 0;
			ret = 0;
		} else {
			log_err("fio: unknown rw_seq=%d\n", td->o.rw_seq);
			ret = 1;
		}
	}

	return ret;
}

/*
 * For random io, generate a random new block and see if it's used. Repeat
 * until we find a free one. For sequential io, just return the end of
 * the last io issued.
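 * get_next_block() handles the mixing: when the ddir_seq_nr counter
 * expires the block is picked according to the job's random/sequential
 * setting, otherwise td->o.rw_seq decides whether to continue
 * sequentially from the last block or reuse the previous start offset.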
 */
static int __get_next_offset(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	unsigned long long b;
	enum fio_ddir ddir = io_u->ddir;
	int rw_seq_hit = 0;

	assert(ddir_rw(ddir));

	if (td->o.ddir_seq_nr && !--td->ddir_seq_nr) {
		rw_seq_hit = 1;
		td->ddir_seq_nr = td->o.ddir_seq_nr;
	}

	if (get_next_block(td, io_u, ddir, rw_seq_hit, &b))
		return 1;

	io_u->offset = b * td->o.ba[ddir];
	if (io_u->offset >= f->io_size) {
		dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n",
					io_u->offset, f->io_size);
		return 1;
	}

	io_u->offset += f->file_offset;
	if (io_u->offset >= f->real_file_size) {
		dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n",
					io_u->offset, f->real_file_size);
		return 1;
	}

	return 0;
}

static int get_next_offset(struct thread_data *td, struct io_u *io_u)
{
	struct prof_io_ops *ops = &td->prof_io_ops;

	if (ops->fill_io_u_off)
		return ops->fill_io_u_off(td, io_u);

	return __get_next_offset(td, io_u);
}

static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u)
{
	const int ddir = io_u->ddir;
	unsigned int uninitialized_var(buflen);
	unsigned int minbs, maxbs;
	long r;

	assert(ddir_rw(ddir));

	minbs = td->o.min_bs[ddir];
	maxbs = td->o.max_bs[ddir];

	if (minbs == maxbs)
		buflen = minbs;
	else {
		r = os_random_long(&td->bsrange_state);
		if (!td->o.bssplit_nr[ddir]) {
			buflen = 1 + (unsigned int) ((double) maxbs *
					(r / (OS_RAND_MAX + 1.0)));
			if (buflen < minbs)
				buflen = minbs;
		} else {
			long perc = 0;
			unsigned int i;

			for (i = 0; i < td->o.bssplit_nr[ddir]; i++) {
				struct bssplit *bsp = &td->o.bssplit[ddir][i];

				buflen = bsp->bs;
				perc += bsp->perc;
				if (r <= ((OS_RAND_MAX / 100L) * perc))
					break;
			}
		}
		if (!td->o.bs_unaligned && is_power_of_2(minbs))
			buflen = (buflen + minbs - 1) & ~(minbs - 1);
	}

	if (io_u->offset + buflen > io_u->file->real_file_size) {
		dprint(FD_IO, "lower buflen %u -> %u (ddir=%d)\n", buflen,
						minbs, ddir);
		buflen = minbs;
	}

	return buflen;
}

static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u)
{
	struct prof_io_ops *ops = &td->prof_io_ops;

	if (ops->fill_io_u_size)
		return ops->fill_io_u_size(td, io_u);

	return __get_next_buflen(td, io_u);
}

static void set_rwmix_bytes(struct thread_data *td)
{
	unsigned int diff;

	/*
	 * we do time or byte based switch. this is needed because
	 * buffered writes may issue a lot quicker than they complete,
	 * whereas reads do not.
	 */
	diff = td->o.rwmix[td->rwmix_ddir ^ 1];
	td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100;
}

static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
{
	unsigned int v;
	long r;

	r = os_random_long(&td->rwmix_state);
	v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0)));
	if (v <= td->o.rwmix[DDIR_READ])
		return DDIR_READ;

	return DDIR_WRITE;
}
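
/*
 * Before using the chosen data direction, check whether rate limiting
 * has accumulated pending sleep for it. If so, sleep it off here; for a
 * mixed workload we may switch to the other direction instead if that
 * one is not as backed up.
 */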
static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
{
	enum fio_ddir odir = ddir ^ 1;
	struct timeval t;
	long usec;

	assert(ddir_rw(ddir));

	if (td->rate_pending_usleep[ddir] <= 0)
		return ddir;

	/*
	 * We have too much pending sleep in this direction. See if we
	 * should switch.
	 */
	if (td_rw(td)) {
		/*
		 * Other direction does not have too much pending, switch
		 */
		if (td->rate_pending_usleep[odir] < 100000)
			return odir;

		/*
		 * Both directions have pending sleep. Sleep the minimum time
		 * and deduct from both.
		 */
		if (td->rate_pending_usleep[ddir] <=
			td->rate_pending_usleep[odir]) {
			usec = td->rate_pending_usleep[ddir];
		} else {
			usec = td->rate_pending_usleep[odir];
			ddir = odir;
		}
	} else
		usec = td->rate_pending_usleep[ddir];

	fio_gettime(&t, NULL);
	usec_sleep(td, usec);
	usec = utime_since_now(&t);

	td->rate_pending_usleep[ddir] -= usec;

	odir = ddir ^ 1;
	if (td_rw(td) && __should_check_rate(td, odir))
		td->rate_pending_usleep[odir] -= usec;

	return ddir;
}

/*
 * Return the data direction for the next io_u. If the job is a
 * mixed read/write workload, check the rwmix cycle and switch if
 * necessary.
 */
static enum fio_ddir get_rw_ddir(struct thread_data *td)
{
	enum fio_ddir ddir;

	/*
	 * see if it's time to fsync
	 */
	if (td->o.fsync_blocks &&
	   !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
	     td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_SYNC;

	/*
	 * see if it's time to fdatasync
	 */
	if (td->o.fdatasync_blocks &&
	   !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
	     td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_DATASYNC;

	/*
	 * see if it's time to sync_file_range
	 */
	if (td->sync_file_range_nr &&
	   !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
	     td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_SYNC_FILE_RANGE;

	if (td_rw(td)) {
		/*
		 * Check if it's time to seed a new data direction.
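		 * Once the issue count for the current direction reaches
		 * rwmix_issues, pick a (possibly new) direction at random
		 * according to the rwmix percentages.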
		 */
		if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) {
			/*
			 * Put a top limit on how many bytes we do for
			 * one data direction, to avoid overflowing the
			 * ranges too much
			 */
			ddir = get_rand_ddir(td);

			if (ddir != td->rwmix_ddir)
				set_rwmix_bytes(td);

			td->rwmix_ddir = ddir;
		}
		ddir = td->rwmix_ddir;
	} else if (td_read(td))
		ddir = DDIR_READ;
	else
		ddir = DDIR_WRITE;

	td->rwmix_ddir = rate_ddir(td, ddir);
	return td->rwmix_ddir;
}

static void set_rw_ddir(struct thread_data *td, struct io_u *io_u)
{
	io_u->ddir = get_rw_ddir(td);

	if (io_u->ddir == DDIR_WRITE && (td->io_ops->flags & FIO_BARRIER) &&
	    td->o.barrier_blocks &&
	   !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) &&
	     td->io_issues[DDIR_WRITE])
		io_u->flags |= IO_U_F_BARRIER;
}

void put_file_log(struct thread_data *td, struct fio_file *f)
{
	int ret = put_file(td, f);

	if (ret)
		td_verror(td, ret, "file close");
}

void put_io_u(struct thread_data *td, struct io_u *io_u)
{
	td_io_u_lock(td);

	io_u->flags |= IO_U_F_FREE;
	io_u->flags &= ~IO_U_F_FREE_DEF;

	if (io_u->file)
		put_file_log(td, io_u->file);

	io_u->file = NULL;
	if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
		td->cur_depth--;
	flist_del_init(&io_u->list);
	flist_add(&io_u->list, &td->io_u_freelist);
	td_io_u_unlock(td);
	td_io_u_free_notify(td);
}

void clear_io_u(struct thread_data *td, struct io_u *io_u)
{
	io_u->flags &= ~IO_U_F_FLIGHT;
	put_io_u(td, io_u);
}

void requeue_io_u(struct thread_data *td, struct io_u **io_u)
{
	struct io_u *__io_u = *io_u;

	dprint(FD_IO, "requeue %p\n", __io_u);

	td_io_u_lock(td);

	__io_u->flags |= IO_U_F_FREE;
	if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(__io_u->ddir))
		td->io_issues[__io_u->ddir]--;

	__io_u->flags &= ~IO_U_F_FLIGHT;
	if (__io_u->flags & IO_U_F_IN_CUR_DEPTH)
		td->cur_depth--;
	flist_del(&__io_u->list);
	flist_add_tail(&__io_u->list, &td->io_u_requeues);
	td_io_u_unlock(td);
	*io_u = NULL;
}

static int fill_io_u(struct thread_data *td, struct io_u *io_u)
{
	if (td->io_ops->flags & FIO_NOIO)
		goto out;

	set_rw_ddir(td, io_u);

	/*
	 * fsync() or fdatasync() or trim etc, we are done
	 */
	if (!ddir_rw(io_u->ddir))
		goto out;

	/*
	 * See if it's time to switch to a new zone
	 */
	if (td->zone_bytes >= td->o.zone_size) {
		td->zone_bytes = 0;
		io_u->file->last_pos += td->o.zone_skip;
		td->io_skip_bytes += td->o.zone_skip;
	}

	/*
	 * No log, let the seq/rand engine retrieve the next buflen and
	 * position.
	 */
	if (get_next_offset(td, io_u)) {
		dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
		return 1;
	}

	io_u->buflen = get_next_buflen(td, io_u);
	if (!io_u->buflen) {
		dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
		return 1;
	}

	if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
		dprint(FD_IO, "io_u %p, offset too large\n", io_u);
		dprint(FD_IO, " off=%llu/%lu > %llu\n", io_u->offset,
				io_u->buflen, io_u->file->real_file_size);
		return 1;
	}

	/*
	 * mark entry before potentially trimming io_u
	 */
	if (td_random(td) && file_randommap(td, io_u->file))
		mark_random_map(td, io_u);

	/*
	 * If using a write iolog, store this entry.
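	 * log_io_u() below takes care of that.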
	 */
out:
	dprint_io_u(io_u, "fill_io_u");
	td->zone_bytes += io_u->buflen;
	log_io_u(td, io_u);
	return 0;
}

static void __io_u_mark_map(unsigned int *map, unsigned int nr)
{
	int idx = 0;

	switch (nr) {
	default:
		idx = 6;
		break;
	case 33 ... 64:
		idx = 5;
		break;
	case 17 ... 32:
		idx = 4;
		break;
	case 9 ... 16:
		idx = 3;
		break;
	case 5 ... 8:
		idx = 2;
		break;
	case 1 ... 4:
		idx = 1;
	case 0:
		break;
	}

	map[idx]++;
}

void io_u_mark_submit(struct thread_data *td, unsigned int nr)
{
	__io_u_mark_map(td->ts.io_u_submit, nr);
	td->ts.total_submit++;
}

void io_u_mark_complete(struct thread_data *td, unsigned int nr)
{
	__io_u_mark_map(td->ts.io_u_complete, nr);
	td->ts.total_complete++;
}

void io_u_mark_depth(struct thread_data *td, unsigned int nr)
{
	int idx = 0;

	switch (td->cur_depth) {
	default:
		idx = 6;
		break;
	case 32 ... 63:
		idx = 5;
		break;
	case 16 ... 31:
		idx = 4;
		break;
	case 8 ... 15:
		idx = 3;
		break;
	case 4 ... 7:
		idx = 2;
		break;
	case 2 ... 3:
		idx = 1;
	case 1:
		break;
	}

	td->ts.io_u_map[idx] += nr;
}

static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
{
	int idx = 0;

	assert(usec < 1000);

	switch (usec) {
	case 750 ... 999:
		idx = 9;
		break;
	case 500 ... 749:
		idx = 8;
		break;
	case 250 ... 499:
		idx = 7;
		break;
	case 100 ... 249:
		idx = 6;
		break;
	case 50 ... 99:
		idx = 5;
		break;
	case 20 ... 49:
		idx = 4;
		break;
	case 10 ... 19:
		idx = 3;
		break;
	case 4 ... 9:
		idx = 2;
		break;
	case 2 ... 3:
		idx = 1;
	case 0 ... 1:
		break;
	}

	assert(idx < FIO_IO_U_LAT_U_NR);
	td->ts.io_u_lat_u[idx]++;
}
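
/*
 * Same bucketing as above, for latencies of a millisecond and up.
 */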
static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
{
	int idx = 0;

	switch (msec) {
	default:
		idx = 11;
		break;
	case 1000 ... 1999:
		idx = 10;
		break;
	case 750 ... 999:
		idx = 9;
		break;
	case 500 ... 749:
		idx = 8;
		break;
	case 250 ... 499:
		idx = 7;
		break;
	case 100 ... 249:
		idx = 6;
		break;
	case 50 ... 99:
		idx = 5;
		break;
	case 20 ... 49:
		idx = 4;
		break;
	case 10 ... 19:
		idx = 3;
		break;
	case 4 ... 9:
		idx = 2;
		break;
	case 2 ... 3:
		idx = 1;
	case 0 ... 1:
		break;
	}

	assert(idx < FIO_IO_U_LAT_M_NR);
	td->ts.io_u_lat_m[idx]++;
}

static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
{
	if (usec < 1000)
		io_u_mark_lat_usec(td, usec);
	else
		io_u_mark_lat_msec(td, usec / 1000);
}

/*
 * Get next file to service by choosing one at random
 */
static struct fio_file *get_next_file_rand(struct thread_data *td,
					   enum fio_file_flags goodf,
					   enum fio_file_flags badf)
{
	struct fio_file *f;
	int fno;

	do {
		long r = os_random_long(&td->next_file_state);
		int opened = 0;

		fno = (unsigned int) ((double) td->o.nr_files
			* (r / (OS_RAND_MAX + 1.0)));
		f = td->files[fno];
		if (fio_file_done(f))
			continue;

		if (!fio_file_open(f)) {
			int err;

			err = td_io_open_file(td, f);
			if (err)
				continue;
			opened = 1;
		}

		if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) {
			dprint(FD_FILE, "get_next_file_rand: %p\n", f);
			return f;
		}
		if (opened)
			td_io_close_file(td, f);
	} while (1);
}

/*
 * Get next file to service by doing round robin between all available ones
 */
static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf,
					 int badf)
{
	unsigned int old_next_file = td->next_file;
	struct fio_file *f;

	do {
		int opened = 0;

		f = td->files[td->next_file];

		td->next_file++;
		if (td->next_file >= td->o.nr_files)
			td->next_file = 0;

		dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags);
		if (fio_file_done(f)) {
			f = NULL;
			continue;
		}

		if (!fio_file_open(f)) {
			int err;

			err = td_io_open_file(td, f);
			if (err) {
				dprint(FD_FILE, "error %d on open of %s\n",
					err, f->file_name);
				f = NULL;
				continue;
			}
			opened = 1;
		}

		dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf,
								f->flags);
		if ((!goodf || (f->flags & goodf)) && !(f->flags & badf))
			break;

		if (opened)
			td_io_close_file(td, f);

		f = NULL;
	} while (td->next_file != old_next_file);

	dprint(FD_FILE, "get_next_file_rr: %p\n", f);
	return f;
}

static struct fio_file *__get_next_file(struct thread_data *td)
{
	struct fio_file *f;

	assert(td->o.nr_files <= td->files_index);

	if (td->nr_done_files >= td->o.nr_files) {
		dprint(FD_FILE, "get_next_file: nr_open=%d, nr_done=%d,"
				" nr_files=%d\n", td->nr_open_files,
						  td->nr_done_files,
						  td->o.nr_files);
		return NULL;
	}

	f = td->file_service_file;
	if (f && fio_file_open(f) && !fio_file_closing(f)) {
		if (td->o.file_service_type == FIO_FSERVICE_SEQ)
			goto out;
		if (td->file_service_left--)
			goto out;
	}

	if (td->o.file_service_type == FIO_FSERVICE_RR ||
	    td->o.file_service_type == FIO_FSERVICE_SEQ)
		f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing);
	else
		f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing);

	td->file_service_file = f;
	td->file_service_left = td->file_service_nr - 1;
out:
	dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name);
	return f;
}

static struct fio_file *get_next_file(struct thread_data *td)
{
	struct prof_io_ops *ops = &td->prof_io_ops;

	if (ops->get_next_file)
		return ops->get_next_file(td);

	return __get_next_file(td);
}
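
/*
 * Attach a file to the io_u and fill in offset/length. A file that can
 * no longer produce io is marked done and the next one is tried.
 */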
static int set_io_u_file(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f;

	do {
		f = get_next_file(td);
		if (!f)
			return 1;

		io_u->file = f;
		get_file(f);

		if (!fill_io_u(td, io_u))
			break;

		put_file_log(td, f);
		td_io_close_file(td, f);
		io_u->file = NULL;
		fio_file_set_done(f);
		td->nr_done_files++;
		dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
							td->nr_done_files,
							td->o.nr_files);
	} while (1);

	return 0;
}


struct io_u *__get_io_u(struct thread_data *td)
{
	struct io_u *io_u = NULL;

	td_io_u_lock(td);

again:
	if (!flist_empty(&td->io_u_requeues))
		io_u = flist_entry(td->io_u_requeues.next, struct io_u, list);
	else if (!queue_full(td)) {
		io_u = flist_entry(td->io_u_freelist.next, struct io_u, list);

		io_u->buflen = 0;
		io_u->resid = 0;
		io_u->file = NULL;
		io_u->end_io = NULL;
	}

	if (io_u) {
		assert(io_u->flags & IO_U_F_FREE);
		io_u->flags &= ~(IO_U_F_FREE | IO_U_F_FREE_DEF);
		io_u->flags &= ~(IO_U_F_TRIMMED | IO_U_F_BARRIER);

		io_u->error = 0;
		flist_del(&io_u->list);
		flist_add(&io_u->list, &td->io_u_busylist);
		td->cur_depth++;
		io_u->flags |= IO_U_F_IN_CUR_DEPTH;
	} else if (td->o.verify_async) {
		/*
		 * We ran out, wait for async verify threads to finish and
		 * return one
		 */
		pthread_cond_wait(&td->free_cond, &td->io_u_lock);
		goto again;
	}

	td_io_u_unlock(td);
	return io_u;
}

static int check_get_trim(struct thread_data *td, struct io_u *io_u)
{
	if (td->o.trim_backlog && td->trim_entries) {
		int get_trim = 0;

		if (td->trim_batch) {
			td->trim_batch--;
			get_trim = 1;
		} else if (!(td->io_hist_len % td->o.trim_backlog) &&
			    td->last_ddir != DDIR_READ) {
			td->trim_batch = td->o.trim_batch;
			if (!td->trim_batch)
				td->trim_batch = td->o.trim_backlog;
			get_trim = 1;
		}

		if (get_trim && !get_next_trim(td, io_u))
			return 1;
	}

	return 0;
}

static int check_get_verify(struct thread_data *td, struct io_u *io_u)
{
	if (td->o.verify_backlog && td->io_hist_len) {
		int get_verify = 0;

		if (td->verify_batch) {
			td->verify_batch--;
			get_verify = 1;
		} else if (!(td->io_hist_len % td->o.verify_backlog) &&
			    td->last_ddir != DDIR_READ) {
			td->verify_batch = td->o.verify_batch;
			if (!td->verify_batch)
				td->verify_batch = td->o.verify_backlog;
			get_verify = 1;
		}

		if (get_verify && !get_next_verify(td, io_u))
			return 1;
	}

	return 0;
}

/*
 * Return an io_u to be processed. Gets a buflen and offset, sets direction,
 * etc. The returned io_u is fully ready to be prepped and submitted.
 */
struct io_u *get_io_u(struct thread_data *td)
{
	struct fio_file *f;
	struct io_u *io_u;

	io_u = __get_io_u(td);
	if (!io_u) {
		dprint(FD_IO, "__get_io_u failed\n");
		return NULL;
	}

	if (check_get_verify(td, io_u))
		goto out;
	if (check_get_trim(td, io_u))
		goto out;

	/*
	 * from a requeue, io_u already setup
	 */
	if (io_u->file)
		goto out;

	/*
	 * If using an iolog, grab next piece if any available.
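	 * Otherwise, pick a file and generate an offset/length for it.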
	 */
	if (td->o.read_iolog_file) {
		if (read_iolog_get(td, io_u))
			goto err_put;
	} else if (set_io_u_file(td, io_u)) {
		dprint(FD_IO, "io_u %p, setting file failed\n", io_u);
		goto err_put;
	}

	f = io_u->file;
	assert(fio_file_open(f));

	if (ddir_rw(io_u->ddir)) {
		if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) {
			dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
			goto err_put;
		}

		f->last_start = io_u->offset;
		f->last_pos = io_u->offset + io_u->buflen;

		if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_WRITE)
			populate_verify_io_u(td, io_u);
		else if (td->o.refill_buffers && io_u->ddir == DDIR_WRITE)
			io_u_fill_buffer(td, io_u, io_u->xfer_buflen);
		else if (io_u->ddir == DDIR_READ) {
			/*
			 * Reset the buf_filled parameters so next time if the
			 * buffer is used for writes it is refilled.
			 */
			io_u->buf_filled_len = 0;
		}
	}

	/*
	 * Set io data pointers.
	 */
	io_u->xfer_buf = io_u->buf;
	io_u->xfer_buflen = io_u->buflen;

out:
	assert(io_u->file);
	if (!td_io_prep(td, io_u)) {
		if (!td->o.disable_slat)
			fio_gettime(&io_u->start_time, NULL);
		return io_u;
	}
err_put:
	dprint(FD_IO, "get_io_u failed\n");
	put_io_u(td, io_u);
	return NULL;
}

void io_u_log_error(struct thread_data *td, struct io_u *io_u)
{
	const char *msg[] = { "read", "write", "sync", "datasync",
				"sync_file_range", "wait", "trim" };

	log_err("fio: io_u error");

	if (io_u->file)
		log_err(" on file %s", io_u->file->file_name);

	log_err(": %s\n", strerror(io_u->error));

	log_err(" %s offset=%llu, buflen=%lu\n", msg[io_u->ddir],
					io_u->offset, io_u->xfer_buflen);

	if (!td->error)
		td_verror(td, io_u->error, "io_u error");
}
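
/*
 * Per-io completion handling: update byte and latency accounting, rate
 * throttling state and verify logging, and collect any error for the
 * caller.
 */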
static void io_completed(struct thread_data *td, struct io_u *io_u,
			 struct io_completion_data *icd)
{
	/*
	 * Older gcc's are too dumb to realize that usec is always used
	 * initialized, silence that warning.
	 */
	unsigned long uninitialized_var(usec);
	struct fio_file *f;

	dprint_io_u(io_u, "io complete");

	td_io_u_lock(td);
	assert(io_u->flags & IO_U_F_FLIGHT);
	io_u->flags &= ~(IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
	td_io_u_unlock(td);

	if (ddir_sync(io_u->ddir)) {
		td->last_was_sync = 1;
		f = io_u->file;
		if (f) {
			f->first_write = -1ULL;
			f->last_write = -1ULL;
		}
		return;
	}

	td->last_was_sync = 0;
	td->last_ddir = io_u->ddir;

	if (!io_u->error && ddir_rw(io_u->ddir)) {
		unsigned int bytes = io_u->buflen - io_u->resid;
		const enum fio_ddir idx = io_u->ddir;
		const enum fio_ddir odx = io_u->ddir ^ 1;
		int ret;

		td->io_blocks[idx]++;
		td->io_bytes[idx] += bytes;
		td->this_io_bytes[idx] += bytes;

		if (idx == DDIR_WRITE) {
			f = io_u->file;
			if (f) {
				if (f->first_write == -1ULL ||
				    io_u->offset < f->first_write)
					f->first_write = io_u->offset;
				if (f->last_write == -1ULL ||
				    ((io_u->offset + bytes) > f->last_write))
					f->last_write = io_u->offset + bytes;
			}
		}

		if (ramp_time_over(td)) {
			unsigned long uninitialized_var(lusec);

			if (!td->o.disable_clat || !td->o.disable_bw)
				lusec = utime_since(&io_u->issue_time,
							&icd->time);
			if (!td->o.disable_lat) {
				unsigned long tusec;

				tusec = utime_since(&io_u->start_time,
							&icd->time);
				add_lat_sample(td, idx, tusec, bytes);
			}
			if (!td->o.disable_clat) {
				add_clat_sample(td, idx, lusec, bytes);
				io_u_mark_latency(td, lusec);
			}
			if (!td->o.disable_bw)
				add_bw_sample(td, idx, bytes, &icd->time);
			if (__should_check_rate(td, idx)) {
				td->rate_pending_usleep[idx] =
					((td->this_io_bytes[idx] *
					  td->rate_nsec_cycle[idx]) / 1000 -
					 utime_since_now(&td->start));
			}
			if (__should_check_rate(td, idx ^ 1))
				td->rate_pending_usleep[odx] =
					((td->this_io_bytes[odx] *
					  td->rate_nsec_cycle[odx]) / 1000 -
					 utime_since_now(&td->start));
		}

		if (td_write(td) && idx == DDIR_WRITE &&
		    td->o.do_verify &&
		    td->o.verify != VERIFY_NONE)
			log_io_piece(td, io_u);

		icd->bytes_done[idx] += bytes;

		if (io_u->end_io) {
			ret = io_u->end_io(td, io_u);
			if (ret && !icd->error)
				icd->error = ret;
		}
	} else if (io_u->error) {
		icd->error = io_u->error;
		io_u_log_error(td, io_u);
	}
	if (td->o.continue_on_error && icd->error &&
	    td_non_fatal_error(icd->error)) {
		/*
		 * If there is a non_fatal error, then add to the error count
		 * and clear all the errors.
		 */
		update_error_count(td, icd->error);
		td_clear_error(td);
		icd->error = 0;
		io_u->error = 0;
	}
}

static void init_icd(struct thread_data *td, struct io_completion_data *icd,
		     int nr)
{
	if (!td->o.disable_clat || !td->o.disable_bw)
		fio_gettime(&icd->time, NULL);

	icd->nr = nr;

	icd->error = 0;
	icd->bytes_done[0] = icd->bytes_done[1] = 0;
}

static void ios_completed(struct thread_data *td,
			  struct io_completion_data *icd)
{
	struct io_u *io_u;
	int i;

	for (i = 0; i < icd->nr; i++) {
		io_u = td->io_ops->event(td, i);

		io_completed(td, io_u, icd);

		if (!(io_u->flags & IO_U_F_FREE_DEF))
			put_io_u(td, io_u);
	}
}

/*
 * Complete a single io_u for the sync engines.
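 * The per-direction byte counts are added to the caller's bytes[]
 * array, if one is provided.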
 */
int io_u_sync_complete(struct thread_data *td, struct io_u *io_u,
		       unsigned long *bytes)
{
	struct io_completion_data icd;

	init_icd(td, &icd, 1);
	io_completed(td, io_u, &icd);

	if (!(io_u->flags & IO_U_F_FREE_DEF))
		put_io_u(td, io_u);

	if (icd.error) {
		td_verror(td, icd.error, "io_u_sync_complete");
		return -1;
	}

	if (bytes) {
		bytes[0] += icd.bytes_done[0];
		bytes[1] += icd.bytes_done[1];
	}

	return 0;
}

/*
 * Called to complete min_events number of io for the async engines.
 */
int io_u_queued_complete(struct thread_data *td, int min_evts,
			 unsigned long *bytes)
{
	struct io_completion_data icd;
	struct timespec *tvp = NULL;
	int ret;
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };

	dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts);

	if (!min_evts)
		tvp = &ts;

	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
	if (ret < 0) {
		td_verror(td, -ret, "td_io_getevents");
		return ret;
	} else if (!ret)
		return ret;

	init_icd(td, &icd, ret);
	ios_completed(td, &icd);
	if (icd.error) {
		td_verror(td, icd.error, "io_u_queued_complete");
		return -1;
	}

	if (bytes) {
		bytes[0] += icd.bytes_done[0];
		bytes[1] += icd.bytes_done[1];
	}

	return 0;
}

/*
 * Call when io_u is really queued, to update the submission latency.
 */
void io_u_queued(struct thread_data *td, struct io_u *io_u)
{
	if (!td->o.disable_slat) {
		unsigned long slat_time;

		slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
		add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen);
	}
}

/*
 * "randomly" fill the buffer contents
 */
void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u,
		      unsigned int max_bs)
{
	io_u->buf_filled_len = 0;

	if (!td->o.zero_buffers)
		fill_random_buf(io_u->buf, max_bs);
	else
		memset(io_u->buf, 0, max_bs);
}