io_u.c revision eda3a60699e1d96bb68875ef2169ca819eb8f4f9
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <assert.h>

#include "fio.h"
#include "hash.h"
#include "verify.h"
#include "trim.h"
#include "lib/rand.h"
#include "lib/axmap.h"
#include "err.h"
#include "lib/pow2.h"
#include "minmax.h"

struct io_completion_data {
	int nr;				/* input */

	int error;			/* output */
	uint64_t bytes_done[DDIR_RWDIR_CNT];	/* output */
	struct timeval time;		/* output */
};

/*
 * The ->io_axmap contains a map of blocks we have or have not done io
 * to yet. Used to make sure we cover the entire range in a fair fashion.
 */
static bool random_map_free(struct fio_file *f, const uint64_t block)
{
	return !axmap_isset(f->io_axmap, block);
}

/*
 * Mark a given offset as used in the map.
 */
static void mark_random_map(struct thread_data *td, struct io_u *io_u)
{
	unsigned int min_bs = td->o.rw_min_bs;
	struct fio_file *f = io_u->file;
	unsigned int nr_blocks;
	uint64_t block;

	block = (io_u->offset - f->file_offset) / (uint64_t) min_bs;
	nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;

	if (!(io_u->flags & IO_U_F_BUSY_OK))
		nr_blocks = axmap_set_nr(f->io_axmap, block, nr_blocks);

	if ((nr_blocks * min_bs) < io_u->buflen)
		io_u->buflen = nr_blocks * min_bs;
}

static uint64_t last_block(struct thread_data *td, struct fio_file *f,
			   enum fio_ddir ddir)
{
	uint64_t max_blocks;
	uint64_t max_size;

	assert(ddir_rw(ddir));

	/*
	 * Hmm, should we make sure that ->io_size <= ->real_file_size?
	 * -> not for now, since there is code assuming it could go either way.
	 */
	max_size = f->io_size;
	if (max_size > f->real_file_size)
		max_size = f->real_file_size;

	if (td->o.zone_range)
		max_size = td->o.zone_range;

	if (td->o.min_bs[ddir] > td->o.ba[ddir])
		max_size -= td->o.min_bs[ddir] - td->o.ba[ddir];

	max_blocks = max_size / (uint64_t) td->o.ba[ddir];
	if (!max_blocks)
		return 0;

	return max_blocks;
}

struct rand_off {
	struct flist_head list;
	uint64_t off;
};

static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
				  enum fio_ddir ddir, uint64_t *b,
				  uint64_t lastb)
{
	uint64_t r;

	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE ||
	    td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) {

		r = __rand(&td->random_state);

		dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r);

		*b = lastb * (r / (rand_max(&td->random_state) + 1.0));
	} else {
		uint64_t off = 0;

		assert(fio_file_lfsr(f));

		if (lfsr_next(&f->lfsr, &off))
			return 1;

		*b = off;
	}

	/*
	 * if we are not maintaining a random map, we are done.
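	 *
	 * Note that with the Tausworthe generators above, the block is
	 * chosen as lastb * r / (rand_max + 1), i.e. the raw random value
	 * is scaled uniformly into [0, lastb); e.g. lastb = 1000 with
	 * r = rand_max / 2 gives a block index of roughly 500.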
116 */ 117 if (!file_randommap(td, f)) 118 goto ret; 119 120 /* 121 * calculate map offset and check if it's free 122 */ 123 if (random_map_free(f, *b)) 124 goto ret; 125 126 dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n", 127 (unsigned long long) *b); 128 129 *b = axmap_next_free(f->io_axmap, *b); 130 if (*b == (uint64_t) -1ULL) 131 return 1; 132ret: 133 return 0; 134} 135 136static int __get_next_rand_offset_zipf(struct thread_data *td, 137 struct fio_file *f, enum fio_ddir ddir, 138 uint64_t *b) 139{ 140 *b = zipf_next(&f->zipf); 141 return 0; 142} 143 144static int __get_next_rand_offset_pareto(struct thread_data *td, 145 struct fio_file *f, enum fio_ddir ddir, 146 uint64_t *b) 147{ 148 *b = pareto_next(&f->zipf); 149 return 0; 150} 151 152static int __get_next_rand_offset_gauss(struct thread_data *td, 153 struct fio_file *f, enum fio_ddir ddir, 154 uint64_t *b) 155{ 156 *b = gauss_next(&f->gauss); 157 return 0; 158} 159 160static int __get_next_rand_offset_zoned(struct thread_data *td, 161 struct fio_file *f, enum fio_ddir ddir, 162 uint64_t *b) 163{ 164 unsigned int v, send, stotal; 165 uint64_t offset, lastb; 166 static int warned; 167 struct zone_split_index *zsi; 168 169 lastb = last_block(td, f, ddir); 170 if (!lastb) 171 return 1; 172 173 if (!td->o.zone_split_nr[ddir]) { 174bail: 175 return __get_next_rand_offset(td, f, ddir, b, lastb); 176 } 177 178 /* 179 * Generate a value, v, between 1 and 100, both inclusive 180 */ 181 v = rand32_between(&td->zone_state, 1, 100); 182 183 zsi = &td->zone_state_index[ddir][v - 1]; 184 stotal = zsi->size_perc_prev; 185 send = zsi->size_perc; 186 187 /* 188 * Should never happen 189 */ 190 if (send == -1U) { 191 if (!warned) { 192 log_err("fio: bug in zoned generation\n"); 193 warned = 1; 194 } 195 goto bail; 196 } 197 198 /* 199 * 'send' is some percentage below or equal to 100 that 200 * marks the end of the current IO range. 'stotal' marks 201 * the start, in percent. 
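	 *
	 * As an illustration, with fio's zoned syntax (e.g.
	 * random_distribution=zoned:60/10:30/20:8/30:2/40) a draw of
	 * v = 37 lands in the first split: stotal = 0, send = 10, so the
	 * block is generated within the first 10% of lastb and then
	 * shifted by stotal * lastb / 100 (zero in this case).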
202 */ 203 if (stotal) 204 offset = stotal * lastb / 100ULL; 205 else 206 offset = 0; 207 208 lastb = lastb * (send - stotal) / 100ULL; 209 210 /* 211 * Generate index from 0..send-of-lastb 212 */ 213 if (__get_next_rand_offset(td, f, ddir, b, lastb) == 1) 214 return 1; 215 216 /* 217 * Add our start offset, if any 218 */ 219 if (offset) 220 *b += offset; 221 222 return 0; 223} 224 225static int flist_cmp(void *data, struct flist_head *a, struct flist_head *b) 226{ 227 struct rand_off *r1 = flist_entry(a, struct rand_off, list); 228 struct rand_off *r2 = flist_entry(b, struct rand_off, list); 229 230 return r1->off - r2->off; 231} 232 233static int get_off_from_method(struct thread_data *td, struct fio_file *f, 234 enum fio_ddir ddir, uint64_t *b) 235{ 236 if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) { 237 uint64_t lastb; 238 239 lastb = last_block(td, f, ddir); 240 if (!lastb) 241 return 1; 242 243 return __get_next_rand_offset(td, f, ddir, b, lastb); 244 } else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) 245 return __get_next_rand_offset_zipf(td, f, ddir, b); 246 else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) 247 return __get_next_rand_offset_pareto(td, f, ddir, b); 248 else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS) 249 return __get_next_rand_offset_gauss(td, f, ddir, b); 250 else if (td->o.random_distribution == FIO_RAND_DIST_ZONED) 251 return __get_next_rand_offset_zoned(td, f, ddir, b); 252 253 log_err("fio: unknown random distribution: %d\n", td->o.random_distribution); 254 return 1; 255} 256 257/* 258 * Sort the reads for a verify phase in batches of verifysort_nr, if 259 * specified. 260 */ 261static inline bool should_sort_io(struct thread_data *td) 262{ 263 if (!td->o.verifysort_nr || !td->o.do_verify) 264 return false; 265 if (!td_random(td)) 266 return false; 267 if (td->runstate != TD_VERIFYING) 268 return false; 269 if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE || 270 td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) 271 return false; 272 273 return true; 274} 275 276static bool should_do_random(struct thread_data *td, enum fio_ddir ddir) 277{ 278 unsigned int v; 279 280 if (td->o.perc_rand[ddir] == 100) 281 return true; 282 283 v = rand32_between(&td->seq_rand_state[ddir], 1, 100); 284 285 return v <= td->o.perc_rand[ddir]; 286} 287 288static int get_next_rand_offset(struct thread_data *td, struct fio_file *f, 289 enum fio_ddir ddir, uint64_t *b) 290{ 291 struct rand_off *r; 292 int i, ret = 1; 293 294 if (!should_sort_io(td)) 295 return get_off_from_method(td, f, ddir, b); 296 297 if (!flist_empty(&td->next_rand_list)) { 298fetch: 299 r = flist_first_entry(&td->next_rand_list, struct rand_off, list); 300 flist_del(&r->list); 301 *b = r->off; 302 free(r); 303 return 0; 304 } 305 306 for (i = 0; i < td->o.verifysort_nr; i++) { 307 r = malloc(sizeof(*r)); 308 309 ret = get_off_from_method(td, f, ddir, &r->off); 310 if (ret) { 311 free(r); 312 break; 313 } 314 315 flist_add(&r->list, &td->next_rand_list); 316 } 317 318 if (ret && !i) 319 return ret; 320 321 assert(!flist_empty(&td->next_rand_list)); 322 flist_sort(NULL, &td->next_rand_list, flist_cmp); 323 goto fetch; 324} 325 326static int get_next_rand_block(struct thread_data *td, struct fio_file *f, 327 enum fio_ddir ddir, uint64_t *b) 328{ 329 if (!get_next_rand_offset(td, f, ddir, b)) 330 return 0; 331 332 if (td->o.time_based || 333 (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)) { 334 fio_file_reset(td, f); 335 if (!get_next_rand_offset(td, f, ddir, b)) 336 
return 0; 337 } 338 339 dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n", 340 f->file_name, (unsigned long long) f->last_pos[ddir], 341 (unsigned long long) f->real_file_size); 342 return 1; 343} 344 345static int get_next_seq_offset(struct thread_data *td, struct fio_file *f, 346 enum fio_ddir ddir, uint64_t *offset) 347{ 348 struct thread_options *o = &td->o; 349 350 assert(ddir_rw(ddir)); 351 352 if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) && 353 o->time_based) { 354 struct thread_options *o = &td->o; 355 uint64_t io_size = f->io_size + (f->io_size % o->min_bs[ddir]); 356 357 if (io_size > f->last_pos[ddir]) 358 f->last_pos[ddir] = 0; 359 else 360 f->last_pos[ddir] = f->last_pos[ddir] - io_size; 361 } 362 363 if (f->last_pos[ddir] < f->real_file_size) { 364 uint64_t pos; 365 366 if (f->last_pos[ddir] == f->file_offset && o->ddir_seq_add < 0) { 367 if (f->real_file_size > f->io_size) 368 f->last_pos[ddir] = f->io_size; 369 else 370 f->last_pos[ddir] = f->real_file_size; 371 } 372 373 pos = f->last_pos[ddir] - f->file_offset; 374 if (pos && o->ddir_seq_add) { 375 pos += o->ddir_seq_add; 376 377 /* 378 * If we reach beyond the end of the file 379 * with holed IO, wrap around to the 380 * beginning again. If we're doing backwards IO, 381 * wrap to the end. 382 */ 383 if (pos >= f->real_file_size) { 384 if (o->ddir_seq_add > 0) 385 pos = f->file_offset; 386 else { 387 if (f->real_file_size > f->io_size) 388 pos = f->io_size; 389 else 390 pos = f->real_file_size; 391 392 pos += o->ddir_seq_add; 393 } 394 } 395 } 396 397 *offset = pos; 398 return 0; 399 } 400 401 return 1; 402} 403 404static int get_next_block(struct thread_data *td, struct io_u *io_u, 405 enum fio_ddir ddir, int rw_seq, 406 unsigned int *is_random) 407{ 408 struct fio_file *f = io_u->file; 409 uint64_t b, offset; 410 int ret; 411 412 assert(ddir_rw(ddir)); 413 414 b = offset = -1ULL; 415 416 if (rw_seq) { 417 if (td_random(td)) { 418 if (should_do_random(td, ddir)) { 419 ret = get_next_rand_block(td, f, ddir, &b); 420 *is_random = 1; 421 } else { 422 *is_random = 0; 423 io_u_set(td, io_u, IO_U_F_BUSY_OK); 424 ret = get_next_seq_offset(td, f, ddir, &offset); 425 if (ret) 426 ret = get_next_rand_block(td, f, ddir, &b); 427 } 428 } else { 429 *is_random = 0; 430 ret = get_next_seq_offset(td, f, ddir, &offset); 431 } 432 } else { 433 io_u_set(td, io_u, IO_U_F_BUSY_OK); 434 *is_random = 0; 435 436 if (td->o.rw_seq == RW_SEQ_SEQ) { 437 ret = get_next_seq_offset(td, f, ddir, &offset); 438 if (ret) { 439 ret = get_next_rand_block(td, f, ddir, &b); 440 *is_random = 0; 441 } 442 } else if (td->o.rw_seq == RW_SEQ_IDENT) { 443 if (f->last_start[ddir] != -1ULL) 444 offset = f->last_start[ddir] - f->file_offset; 445 else 446 offset = 0; 447 ret = 0; 448 } else { 449 log_err("fio: unknown rw_seq=%d\n", td->o.rw_seq); 450 ret = 1; 451 } 452 } 453 454 if (!ret) { 455 if (offset != -1ULL) 456 io_u->offset = offset; 457 else if (b != -1ULL) 458 io_u->offset = b * td->o.ba[ddir]; 459 else { 460 log_err("fio: bug in offset generation: offset=%llu, b=%llu\n", (unsigned long long) offset, (unsigned long long) b); 461 ret = 1; 462 } 463 } 464 465 return ret; 466} 467 468/* 469 * For random io, generate a random new block and see if it's used. Repeat 470 * until we find a free one. For sequential io, just return the end of 471 * the last io issued. 
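 *
 * A job can also mix the two: if percentage_random is set below 100,
 * should_do_random() above decides per I/O whether the random or the
 * sequential generator is used (e.g. percentage_random=80 keeps roughly
 * 4 out of 5 I/Os random and issues the rest sequentially).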
472 */ 473static int __get_next_offset(struct thread_data *td, struct io_u *io_u, 474 unsigned int *is_random) 475{ 476 struct fio_file *f = io_u->file; 477 enum fio_ddir ddir = io_u->ddir; 478 int rw_seq_hit = 0; 479 480 assert(ddir_rw(ddir)); 481 482 if (td->o.ddir_seq_nr && !--td->ddir_seq_nr) { 483 rw_seq_hit = 1; 484 td->ddir_seq_nr = td->o.ddir_seq_nr; 485 } 486 487 if (get_next_block(td, io_u, ddir, rw_seq_hit, is_random)) 488 return 1; 489 490 if (io_u->offset >= f->io_size) { 491 dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n", 492 (unsigned long long) io_u->offset, 493 (unsigned long long) f->io_size); 494 return 1; 495 } 496 497 io_u->offset += f->file_offset; 498 if (io_u->offset >= f->real_file_size) { 499 dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n", 500 (unsigned long long) io_u->offset, 501 (unsigned long long) f->real_file_size); 502 return 1; 503 } 504 505 return 0; 506} 507 508static int get_next_offset(struct thread_data *td, struct io_u *io_u, 509 unsigned int *is_random) 510{ 511 if (td->flags & TD_F_PROFILE_OPS) { 512 struct prof_io_ops *ops = &td->prof_io_ops; 513 514 if (ops->fill_io_u_off) 515 return ops->fill_io_u_off(td, io_u, is_random); 516 } 517 518 return __get_next_offset(td, io_u, is_random); 519} 520 521static inline bool io_u_fits(struct thread_data *td, struct io_u *io_u, 522 unsigned int buflen) 523{ 524 struct fio_file *f = io_u->file; 525 526 return io_u->offset + buflen <= f->io_size + get_start_offset(td, f); 527} 528 529static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u, 530 unsigned int is_random) 531{ 532 int ddir = io_u->ddir; 533 unsigned int buflen = 0; 534 unsigned int minbs, maxbs; 535 uint64_t frand_max, r; 536 bool power_2; 537 538 assert(ddir_rw(ddir)); 539 540 if (td->o.bs_is_seq_rand) 541 ddir = is_random ? DDIR_WRITE: DDIR_READ; 542 543 minbs = td->o.min_bs[ddir]; 544 maxbs = td->o.max_bs[ddir]; 545 546 if (minbs == maxbs) 547 return minbs; 548 549 /* 550 * If we can't satisfy the min block size from here, then fail 551 */ 552 if (!io_u_fits(td, io_u, minbs)) 553 return 0; 554 555 frand_max = rand_max(&td->bsrange_state); 556 do { 557 r = __rand(&td->bsrange_state); 558 559 if (!td->o.bssplit_nr[ddir]) { 560 buflen = 1 + (unsigned int) ((double) maxbs * 561 (r / (frand_max + 1.0))); 562 if (buflen < minbs) 563 buflen = minbs; 564 } else { 565 long long perc = 0; 566 unsigned int i; 567 568 for (i = 0; i < td->o.bssplit_nr[ddir]; i++) { 569 struct bssplit *bsp = &td->o.bssplit[ddir][i]; 570 571 buflen = bsp->bs; 572 perc += bsp->perc; 573 if (!perc) 574 break; 575 if ((r / perc <= frand_max / 100ULL) && 576 io_u_fits(td, io_u, buflen)) 577 break; 578 } 579 } 580 581 power_2 = is_power_of_2(minbs); 582 if (!td->o.bs_unaligned && power_2) 583 buflen &= ~(minbs - 1); 584 else if (!td->o.bs_unaligned && !power_2) 585 buflen -= buflen % minbs; 586 } while (!io_u_fits(td, io_u, buflen)); 587 588 return buflen; 589} 590 591static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u, 592 unsigned int is_random) 593{ 594 if (td->flags & TD_F_PROFILE_OPS) { 595 struct prof_io_ops *ops = &td->prof_io_ops; 596 597 if (ops->fill_io_u_size) 598 return ops->fill_io_u_size(td, io_u, is_random); 599 } 600 601 return __get_next_buflen(td, io_u, is_random); 602} 603 604static void set_rwmix_bytes(struct thread_data *td) 605{ 606 unsigned int diff; 607 608 /* 609 * we do time or byte based switch. 
this is needed because
	 * buffered writes may issue a lot quicker than they complete,
	 * whereas reads do not.
	 */
	diff = td->o.rwmix[td->rwmix_ddir ^ 1];
	td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100;
}

static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
{
	unsigned int v;

	v = rand32_between(&td->rwmix_state, 1, 100);

	if (v <= td->o.rwmix[DDIR_READ])
		return DDIR_READ;

	return DDIR_WRITE;
}

int io_u_quiesce(struct thread_data *td)
{
	int completed = 0;

	/*
	 * We are going to sleep, ensure that we flush anything pending as
	 * not to skew our latency numbers.
	 *
	 * Changed to only monitor 'in flight' requests here instead of the
	 * td->cur_depth, b/c td->cur_depth does not accurately represent
	 * io's that have been actually submitted to an async engine,
	 * and cur_depth is meaningless for sync engines.
	 */
	if (td->io_u_queued || td->cur_depth) {
		int fio_unused ret;

		ret = td_io_commit(td);
	}

	while (td->io_u_in_flight) {
		int ret;

		ret = io_u_queued_complete(td, 1);
		if (ret > 0)
			completed += ret;
	}

	if (td->flags & TD_F_REGROW_LOGS)
		regrow_logs(td);

	return completed;
}

static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
{
	enum fio_ddir odir = ddir ^ 1;
	long usec;
	uint64_t now;

	assert(ddir_rw(ddir));
	now = utime_since_now(&td->start);

	/*
	 * if rate_next_io_time is in the past, need to catch up to rate
	 */
	if (td->rate_next_io_time[ddir] <= now)
		return ddir;

	/*
	 * We are ahead of rate in this direction. See if we
	 * should switch.
	 */
	if (td_rw(td) && td->o.rwmix[odir]) {
		/*
		 * Other direction is behind rate, switch
		 */
		if (td->rate_next_io_time[odir] <= now)
			return odir;

		/*
		 * Both directions are ahead of rate. Sleep the minimum
		 * amount of time, and switch directions if necessary.
		 */
		if (td->rate_next_io_time[ddir] <=
		    td->rate_next_io_time[odir]) {
			usec = td->rate_next_io_time[ddir] - now;
		} else {
			usec = td->rate_next_io_time[odir] - now;
			ddir = odir;
		}
	} else
		usec = td->rate_next_io_time[ddir] - now;

	if (td->o.io_submit_mode == IO_MODE_INLINE)
		io_u_quiesce(td);

	usec = usec_sleep(td, usec);

	return ddir;
}

/*
 * Return the data direction for the next io_u. If the job is a
 * mixed read/write workload, check the rwmix cycle and switch if
 * necessary.
 */
static enum fio_ddir get_rw_ddir(struct thread_data *td)
{
	enum fio_ddir ddir;

	/*
	 * See if it's time to fsync/fdatasync/sync_file_range first,
	 * and if not then move on to check regular I/Os.
	 */
	if (should_fsync(td)) {
		if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] &&
		    !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks))
			return DDIR_SYNC;

		if (td->o.fdatasync_blocks && td->io_issues[DDIR_WRITE] &&
		    !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks))
			return DDIR_DATASYNC;

		if (td->sync_file_range_nr && td->io_issues[DDIR_WRITE] &&
		    !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr))
			return DDIR_SYNC_FILE_RANGE;
	}

	if (td_rw(td)) {
		/*
		 * Check if it's time to seed a new data direction.
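		 * For a mixed workload this follows the rwmix split; e.g.
		 * with rwmixread=70, the random pick in get_rand_ddir()
		 * returns DDIR_READ for v <= 70, and set_rwmix_bytes()
		 * rebalances the issue counts whenever the direction flips.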
740 */ 741 if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) { 742 /* 743 * Put a top limit on how many bytes we do for 744 * one data direction, to avoid overflowing the 745 * ranges too much 746 */ 747 ddir = get_rand_ddir(td); 748 749 if (ddir != td->rwmix_ddir) 750 set_rwmix_bytes(td); 751 752 td->rwmix_ddir = ddir; 753 } 754 ddir = td->rwmix_ddir; 755 } else if (td_read(td)) 756 ddir = DDIR_READ; 757 else if (td_write(td)) 758 ddir = DDIR_WRITE; 759 else if (td_trim(td)) 760 ddir = DDIR_TRIM; 761 else 762 ddir = DDIR_INVAL; 763 764 td->rwmix_ddir = rate_ddir(td, ddir); 765 return td->rwmix_ddir; 766} 767 768static void set_rw_ddir(struct thread_data *td, struct io_u *io_u) 769{ 770 enum fio_ddir ddir = get_rw_ddir(td); 771 772 if (td_trimwrite(td)) { 773 struct fio_file *f = io_u->file; 774 if (f->last_pos[DDIR_WRITE] == f->last_pos[DDIR_TRIM]) 775 ddir = DDIR_TRIM; 776 else 777 ddir = DDIR_WRITE; 778 } 779 780 io_u->ddir = io_u->acct_ddir = ddir; 781 782 if (io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_BARRIER) && 783 td->o.barrier_blocks && 784 !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) && 785 td->io_issues[DDIR_WRITE]) 786 io_u_set(td, io_u, IO_U_F_BARRIER); 787} 788 789void put_file_log(struct thread_data *td, struct fio_file *f) 790{ 791 unsigned int ret = put_file(td, f); 792 793 if (ret) 794 td_verror(td, ret, "file close"); 795} 796 797void put_io_u(struct thread_data *td, struct io_u *io_u) 798{ 799 if (td->parent) 800 td = td->parent; 801 802 td_io_u_lock(td); 803 804 if (io_u->file && !(io_u->flags & IO_U_F_NO_FILE_PUT)) 805 put_file_log(td, io_u->file); 806 807 io_u->file = NULL; 808 io_u_set(td, io_u, IO_U_F_FREE); 809 810 if (io_u->flags & IO_U_F_IN_CUR_DEPTH) { 811 td->cur_depth--; 812 assert(!(td->flags & TD_F_CHILD)); 813 } 814 io_u_qpush(&td->io_u_freelist, io_u); 815 td_io_u_unlock(td); 816 td_io_u_free_notify(td); 817} 818 819void clear_io_u(struct thread_data *td, struct io_u *io_u) 820{ 821 io_u_clear(td, io_u, IO_U_F_FLIGHT); 822 put_io_u(td, io_u); 823} 824 825void requeue_io_u(struct thread_data *td, struct io_u **io_u) 826{ 827 struct io_u *__io_u = *io_u; 828 enum fio_ddir ddir = acct_ddir(__io_u); 829 830 dprint(FD_IO, "requeue %p\n", __io_u); 831 832 if (td->parent) 833 td = td->parent; 834 835 td_io_u_lock(td); 836 837 io_u_set(td, __io_u, IO_U_F_FREE); 838 if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(ddir)) 839 td->io_issues[ddir]--; 840 841 io_u_clear(td, __io_u, IO_U_F_FLIGHT); 842 if (__io_u->flags & IO_U_F_IN_CUR_DEPTH) { 843 td->cur_depth--; 844 assert(!(td->flags & TD_F_CHILD)); 845 } 846 847 io_u_rpush(&td->io_u_requeues, __io_u); 848 td_io_u_unlock(td); 849 td_io_u_free_notify(td); 850 *io_u = NULL; 851} 852 853static int fill_io_u(struct thread_data *td, struct io_u *io_u) 854{ 855 unsigned int is_random; 856 857 if (td_ioengine_flagged(td, FIO_NOIO)) 858 goto out; 859 860 set_rw_ddir(td, io_u); 861 862 /* 863 * fsync() or fdatasync() or trim etc, we are done 864 */ 865 if (!ddir_rw(io_u->ddir)) 866 goto out; 867 868 /* 869 * See if it's time to switch to a new zone 870 */ 871 if (td->zone_bytes >= td->o.zone_size && td->o.zone_skip) { 872 struct fio_file *f = io_u->file; 873 874 td->zone_bytes = 0; 875 f->file_offset += td->o.zone_range + td->o.zone_skip; 876 877 /* 878 * Wrap from the beginning, if we exceed the file size 879 */ 880 if (f->file_offset >= f->real_file_size) 881 f->file_offset = f->real_file_size - f->file_offset; 882 f->last_pos[io_u->ddir] = f->file_offset; 883 td->io_skip_bytes += td->o.zone_skip; 
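		/*
		 * For example, a job setting zonesize=64m and zoneskip=192m
		 * does 64MB of I/O here, then jumps the file offset ahead
		 * by zone_range + zone_skip bytes as above, wrapping if
		 * that runs past the end of the file.
		 */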
884 } 885 886 /* 887 * No log, let the seq/rand engine retrieve the next buflen and 888 * position. 889 */ 890 if (get_next_offset(td, io_u, &is_random)) { 891 dprint(FD_IO, "io_u %p, failed getting offset\n", io_u); 892 return 1; 893 } 894 895 io_u->buflen = get_next_buflen(td, io_u, is_random); 896 if (!io_u->buflen) { 897 dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u); 898 return 1; 899 } 900 901 if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { 902 dprint(FD_IO, "io_u %p, offset + buflen exceeds file size\n", 903 io_u); 904 dprint(FD_IO, " offset=%llu/buflen=%lu > %llu\n", 905 (unsigned long long) io_u->offset, io_u->buflen, 906 (unsigned long long) io_u->file->real_file_size); 907 return 1; 908 } 909 910 /* 911 * mark entry before potentially trimming io_u 912 */ 913 if (td_random(td) && file_randommap(td, io_u->file)) 914 mark_random_map(td, io_u); 915 916out: 917 dprint_io_u(io_u, "fill_io_u"); 918 td->zone_bytes += io_u->buflen; 919 return 0; 920} 921 922static void __io_u_mark_map(unsigned int *map, unsigned int nr) 923{ 924 int idx = 0; 925 926 switch (nr) { 927 default: 928 idx = 6; 929 break; 930 case 33 ... 64: 931 idx = 5; 932 break; 933 case 17 ... 32: 934 idx = 4; 935 break; 936 case 9 ... 16: 937 idx = 3; 938 break; 939 case 5 ... 8: 940 idx = 2; 941 break; 942 case 1 ... 4: 943 idx = 1; 944 case 0: 945 break; 946 } 947 948 map[idx]++; 949} 950 951void io_u_mark_submit(struct thread_data *td, unsigned int nr) 952{ 953 __io_u_mark_map(td->ts.io_u_submit, nr); 954 td->ts.total_submit++; 955} 956 957void io_u_mark_complete(struct thread_data *td, unsigned int nr) 958{ 959 __io_u_mark_map(td->ts.io_u_complete, nr); 960 td->ts.total_complete++; 961} 962 963void io_u_mark_depth(struct thread_data *td, unsigned int nr) 964{ 965 int idx = 0; 966 967 switch (td->cur_depth) { 968 default: 969 idx = 6; 970 break; 971 case 32 ... 63: 972 idx = 5; 973 break; 974 case 16 ... 31: 975 idx = 4; 976 break; 977 case 8 ... 15: 978 idx = 3; 979 break; 980 case 4 ... 7: 981 idx = 2; 982 break; 983 case 2 ... 3: 984 idx = 1; 985 case 1: 986 break; 987 } 988 989 td->ts.io_u_map[idx] += nr; 990} 991 992static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec) 993{ 994 int idx = 0; 995 996 assert(usec < 1000); 997 998 switch (usec) { 999 case 750 ... 999: 1000 idx = 9; 1001 break; 1002 case 500 ... 749: 1003 idx = 8; 1004 break; 1005 case 250 ... 499: 1006 idx = 7; 1007 break; 1008 case 100 ... 249: 1009 idx = 6; 1010 break; 1011 case 50 ... 99: 1012 idx = 5; 1013 break; 1014 case 20 ... 49: 1015 idx = 4; 1016 break; 1017 case 10 ... 19: 1018 idx = 3; 1019 break; 1020 case 4 ... 9: 1021 idx = 2; 1022 break; 1023 case 2 ... 3: 1024 idx = 1; 1025 case 0 ... 1: 1026 break; 1027 } 1028 1029 assert(idx < FIO_IO_U_LAT_U_NR); 1030 td->ts.io_u_lat_u[idx]++; 1031} 1032 1033static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec) 1034{ 1035 int idx = 0; 1036 1037 switch (msec) { 1038 default: 1039 idx = 11; 1040 break; 1041 case 1000 ... 1999: 1042 idx = 10; 1043 break; 1044 case 750 ... 999: 1045 idx = 9; 1046 break; 1047 case 500 ... 749: 1048 idx = 8; 1049 break; 1050 case 250 ... 499: 1051 idx = 7; 1052 break; 1053 case 100 ... 249: 1054 idx = 6; 1055 break; 1056 case 50 ... 99: 1057 idx = 5; 1058 break; 1059 case 20 ... 49: 1060 idx = 4; 1061 break; 1062 case 10 ... 19: 1063 idx = 3; 1064 break; 1065 case 4 ... 9: 1066 idx = 2; 1067 break; 1068 case 2 ... 3: 1069 idx = 1; 1070 case 0 ... 
1: 1071 break; 1072 } 1073 1074 assert(idx < FIO_IO_U_LAT_M_NR); 1075 td->ts.io_u_lat_m[idx]++; 1076} 1077 1078static void io_u_mark_latency(struct thread_data *td, unsigned long usec) 1079{ 1080 if (usec < 1000) 1081 io_u_mark_lat_usec(td, usec); 1082 else 1083 io_u_mark_lat_msec(td, usec / 1000); 1084} 1085 1086static unsigned int __get_next_fileno_rand(struct thread_data *td) 1087{ 1088 unsigned long fileno; 1089 1090 if (td->o.file_service_type == FIO_FSERVICE_RANDOM) { 1091 uint64_t frand_max = rand_max(&td->next_file_state); 1092 unsigned long r; 1093 1094 r = __rand(&td->next_file_state); 1095 return (unsigned int) ((double) td->o.nr_files 1096 * (r / (frand_max + 1.0))); 1097 } 1098 1099 if (td->o.file_service_type == FIO_FSERVICE_ZIPF) 1100 fileno = zipf_next(&td->next_file_zipf); 1101 else if (td->o.file_service_type == FIO_FSERVICE_PARETO) 1102 fileno = pareto_next(&td->next_file_zipf); 1103 else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) 1104 fileno = gauss_next(&td->next_file_gauss); 1105 else { 1106 log_err("fio: bad file service type: %d\n", td->o.file_service_type); 1107 assert(0); 1108 return 0; 1109 } 1110 1111 return fileno >> FIO_FSERVICE_SHIFT; 1112} 1113 1114/* 1115 * Get next file to service by choosing one at random 1116 */ 1117static struct fio_file *get_next_file_rand(struct thread_data *td, 1118 enum fio_file_flags goodf, 1119 enum fio_file_flags badf) 1120{ 1121 struct fio_file *f; 1122 int fno; 1123 1124 do { 1125 int opened = 0; 1126 1127 fno = __get_next_fileno_rand(td); 1128 1129 f = td->files[fno]; 1130 if (fio_file_done(f)) 1131 continue; 1132 1133 if (!fio_file_open(f)) { 1134 int err; 1135 1136 if (td->nr_open_files >= td->o.open_files) 1137 return ERR_PTR(-EBUSY); 1138 1139 err = td_io_open_file(td, f); 1140 if (err) 1141 continue; 1142 opened = 1; 1143 } 1144 1145 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) { 1146 dprint(FD_FILE, "get_next_file_rand: %p\n", f); 1147 return f; 1148 } 1149 if (opened) 1150 td_io_close_file(td, f); 1151 } while (1); 1152} 1153 1154/* 1155 * Get next file to service by doing round robin between all available ones 1156 */ 1157static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf, 1158 int badf) 1159{ 1160 unsigned int old_next_file = td->next_file; 1161 struct fio_file *f; 1162 1163 do { 1164 int opened = 0; 1165 1166 f = td->files[td->next_file]; 1167 1168 td->next_file++; 1169 if (td->next_file >= td->o.nr_files) 1170 td->next_file = 0; 1171 1172 dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags); 1173 if (fio_file_done(f)) { 1174 f = NULL; 1175 continue; 1176 } 1177 1178 if (!fio_file_open(f)) { 1179 int err; 1180 1181 if (td->nr_open_files >= td->o.open_files) 1182 return ERR_PTR(-EBUSY); 1183 1184 err = td_io_open_file(td, f); 1185 if (err) { 1186 dprint(FD_FILE, "error %d on open of %s\n", 1187 err, f->file_name); 1188 f = NULL; 1189 continue; 1190 } 1191 opened = 1; 1192 } 1193 1194 dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf, 1195 f->flags); 1196 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) 1197 break; 1198 1199 if (opened) 1200 td_io_close_file(td, f); 1201 1202 f = NULL; 1203 } while (td->next_file != old_next_file); 1204 1205 dprint(FD_FILE, "get_next_file_rr: %p\n", f); 1206 return f; 1207} 1208 1209static struct fio_file *__get_next_file(struct thread_data *td) 1210{ 1211 struct fio_file *f; 1212 1213 assert(td->o.nr_files <= td->files_index); 1214 1215 if (td->nr_done_files >= td->o.nr_files) { 1216 dprint(FD_FILE, 
"get_next_file: nr_open=%d, nr_done=%d," 1217 " nr_files=%d\n", td->nr_open_files, 1218 td->nr_done_files, 1219 td->o.nr_files); 1220 return NULL; 1221 } 1222 1223 f = td->file_service_file; 1224 if (f && fio_file_open(f) && !fio_file_closing(f)) { 1225 if (td->o.file_service_type == FIO_FSERVICE_SEQ) 1226 goto out; 1227 if (td->file_service_left--) 1228 goto out; 1229 } 1230 1231 if (td->o.file_service_type == FIO_FSERVICE_RR || 1232 td->o.file_service_type == FIO_FSERVICE_SEQ) 1233 f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing); 1234 else 1235 f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing); 1236 1237 if (IS_ERR(f)) 1238 return f; 1239 1240 td->file_service_file = f; 1241 td->file_service_left = td->file_service_nr - 1; 1242out: 1243 if (f) 1244 dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name); 1245 else 1246 dprint(FD_FILE, "get_next_file: NULL\n"); 1247 return f; 1248} 1249 1250static struct fio_file *get_next_file(struct thread_data *td) 1251{ 1252 if (td->flags & TD_F_PROFILE_OPS) { 1253 struct prof_io_ops *ops = &td->prof_io_ops; 1254 1255 if (ops->get_next_file) 1256 return ops->get_next_file(td); 1257 } 1258 1259 return __get_next_file(td); 1260} 1261 1262static long set_io_u_file(struct thread_data *td, struct io_u *io_u) 1263{ 1264 struct fio_file *f; 1265 1266 do { 1267 f = get_next_file(td); 1268 if (IS_ERR_OR_NULL(f)) 1269 return PTR_ERR(f); 1270 1271 io_u->file = f; 1272 get_file(f); 1273 1274 if (!fill_io_u(td, io_u)) 1275 break; 1276 1277 put_file_log(td, f); 1278 td_io_close_file(td, f); 1279 io_u->file = NULL; 1280 if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM) 1281 fio_file_reset(td, f); 1282 else { 1283 fio_file_set_done(f); 1284 td->nr_done_files++; 1285 dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name, 1286 td->nr_done_files, td->o.nr_files); 1287 } 1288 } while (1); 1289 1290 return 0; 1291} 1292 1293static void lat_fatal(struct thread_data *td, struct io_completion_data *icd, 1294 unsigned long tusec, unsigned long max_usec) 1295{ 1296 if (!td->error) 1297 log_err("fio: latency of %lu usec exceeds specified max (%lu usec)\n", tusec, max_usec); 1298 td_verror(td, ETIMEDOUT, "max latency exceeded"); 1299 icd->error = ETIMEDOUT; 1300} 1301 1302static void lat_new_cycle(struct thread_data *td) 1303{ 1304 fio_gettime(&td->latency_ts, NULL); 1305 td->latency_ios = ddir_rw_sum(td->io_blocks); 1306 td->latency_failed = 0; 1307} 1308 1309/* 1310 * We had an IO outside the latency target. Reduce the queue depth. If we 1311 * are at QD=1, then it's time to give up. 1312 */ 1313static bool __lat_target_failed(struct thread_data *td) 1314{ 1315 if (td->latency_qd == 1) 1316 return true; 1317 1318 td->latency_qd_high = td->latency_qd; 1319 1320 if (td->latency_qd == td->latency_qd_low) 1321 td->latency_qd_low--; 1322 1323 td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2; 1324 1325 dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); 1326 1327 /* 1328 * When we ramp QD down, quiesce existing IO to prevent 1329 * a storm of ramp downs due to pending higher depth. 
1330 */ 1331 io_u_quiesce(td); 1332 lat_new_cycle(td); 1333 return false; 1334} 1335 1336static bool lat_target_failed(struct thread_data *td) 1337{ 1338 if (td->o.latency_percentile.u.f == 100.0) 1339 return __lat_target_failed(td); 1340 1341 td->latency_failed++; 1342 return false; 1343} 1344 1345void lat_target_init(struct thread_data *td) 1346{ 1347 td->latency_end_run = 0; 1348 1349 if (td->o.latency_target) { 1350 dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target); 1351 fio_gettime(&td->latency_ts, NULL); 1352 td->latency_qd = 1; 1353 td->latency_qd_high = td->o.iodepth; 1354 td->latency_qd_low = 1; 1355 td->latency_ios = ddir_rw_sum(td->io_blocks); 1356 } else 1357 td->latency_qd = td->o.iodepth; 1358} 1359 1360void lat_target_reset(struct thread_data *td) 1361{ 1362 if (!td->latency_end_run) 1363 lat_target_init(td); 1364} 1365 1366static void lat_target_success(struct thread_data *td) 1367{ 1368 const unsigned int qd = td->latency_qd; 1369 struct thread_options *o = &td->o; 1370 1371 td->latency_qd_low = td->latency_qd; 1372 1373 /* 1374 * If we haven't failed yet, we double up to a failing value instead 1375 * of bisecting from highest possible queue depth. If we have set 1376 * a limit other than td->o.iodepth, bisect between that. 1377 */ 1378 if (td->latency_qd_high != o->iodepth) 1379 td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2; 1380 else 1381 td->latency_qd *= 2; 1382 1383 if (td->latency_qd > o->iodepth) 1384 td->latency_qd = o->iodepth; 1385 1386 dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); 1387 1388 /* 1389 * Same as last one, we are done. Let it run a latency cycle, so 1390 * we get only the results from the targeted depth. 1391 */ 1392 if (td->latency_qd == qd) { 1393 if (td->latency_end_run) { 1394 dprint(FD_RATE, "We are done\n"); 1395 td->done = 1; 1396 } else { 1397 dprint(FD_RATE, "Quiesce and final run\n"); 1398 io_u_quiesce(td); 1399 td->latency_end_run = 1; 1400 reset_all_stats(td); 1401 reset_io_stats(td); 1402 } 1403 } 1404 1405 lat_new_cycle(td); 1406} 1407 1408/* 1409 * Check if we can bump the queue depth 1410 */ 1411void lat_target_check(struct thread_data *td) 1412{ 1413 uint64_t usec_window; 1414 uint64_t ios; 1415 double success_ios; 1416 1417 usec_window = utime_since_now(&td->latency_ts); 1418 if (usec_window < td->o.latency_window) 1419 return; 1420 1421 ios = ddir_rw_sum(td->io_blocks) - td->latency_ios; 1422 success_ios = (double) (ios - td->latency_failed) / (double) ios; 1423 success_ios *= 100.0; 1424 1425 dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f); 1426 1427 if (success_ios >= td->o.latency_percentile.u.f) 1428 lat_target_success(td); 1429 else 1430 __lat_target_failed(td); 1431} 1432 1433/* 1434 * If latency target is enabled, we might be ramping up or down and not 1435 * using the full queue depth available. 
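 *
 * A job typically enables this with something like latency_target=50us,
 * latency_window=1s and latency_percentile=99; the effective depth is
 * then capped at td->latency_qd below rather than the configured iodepth.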
1436 */ 1437bool queue_full(const struct thread_data *td) 1438{ 1439 const int qempty = io_u_qempty(&td->io_u_freelist); 1440 1441 if (qempty) 1442 return true; 1443 if (!td->o.latency_target) 1444 return false; 1445 1446 return td->cur_depth >= td->latency_qd; 1447} 1448 1449struct io_u *__get_io_u(struct thread_data *td) 1450{ 1451 struct io_u *io_u = NULL; 1452 1453 if (td->stop_io) 1454 return NULL; 1455 1456 td_io_u_lock(td); 1457 1458again: 1459 if (!io_u_rempty(&td->io_u_requeues)) 1460 io_u = io_u_rpop(&td->io_u_requeues); 1461 else if (!queue_full(td)) { 1462 io_u = io_u_qpop(&td->io_u_freelist); 1463 1464 io_u->file = NULL; 1465 io_u->buflen = 0; 1466 io_u->resid = 0; 1467 io_u->end_io = NULL; 1468 } 1469 1470 if (io_u) { 1471 assert(io_u->flags & IO_U_F_FREE); 1472 io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT | 1473 IO_U_F_TRIMMED | IO_U_F_BARRIER | 1474 IO_U_F_VER_LIST); 1475 1476 io_u->error = 0; 1477 io_u->acct_ddir = -1; 1478 td->cur_depth++; 1479 assert(!(td->flags & TD_F_CHILD)); 1480 io_u_set(td, io_u, IO_U_F_IN_CUR_DEPTH); 1481 io_u->ipo = NULL; 1482 } else if (td_async_processing(td)) { 1483 /* 1484 * We ran out, wait for async verify threads to finish and 1485 * return one 1486 */ 1487 assert(!(td->flags & TD_F_CHILD)); 1488 assert(!pthread_cond_wait(&td->free_cond, &td->io_u_lock)); 1489 goto again; 1490 } 1491 1492 td_io_u_unlock(td); 1493 return io_u; 1494} 1495 1496static bool check_get_trim(struct thread_data *td, struct io_u *io_u) 1497{ 1498 if (!(td->flags & TD_F_TRIM_BACKLOG)) 1499 return false; 1500 1501 if (td->trim_entries) { 1502 int get_trim = 0; 1503 1504 if (td->trim_batch) { 1505 td->trim_batch--; 1506 get_trim = 1; 1507 } else if (!(td->io_hist_len % td->o.trim_backlog) && 1508 td->last_ddir != DDIR_READ) { 1509 td->trim_batch = td->o.trim_batch; 1510 if (!td->trim_batch) 1511 td->trim_batch = td->o.trim_backlog; 1512 get_trim = 1; 1513 } 1514 1515 if (get_trim && get_next_trim(td, io_u)) 1516 return true; 1517 } 1518 1519 return false; 1520} 1521 1522static bool check_get_verify(struct thread_data *td, struct io_u *io_u) 1523{ 1524 if (!(td->flags & TD_F_VER_BACKLOG)) 1525 return false; 1526 1527 if (td->io_hist_len) { 1528 int get_verify = 0; 1529 1530 if (td->verify_batch) 1531 get_verify = 1; 1532 else if (!(td->io_hist_len % td->o.verify_backlog) && 1533 td->last_ddir != DDIR_READ) { 1534 td->verify_batch = td->o.verify_batch; 1535 if (!td->verify_batch) 1536 td->verify_batch = td->o.verify_backlog; 1537 get_verify = 1; 1538 } 1539 1540 if (get_verify && !get_next_verify(td, io_u)) { 1541 td->verify_batch--; 1542 return true; 1543 } 1544 } 1545 1546 return false; 1547} 1548 1549/* 1550 * Fill offset and start time into the buffer content, to prevent too 1551 * easy compressible data for simple de-dupe attempts. Do this for every 1552 * 512b block in the range, since that should be the smallest block size 1553 * we can expect from a device. 1554 */ 1555static void small_content_scramble(struct io_u *io_u) 1556{ 1557 unsigned int i, nr_blocks = io_u->buflen / 512; 1558 uint64_t boffset; 1559 unsigned int offset; 1560 void *p, *end; 1561 1562 if (!nr_blocks) 1563 return; 1564 1565 p = io_u->xfer_buf; 1566 boffset = io_u->offset; 1567 io_u->buf_filled_len = 0; 1568 1569 for (i = 0; i < nr_blocks; i++) { 1570 /* 1571 * Fill the byte offset into a "random" start offset of 1572 * the buffer, given by the product of the usec time 1573 * and the actual offset. 
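		 * (The current code XORs the usec time with the block
		 * offset rather than multiplying; either way it only picks
		 * where inside each 512-byte block the offset and start
		 * time are stored.)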
1574 */ 1575 offset = (io_u->start_time.tv_usec ^ boffset) & 511; 1576 offset &= ~(sizeof(uint64_t) - 1); 1577 if (offset >= 512 - sizeof(uint64_t)) 1578 offset -= sizeof(uint64_t); 1579 memcpy(p + offset, &boffset, sizeof(boffset)); 1580 1581 end = p + 512 - sizeof(io_u->start_time); 1582 memcpy(end, &io_u->start_time, sizeof(io_u->start_time)); 1583 p += 512; 1584 boffset += 512; 1585 } 1586} 1587 1588/* 1589 * Return an io_u to be processed. Gets a buflen and offset, sets direction, 1590 * etc. The returned io_u is fully ready to be prepped and submitted. 1591 */ 1592struct io_u *get_io_u(struct thread_data *td) 1593{ 1594 struct fio_file *f; 1595 struct io_u *io_u; 1596 int do_scramble = 0; 1597 long ret = 0; 1598 1599 io_u = __get_io_u(td); 1600 if (!io_u) { 1601 dprint(FD_IO, "__get_io_u failed\n"); 1602 return NULL; 1603 } 1604 1605 if (check_get_verify(td, io_u)) 1606 goto out; 1607 if (check_get_trim(td, io_u)) 1608 goto out; 1609 1610 /* 1611 * from a requeue, io_u already setup 1612 */ 1613 if (io_u->file) 1614 goto out; 1615 1616 /* 1617 * If using an iolog, grab next piece if any available. 1618 */ 1619 if (td->flags & TD_F_READ_IOLOG) { 1620 if (read_iolog_get(td, io_u)) 1621 goto err_put; 1622 } else if (set_io_u_file(td, io_u)) { 1623 ret = -EBUSY; 1624 dprint(FD_IO, "io_u %p, setting file failed\n", io_u); 1625 goto err_put; 1626 } 1627 1628 f = io_u->file; 1629 if (!f) { 1630 dprint(FD_IO, "io_u %p, setting file failed\n", io_u); 1631 goto err_put; 1632 } 1633 1634 assert(fio_file_open(f)); 1635 1636 if (ddir_rw(io_u->ddir)) { 1637 if (!io_u->buflen && !td_ioengine_flagged(td, FIO_NOIO)) { 1638 dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u); 1639 goto err_put; 1640 } 1641 1642 f->last_start[io_u->ddir] = io_u->offset; 1643 f->last_pos[io_u->ddir] = io_u->offset + io_u->buflen; 1644 1645 if (io_u->ddir == DDIR_WRITE) { 1646 if (td->flags & TD_F_REFILL_BUFFERS) { 1647 io_u_fill_buffer(td, io_u, 1648 td->o.min_bs[DDIR_WRITE], 1649 io_u->buflen); 1650 } else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) && 1651 !(td->flags & TD_F_COMPRESS)) 1652 do_scramble = 1; 1653 if (td->flags & TD_F_VER_NONE) { 1654 populate_verify_io_u(td, io_u); 1655 do_scramble = 0; 1656 } 1657 } else if (io_u->ddir == DDIR_READ) { 1658 /* 1659 * Reset the buf_filled parameters so next time if the 1660 * buffer is used for writes it is refilled. 1661 */ 1662 io_u->buf_filled_len = 0; 1663 } 1664 } 1665 1666 /* 1667 * Set io data pointers. 1668 */ 1669 io_u->xfer_buf = io_u->buf; 1670 io_u->xfer_buflen = io_u->buflen; 1671 1672out: 1673 assert(io_u->file); 1674 if (!td_io_prep(td, io_u)) { 1675 if (!td->o.disable_lat) 1676 fio_gettime(&io_u->start_time, NULL); 1677 1678 if (do_scramble) 1679 small_content_scramble(io_u); 1680 1681 return io_u; 1682 } 1683err_put: 1684 dprint(FD_IO, "get_io_u failed\n"); 1685 put_io_u(td, io_u); 1686 return ERR_PTR(ret); 1687} 1688 1689static void __io_u_log_error(struct thread_data *td, struct io_u *io_u) 1690{ 1691 enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error); 1692 1693 if (td_non_fatal_error(td, eb, io_u->error) && !td->o.error_dump) 1694 return; 1695 1696 log_err("fio: io_u error%s%s: %s: %s offset=%llu, buflen=%lu\n", 1697 io_u->file ? " on file " : "", 1698 io_u->file ? 
io_u->file->file_name : "", 1699 strerror(io_u->error), 1700 io_ddir_name(io_u->ddir), 1701 io_u->offset, io_u->xfer_buflen); 1702 1703 if (td->io_ops->errdetails) { 1704 char *err = td->io_ops->errdetails(io_u); 1705 1706 log_err("fio: %s\n", err); 1707 free(err); 1708 } 1709 1710 if (!td->error) 1711 td_verror(td, io_u->error, "io_u error"); 1712} 1713 1714void io_u_log_error(struct thread_data *td, struct io_u *io_u) 1715{ 1716 __io_u_log_error(td, io_u); 1717 if (td->parent) 1718 __io_u_log_error(td->parent, io_u); 1719} 1720 1721static inline bool gtod_reduce(struct thread_data *td) 1722{ 1723 return (td->o.disable_clat && td->o.disable_slat && td->o.disable_bw) 1724 || td->o.gtod_reduce; 1725} 1726 1727static void account_io_completion(struct thread_data *td, struct io_u *io_u, 1728 struct io_completion_data *icd, 1729 const enum fio_ddir idx, unsigned int bytes) 1730{ 1731 const int no_reduce = !gtod_reduce(td); 1732 unsigned long lusec = 0; 1733 1734 if (td->parent) 1735 td = td->parent; 1736 1737 if (!td->o.stats) 1738 return; 1739 1740 if (no_reduce) 1741 lusec = utime_since(&io_u->issue_time, &icd->time); 1742 1743 if (!td->o.disable_lat) { 1744 unsigned long tusec; 1745 1746 tusec = utime_since(&io_u->start_time, &icd->time); 1747 add_lat_sample(td, idx, tusec, bytes, io_u->offset); 1748 1749 if (td->flags & TD_F_PROFILE_OPS) { 1750 struct prof_io_ops *ops = &td->prof_io_ops; 1751 1752 if (ops->io_u_lat) 1753 icd->error = ops->io_u_lat(td, tusec); 1754 } 1755 1756 if (td->o.max_latency && tusec > td->o.max_latency) 1757 lat_fatal(td, icd, tusec, td->o.max_latency); 1758 if (td->o.latency_target && tusec > td->o.latency_target) { 1759 if (lat_target_failed(td)) 1760 lat_fatal(td, icd, tusec, td->o.latency_target); 1761 } 1762 } 1763 1764 if (ddir_rw(idx)) { 1765 if (!td->o.disable_clat) { 1766 add_clat_sample(td, idx, lusec, bytes, io_u->offset); 1767 io_u_mark_latency(td, lusec); 1768 } 1769 1770 if (!td->o.disable_bw && per_unit_log(td->bw_log)) 1771 add_bw_sample(td, io_u, bytes, lusec); 1772 1773 if (no_reduce && per_unit_log(td->iops_log)) 1774 add_iops_sample(td, io_u, bytes); 1775 } 1776 1777 if (td->ts.nr_block_infos && io_u->ddir == DDIR_TRIM) { 1778 uint32_t *info = io_u_block_info(td, io_u); 1779 if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) { 1780 if (io_u->ddir == DDIR_TRIM) { 1781 *info = BLOCK_INFO(BLOCK_STATE_TRIMMED, 1782 BLOCK_INFO_TRIMS(*info) + 1); 1783 } else if (io_u->ddir == DDIR_WRITE) { 1784 *info = BLOCK_INFO_SET_STATE(BLOCK_STATE_WRITTEN, 1785 *info); 1786 } 1787 } 1788 } 1789} 1790 1791static void file_log_write_comp(const struct thread_data *td, struct fio_file *f, 1792 uint64_t offset, unsigned int bytes) 1793{ 1794 int idx; 1795 1796 if (!f) 1797 return; 1798 1799 if (f->first_write == -1ULL || offset < f->first_write) 1800 f->first_write = offset; 1801 if (f->last_write == -1ULL || ((offset + bytes) > f->last_write)) 1802 f->last_write = offset + bytes; 1803 1804 if (!f->last_write_comp) 1805 return; 1806 1807 idx = f->last_write_idx++; 1808 f->last_write_comp[idx] = offset; 1809 if (f->last_write_idx == td->o.iodepth) 1810 f->last_write_idx = 0; 1811} 1812 1813static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, 1814 struct io_completion_data *icd) 1815{ 1816 struct io_u *io_u = *io_u_ptr; 1817 enum fio_ddir ddir = io_u->ddir; 1818 struct fio_file *f = io_u->file; 1819 1820 dprint_io_u(io_u, "io complete"); 1821 1822 assert(io_u->flags & IO_U_F_FLIGHT); 1823 io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK); 1824 
1825 /* 1826 * Mark IO ok to verify 1827 */ 1828 if (io_u->ipo) { 1829 /* 1830 * Remove errored entry from the verification list 1831 */ 1832 if (io_u->error) 1833 unlog_io_piece(td, io_u); 1834 else { 1835 io_u->ipo->flags &= ~IP_F_IN_FLIGHT; 1836 write_barrier(); 1837 } 1838 } 1839 1840 if (ddir_sync(ddir)) { 1841 td->last_was_sync = 1; 1842 if (f) { 1843 f->first_write = -1ULL; 1844 f->last_write = -1ULL; 1845 } 1846 return; 1847 } 1848 1849 td->last_was_sync = 0; 1850 td->last_ddir = ddir; 1851 1852 if (!io_u->error && ddir_rw(ddir)) { 1853 unsigned int bytes = io_u->buflen - io_u->resid; 1854 int ret; 1855 1856 td->io_blocks[ddir]++; 1857 td->this_io_blocks[ddir]++; 1858 td->io_bytes[ddir] += bytes; 1859 1860 if (!(io_u->flags & IO_U_F_VER_LIST)) 1861 td->this_io_bytes[ddir] += bytes; 1862 1863 if (ddir == DDIR_WRITE) 1864 file_log_write_comp(td, f, io_u->offset, bytes); 1865 1866 if (ramp_time_over(td) && (td->runstate == TD_RUNNING || 1867 td->runstate == TD_VERIFYING)) 1868 account_io_completion(td, io_u, icd, ddir, bytes); 1869 1870 icd->bytes_done[ddir] += bytes; 1871 1872 if (io_u->end_io) { 1873 ret = io_u->end_io(td, io_u_ptr); 1874 io_u = *io_u_ptr; 1875 if (ret && !icd->error) 1876 icd->error = ret; 1877 } 1878 } else if (io_u->error) { 1879 icd->error = io_u->error; 1880 io_u_log_error(td, io_u); 1881 } 1882 if (icd->error) { 1883 enum error_type_bit eb = td_error_type(ddir, icd->error); 1884 1885 if (!td_non_fatal_error(td, eb, icd->error)) 1886 return; 1887 1888 /* 1889 * If there is a non_fatal error, then add to the error count 1890 * and clear all the errors. 1891 */ 1892 update_error_count(td, icd->error); 1893 td_clear_error(td); 1894 icd->error = 0; 1895 if (io_u) 1896 io_u->error = 0; 1897 } 1898} 1899 1900static void init_icd(struct thread_data *td, struct io_completion_data *icd, 1901 int nr) 1902{ 1903 int ddir; 1904 1905 if (!gtod_reduce(td)) 1906 fio_gettime(&icd->time, NULL); 1907 1908 icd->nr = nr; 1909 1910 icd->error = 0; 1911 for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) 1912 icd->bytes_done[ddir] = 0; 1913} 1914 1915static void ios_completed(struct thread_data *td, 1916 struct io_completion_data *icd) 1917{ 1918 struct io_u *io_u; 1919 int i; 1920 1921 for (i = 0; i < icd->nr; i++) { 1922 io_u = td->io_ops->event(td, i); 1923 1924 io_completed(td, &io_u, icd); 1925 1926 if (io_u) 1927 put_io_u(td, io_u); 1928 } 1929} 1930 1931/* 1932 * Complete a single io_u for the sync engines. 1933 */ 1934int io_u_sync_complete(struct thread_data *td, struct io_u *io_u) 1935{ 1936 struct io_completion_data icd; 1937 int ddir; 1938 1939 init_icd(td, &icd, 1); 1940 io_completed(td, &io_u, &icd); 1941 1942 if (io_u) 1943 put_io_u(td, io_u); 1944 1945 if (icd.error) { 1946 td_verror(td, icd.error, "io_u_sync_complete"); 1947 return -1; 1948 } 1949 1950 for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) 1951 td->bytes_done[ddir] += icd.bytes_done[ddir]; 1952 1953 return 0; 1954} 1955 1956/* 1957 * Called to complete min_events number of io for the async engines. 
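 *
 * A min_evts of 0 just polls: tvp points at a zeroed timespec, so
 * td_io_getevents() returns whatever is already done, while a positive
 * min_evts (capped at the current depth) waits for at least that many
 * completions before accounting them.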
1958 */ 1959int io_u_queued_complete(struct thread_data *td, int min_evts) 1960{ 1961 struct io_completion_data icd; 1962 struct timespec *tvp = NULL; 1963 int ret, ddir; 1964 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, }; 1965 1966 dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts); 1967 1968 if (!min_evts) 1969 tvp = &ts; 1970 else if (min_evts > td->cur_depth) 1971 min_evts = td->cur_depth; 1972 1973 /* No worries, td_io_getevents fixes min and max if they are 1974 * set incorrectly */ 1975 ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete_max, tvp); 1976 if (ret < 0) { 1977 td_verror(td, -ret, "td_io_getevents"); 1978 return ret; 1979 } else if (!ret) 1980 return ret; 1981 1982 init_icd(td, &icd, ret); 1983 ios_completed(td, &icd); 1984 if (icd.error) { 1985 td_verror(td, icd.error, "io_u_queued_complete"); 1986 return -1; 1987 } 1988 1989 for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) 1990 td->bytes_done[ddir] += icd.bytes_done[ddir]; 1991 1992 return ret; 1993} 1994 1995/* 1996 * Call when io_u is really queued, to update the submission latency. 1997 */ 1998void io_u_queued(struct thread_data *td, struct io_u *io_u) 1999{ 2000 if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) { 2001 unsigned long slat_time; 2002 2003 slat_time = utime_since(&io_u->start_time, &io_u->issue_time); 2004 2005 if (td->parent) 2006 td = td->parent; 2007 2008 add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen, 2009 io_u->offset); 2010 } 2011} 2012 2013/* 2014 * See if we should reuse the last seed, if dedupe is enabled 2015 */ 2016static struct frand_state *get_buf_state(struct thread_data *td) 2017{ 2018 unsigned int v; 2019 2020 if (!td->o.dedupe_percentage) 2021 return &td->buf_state; 2022 else if (td->o.dedupe_percentage == 100) { 2023 frand_copy(&td->buf_state_prev, &td->buf_state); 2024 return &td->buf_state; 2025 } 2026 2027 v = rand32_between(&td->dedupe_state, 1, 100); 2028 2029 if (v <= td->o.dedupe_percentage) 2030 return &td->buf_state_prev; 2031 2032 return &td->buf_state; 2033} 2034 2035static void save_buf_state(struct thread_data *td, struct frand_state *rs) 2036{ 2037 if (td->o.dedupe_percentage == 100) 2038 frand_copy(rs, &td->buf_state_prev); 2039 else if (rs == &td->buf_state) 2040 frand_copy(&td->buf_state_prev, rs); 2041} 2042 2043void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write, 2044 unsigned int max_bs) 2045{ 2046 struct thread_options *o = &td->o; 2047 2048 if (o->mem_type == MEM_CUDA_MALLOC) 2049 return; 2050 2051 if (o->compress_percentage || o->dedupe_percentage) { 2052 unsigned int perc = td->o.compress_percentage; 2053 struct frand_state *rs; 2054 unsigned int left = max_bs; 2055 unsigned int this_write; 2056 2057 do { 2058 rs = get_buf_state(td); 2059 2060 min_write = min(min_write, left); 2061 2062 if (perc) { 2063 this_write = min_not_zero(min_write, 2064 td->o.compress_chunk); 2065 2066 fill_random_buf_percentage(rs, buf, perc, 2067 this_write, this_write, 2068 o->buffer_pattern, 2069 o->buffer_pattern_bytes); 2070 } else { 2071 fill_random_buf(rs, buf, min_write); 2072 this_write = min_write; 2073 } 2074 2075 buf += this_write; 2076 left -= this_write; 2077 save_buf_state(td, rs); 2078 } while (left); 2079 } else if (o->buffer_pattern_bytes) 2080 fill_buffer_pattern(td, buf, max_bs); 2081 else if (o->zero_buffers) 2082 memset(buf, 0, max_bs); 2083 else 2084 fill_random_buf(get_buf_state(td), buf, max_bs); 2085} 2086 2087/* 2088 * "randomly" fill the buffer contents 2089 */ 2090void 
io_u_fill_buffer(struct thread_data *td, struct io_u *io_u, 2091 unsigned int min_write, unsigned int max_bs) 2092{ 2093 io_u->buf_filled_len = 0; 2094 fill_io_buffer(td, io_u->buf, min_write, max_bs); 2095} 2096 2097static int do_sync_file_range(const struct thread_data *td, 2098 struct fio_file *f) 2099{ 2100 off64_t offset, nbytes; 2101 2102 offset = f->first_write; 2103 nbytes = f->last_write - f->first_write; 2104 2105 if (!nbytes) 2106 return 0; 2107 2108 return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range); 2109} 2110 2111int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) 2112{ 2113 int ret; 2114 2115 if (io_u->ddir == DDIR_SYNC) { 2116 ret = fsync(io_u->file->fd); 2117 } else if (io_u->ddir == DDIR_DATASYNC) { 2118#ifdef CONFIG_FDATASYNC 2119 ret = fdatasync(io_u->file->fd); 2120#else 2121 ret = io_u->xfer_buflen; 2122 io_u->error = EINVAL; 2123#endif 2124 } else if (io_u->ddir == DDIR_SYNC_FILE_RANGE) 2125 ret = do_sync_file_range(td, io_u->file); 2126 else { 2127 ret = io_u->xfer_buflen; 2128 io_u->error = EINVAL; 2129 } 2130 2131 if (ret < 0) 2132 io_u->error = errno; 2133 2134 return ret; 2135} 2136 2137int do_io_u_trim(const struct thread_data *td, struct io_u *io_u) 2138{ 2139#ifndef FIO_HAVE_TRIM 2140 io_u->error = EINVAL; 2141 return 0; 2142#else 2143 struct fio_file *f = io_u->file; 2144 int ret; 2145 2146 ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen); 2147 if (!ret) 2148 return io_u->xfer_buflen; 2149 2150 io_u->error = ret; 2151 return 0; 2152#endif 2153} 2154