io_u.c revision 317b95d07d4921d2594a1be6e014c9c2d062fe75
1#include <unistd.h> 2#include <fcntl.h> 3#include <string.h> 4#include <signal.h> 5#include <time.h> 6#include <assert.h> 7 8#include "fio.h" 9 10/* 11 * Change this define to play with the timeout handling 12 */ 13#undef FIO_USE_TIMEOUT 14 15struct io_completion_data { 16 int nr; /* input */ 17 18 int error; /* output */ 19 unsigned long bytes_done[2]; /* output */ 20 struct timeval time; /* output */ 21}; 22 23/* 24 * The ->file_map[] contains a map of blocks we have or have not done io 25 * to yet. Used to make sure we cover the entire range in a fair fashion. 26 */ 27static int random_map_free(struct thread_data *td, struct fio_file *f, 28 unsigned long long block) 29{ 30 unsigned int idx = RAND_MAP_IDX(td, f, block); 31 unsigned int bit = RAND_MAP_BIT(td, f, block); 32 33 return (f->file_map[idx] & (1UL << bit)) == 0; 34} 35 36/* 37 * Mark a given offset as used in the map. 38 */ 39static void mark_random_map(struct thread_data *td, struct io_u *io_u) 40{ 41 unsigned int min_bs = td->o.rw_min_bs; 42 struct fio_file *f = io_u->file; 43 unsigned long long block; 44 unsigned int blocks; 45 unsigned int nr_blocks; 46 47 block = io_u->offset / (unsigned long long) min_bs; 48 blocks = 0; 49 nr_blocks = (io_u->buflen + min_bs - 1) / min_bs; 50 51 while (blocks < nr_blocks) { 52 unsigned int idx, bit; 53 54 /* 55 * If we have a mixed random workload, we may 56 * encounter blocks we already did IO to. 57 */ 58 if (!td->o.ddir_nr && !random_map_free(td, f, block)) 59 break; 60 61 idx = RAND_MAP_IDX(td, f, block); 62 bit = RAND_MAP_BIT(td, f, block); 63 64 fio_assert(td, idx < f->num_maps); 65 66 f->file_map[idx] |= (1UL << bit); 67 block++; 68 blocks++; 69 } 70 71 if ((blocks * min_bs) < io_u->buflen) 72 io_u->buflen = blocks * min_bs; 73} 74 75/* 76 * Return the next free block in the map. 77 */ 78static int get_next_free_block(struct thread_data *td, struct fio_file *f, 79 unsigned long long *b) 80{ 81 int i; 82 83 i = f->last_free_lookup; 84 *b = (i * BLOCKS_PER_MAP); 85 while ((*b) * td->o.rw_min_bs < f->real_file_size) { 86 if (f->file_map[i] != -1UL) { 87 *b += ffz(f->file_map[i]); 88 f->last_free_lookup = i; 89 return 0; 90 } 91 92 *b += BLOCKS_PER_MAP; 93 i++; 94 } 95 96 return 1; 97} 98 99static int get_next_rand_offset(struct thread_data *td, struct fio_file *f, 100 int ddir, unsigned long long *b) 101{ 102 unsigned long long max_blocks = f->io_size / td->o.min_bs[ddir]; 103 unsigned long long r, rb; 104 int loops = 5; 105 106 do { 107 r = os_random_long(&td->random_state); 108 if (!max_blocks) 109 *b = 0; 110 else 111 *b = ((max_blocks - 1) * r / (unsigned long long) (RAND_MAX+1.0)); 112 if (td->o.norandommap) 113 break; 114 rb = *b + (f->file_offset / td->o.min_bs[ddir]); 115 loops--; 116 } while (!random_map_free(td, f, rb) && loops); 117 118 /* 119 * if we failed to retrieve a truly random offset within 120 * the loops assigned, see if there are free ones left at all 121 */ 122 if (!loops && get_next_free_block(td, f, b)) 123 return 1; 124 125 return 0; 126} 127 128/* 129 * For random io, generate a random new block and see if it's used. Repeat 130 * until we find a free one. For sequential io, just return the end of 131 * the last io issued. 132 */ 133static int get_next_offset(struct thread_data *td, struct io_u *io_u) 134{ 135 struct fio_file *f = io_u->file; 136 const int ddir = io_u->ddir; 137 unsigned long long b; 138 139 if (td_random(td) && (td->o.ddir_nr && !--td->ddir_nr)) { 140 td->ddir_nr = td->o.ddir_nr; 141 142 if (get_next_rand_offset(td, f, ddir, &b)) 143 return 1; 144 } else { 145 if (f->last_pos >= f->real_file_size) 146 return 1; 147 148 b = f->last_pos / td->o.min_bs[ddir]; 149 } 150 151 io_u->offset = (b * td->o.min_bs[ddir]) + f->file_offset; 152 if (io_u->offset >= f->real_file_size) 153 return 1; 154 155 return 0; 156} 157 158static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u) 159{ 160 const int ddir = io_u->ddir; 161 unsigned int buflen; 162 long r; 163 164 if (td->o.min_bs[ddir] == td->o.max_bs[ddir]) 165 buflen = td->o.min_bs[ddir]; 166 else { 167 r = os_random_long(&td->bsrange_state); 168 buflen = (unsigned int) (1 + (double) (td->o.max_bs[ddir] - 1) * r / (RAND_MAX + 1.0)); 169 if (!td->o.bs_unaligned) 170 buflen = (buflen + td->o.min_bs[ddir] - 1) & ~(td->o.min_bs[ddir] - 1); 171 } 172 173 return buflen; 174} 175 176static void set_rwmix_bytes(struct thread_data *td) 177{ 178 unsigned long long rbytes; 179 unsigned int diff; 180 181 /* 182 * we do time or byte based switch. this is needed because 183 * buffered writes may issue a lot quicker than they complete, 184 * whereas reads do not. 185 */ 186 rbytes = td->io_bytes[td->rwmix_ddir] - td->rwmix_bytes; 187 diff = td->o.rwmix[td->rwmix_ddir ^ 1]; 188 189 td->rwmix_bytes = td->io_bytes[td->rwmix_ddir] + (rbytes * ((100 - diff)) / diff); 190} 191 192static inline enum fio_ddir get_rand_ddir(struct thread_data *td) 193{ 194 unsigned int v; 195 long r; 196 197 r = os_random_long(&td->rwmix_state); 198 v = 1 + (int) (100.0 * (r / (RAND_MAX + 1.0))); 199 if (v < td->o.rwmix[DDIR_READ]) 200 return DDIR_READ; 201 202 return DDIR_WRITE; 203} 204 205/* 206 * Return the data direction for the next io_u. If the job is a 207 * mixed read/write workload, check the rwmix cycle and switch if 208 * necessary. 209 */ 210static enum fio_ddir get_rw_ddir(struct thread_data *td) 211{ 212 if (td_rw(td)) { 213 struct timeval now; 214 unsigned long elapsed; 215 unsigned int cycle; 216 217 fio_gettime(&now, NULL); 218 elapsed = mtime_since_now(&td->rwmix_switch); 219 220 /* 221 * if this is the first cycle, make it shorter 222 */ 223 cycle = td->o.rwmixcycle; 224 if (!td->rwmix_bytes) 225 cycle /= 10; 226 227 /* 228 * Check if it's time to seed a new data direction. 229 */ 230 if (elapsed >= cycle || 231 td->io_bytes[td->rwmix_ddir] >= td->rwmix_bytes) { 232 unsigned long long max_bytes; 233 enum fio_ddir ddir; 234 235 /* 236 * Put a top limit on how many bytes we do for 237 * one data direction, to avoid overflowing the 238 * ranges too much 239 */ 240 ddir = get_rand_ddir(td); 241 max_bytes = td->this_io_bytes[ddir]; 242 if (max_bytes >= (td->o.size * td->o.rwmix[ddir] / 100)) { 243 if (!td->rw_end_set[ddir]) { 244 td->rw_end_set[ddir] = 1; 245 memcpy(&td->rw_end[ddir], &now, sizeof(now)); 246 } 247 ddir ^= 1; 248 } 249 250 if (ddir != td->rwmix_ddir) 251 set_rwmix_bytes(td); 252 253 td->rwmix_ddir = ddir; 254 memcpy(&td->rwmix_switch, &now, sizeof(now)); 255 } 256 return td->rwmix_ddir; 257 } else if (td_read(td)) 258 return DDIR_READ; 259 else 260 return DDIR_WRITE; 261} 262 263void put_io_u(struct thread_data *td, struct io_u *io_u) 264{ 265 assert((io_u->flags & IO_U_F_FREE) == 0); 266 io_u->flags |= IO_U_F_FREE; 267 268 io_u->file = NULL; 269 list_del(&io_u->list); 270 list_add(&io_u->list, &td->io_u_freelist); 271 td->cur_depth--; 272} 273 274void requeue_io_u(struct thread_data *td, struct io_u **io_u) 275{ 276 struct io_u *__io_u = *io_u; 277 278 __io_u->flags |= IO_U_F_FREE; 279 __io_u->flags &= ~IO_U_F_FLIGHT; 280 281 list_del(&__io_u->list); 282 list_add_tail(&__io_u->list, &td->io_u_requeues); 283 td->cur_depth--; 284 *io_u = NULL; 285} 286 287static int fill_io_u(struct thread_data *td, struct io_u *io_u) 288{ 289 /* 290 * If using an iolog, grab next piece if any available. 291 */ 292 if (td->o.read_iolog) 293 return read_iolog_get(td, io_u); 294 295 /* 296 * see if it's time to sync 297 */ 298 if (td->o.fsync_blocks && 299 !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) && 300 td->io_issues[DDIR_WRITE] && should_fsync(td)) { 301 io_u->ddir = DDIR_SYNC; 302 goto out; 303 } 304 305 io_u->ddir = get_rw_ddir(td); 306 307 /* 308 * No log, let the seq/rand engine retrieve the next buflen and 309 * position. 310 */ 311 if (get_next_offset(td, io_u)) 312 return 1; 313 314 io_u->buflen = get_next_buflen(td, io_u); 315 if (!io_u->buflen) 316 return 1; 317 318 /* 319 * mark entry before potentially trimming io_u 320 */ 321 if (td_random(td) && !td->o.norandommap) 322 mark_random_map(td, io_u); 323 324 /* 325 * If using a write iolog, store this entry. 326 */ 327out: 328 if (td->o.write_iolog_file) 329 write_iolog_put(td, io_u); 330 331 return 0; 332} 333 334void io_u_mark_depth(struct thread_data *td, struct io_u *io_u) 335{ 336 int index = 0; 337 338 if (io_u->ddir == DDIR_SYNC) 339 return; 340 341 switch (td->cur_depth) { 342 default: 343 index++; 344 case 32 ... 63: 345 index++; 346 case 16 ... 31: 347 index++; 348 case 8 ... 15: 349 index++; 350 case 4 ... 7: 351 index++; 352 case 2 ... 3: 353 index++; 354 case 1: 355 break; 356 } 357 358 td->ts.io_u_map[index]++; 359 td->ts.total_io_u[io_u->ddir]++; 360} 361 362static void io_u_mark_latency(struct thread_data *td, unsigned long msec) 363{ 364 int index = 0; 365 366 switch (msec) { 367 default: 368 index++; 369 case 1000 ... 1999: 370 index++; 371 case 750 ... 999: 372 index++; 373 case 500 ... 749: 374 index++; 375 case 250 ... 499: 376 index++; 377 case 100 ... 249: 378 index++; 379 case 50 ... 99: 380 index++; 381 case 20 ... 49: 382 index++; 383 case 10 ... 19: 384 index++; 385 case 4 ... 9: 386 index++; 387 case 2 ... 3: 388 index++; 389 case 0 ... 1: 390 break; 391 } 392 393 td->ts.io_u_lat[index]++; 394} 395 396/* 397 * Get next file to service by choosing one at random 398 */ 399static struct fio_file *get_next_file_rand(struct thread_data *td, int goodf, 400 int badf) 401{ 402 struct fio_file *f; 403 int fno; 404 405 do { 406 long r = os_random_long(&td->next_file_state); 407 408 fno = (unsigned int) ((double) td->o.nr_files * (r / (RAND_MAX + 1.0))); 409 f = &td->files[fno]; 410 if (f->flags & FIO_FILE_DONE) 411 continue; 412 413 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) 414 return f; 415 } while (1); 416} 417 418/* 419 * Get next file to service by doing round robin between all available ones 420 */ 421static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf, 422 int badf) 423{ 424 unsigned int old_next_file = td->next_file; 425 struct fio_file *f; 426 427 do { 428 f = &td->files[td->next_file]; 429 430 td->next_file++; 431 if (td->next_file >= td->o.nr_files) 432 td->next_file = 0; 433 434 if (f->flags & FIO_FILE_DONE) { 435 f = NULL; 436 continue; 437 } 438 439 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) 440 break; 441 442 f = NULL; 443 } while (td->next_file != old_next_file); 444 445 return f; 446} 447 448static struct fio_file *get_next_file(struct thread_data *td) 449{ 450 struct fio_file *f; 451 452 assert(td->o.nr_files <= td->files_index); 453 454 if (!td->nr_open_files || td->nr_done_files >= td->o.nr_files) 455 return NULL; 456 457 f = td->file_service_file; 458 if (f && (f->flags & FIO_FILE_OPEN) && td->file_service_left--) 459 return f; 460 461 if (td->o.file_service_type == FIO_FSERVICE_RR) 462 f = get_next_file_rr(td, FIO_FILE_OPEN, FIO_FILE_CLOSING); 463 else 464 f = get_next_file_rand(td, FIO_FILE_OPEN, FIO_FILE_CLOSING); 465 466 td->file_service_file = f; 467 td->file_service_left = td->file_service_nr - 1; 468 return f; 469} 470 471static struct fio_file *find_next_new_file(struct thread_data *td) 472{ 473 struct fio_file *f; 474 475 if (!td->nr_open_files || td->nr_done_files >= td->o.nr_files) 476 return NULL; 477 478 if (td->o.file_service_type == FIO_FSERVICE_RR) 479 f = get_next_file_rr(td, 0, FIO_FILE_OPEN); 480 else 481 f = get_next_file_rand(td, 0, FIO_FILE_OPEN); 482 483 return f; 484} 485 486struct io_u *__get_io_u(struct thread_data *td) 487{ 488 struct io_u *io_u = NULL; 489 490 if (!list_empty(&td->io_u_requeues)) 491 io_u = list_entry(td->io_u_requeues.next, struct io_u, list); 492 else if (!queue_full(td)) { 493 io_u = list_entry(td->io_u_freelist.next, struct io_u, list); 494 495 io_u->buflen = 0; 496 io_u->resid = 0; 497 io_u->file = NULL; 498 io_u->end_io = NULL; 499 } 500 501 if (io_u) { 502 assert(io_u->flags & IO_U_F_FREE); 503 io_u->flags &= ~IO_U_F_FREE; 504 505 io_u->error = 0; 506 list_del(&io_u->list); 507 list_add(&io_u->list, &td->io_u_busylist); 508 td->cur_depth++; 509 } 510 511 return io_u; 512} 513 514/* 515 * Return an io_u to be processed. Gets a buflen and offset, sets direction, 516 * etc. The returned io_u is fully ready to be prepped and submitted. 517 */ 518struct io_u *get_io_u(struct thread_data *td) 519{ 520 struct fio_file *f; 521 struct io_u *io_u; 522 int ret; 523 524 io_u = __get_io_u(td); 525 if (!io_u) 526 return NULL; 527 528 /* 529 * from a requeue, io_u already setup 530 */ 531 if (io_u->file) 532 goto out; 533 534 do { 535 f = get_next_file(td); 536 if (!f) { 537 put_io_u(td, io_u); 538 return NULL; 539 } 540 541set_file: 542 io_u->file = f; 543 544 if (!fill_io_u(td, io_u)) 545 break; 546 547 /* 548 * No more to do for this file, close it 549 */ 550 io_u->file = NULL; 551 td_io_close_file(td, f); 552 f->flags |= FIO_FILE_DONE; 553 td->nr_done_files++; 554 555 /* 556 * probably not the right place to do this, but see 557 * if we need to open a new file 558 */ 559 if (td->nr_open_files < td->o.open_files && 560 td->o.open_files != td->o.nr_files) { 561 f = find_next_new_file(td); 562 563 if (!f || (ret = td_io_open_file(td, f))) { 564 put_io_u(td, io_u); 565 return NULL; 566 } 567 goto set_file; 568 } 569 } while (1); 570 571 if (td->zone_bytes >= td->o.zone_size) { 572 td->zone_bytes = 0; 573 f->last_pos += td->o.zone_skip; 574 } 575 576 if (io_u->ddir != DDIR_SYNC) { 577 if (!io_u->buflen) { 578 put_io_u(td, io_u); 579 return NULL; 580 } 581 582 f->last_pos = io_u->offset + io_u->buflen; 583 584 if (td->o.verify != VERIFY_NONE) 585 populate_verify_io_u(td, io_u); 586 } 587 588 /* 589 * Set io data pointers. 590 */ 591out: 592 io_u->xfer_buf = io_u->buf; 593 io_u->xfer_buflen = io_u->buflen; 594 595 if (td_io_prep(td, io_u)) { 596 put_io_u(td, io_u); 597 return NULL; 598 } 599 600 fio_gettime(&io_u->start_time, NULL); 601 return io_u; 602} 603 604void io_u_log_error(struct thread_data *td, struct io_u *io_u) 605{ 606 const char *msg[] = { "read", "write", "sync" }; 607 608 log_err("fio: io_u error"); 609 610 if (io_u->file) 611 log_err(" on file %s", io_u->file->file_name); 612 613 log_err(": %s\n", strerror(io_u->error)); 614 615 log_err(" %s offset=%llu, buflen=%lu\n", msg[io_u->ddir], io_u->offset, io_u->xfer_buflen); 616 617 if (!td->error) 618 td_verror(td, io_u->error, "io_u error"); 619} 620 621static void io_completed(struct thread_data *td, struct io_u *io_u, 622 struct io_completion_data *icd) 623{ 624 unsigned long msec; 625 626 assert(io_u->flags & IO_U_F_FLIGHT); 627 io_u->flags &= ~IO_U_F_FLIGHT; 628 629 put_file(td, io_u->file); 630 631 if (io_u->ddir == DDIR_SYNC) { 632 td->last_was_sync = 1; 633 return; 634 } 635 636 td->last_was_sync = 0; 637 638 if (!io_u->error) { 639 unsigned int bytes = io_u->buflen - io_u->resid; 640 const enum fio_ddir idx = io_u->ddir; 641 int ret; 642 643 td->io_blocks[idx]++; 644 td->io_bytes[idx] += bytes; 645 td->zone_bytes += bytes; 646 td->this_io_bytes[idx] += bytes; 647 648 io_u->file->last_completed_pos = io_u->offset + io_u->buflen; 649 650 msec = mtime_since(&io_u->issue_time, &icd->time); 651 652 add_clat_sample(td, idx, msec); 653 add_bw_sample(td, idx, &icd->time); 654 io_u_mark_latency(td, msec); 655 656 if ((td_rw(td) || td_write(td)) && idx == DDIR_WRITE && 657 td->o.verify != VERIFY_NONE) 658 log_io_piece(td, io_u); 659 660 icd->bytes_done[idx] += bytes; 661 662 if (io_u->end_io) { 663 ret = io_u->end_io(td, io_u); 664 if (ret && !icd->error) 665 icd->error = ret; 666 } 667 } else { 668 icd->error = io_u->error; 669 io_u_log_error(td, io_u); 670 } 671} 672 673static void init_icd(struct io_completion_data *icd, int nr) 674{ 675 fio_gettime(&icd->time, NULL); 676 677 icd->nr = nr; 678 679 icd->error = 0; 680 icd->bytes_done[0] = icd->bytes_done[1] = 0; 681} 682 683static void ios_completed(struct thread_data *td, 684 struct io_completion_data *icd) 685{ 686 struct io_u *io_u; 687 int i; 688 689 for (i = 0; i < icd->nr; i++) { 690 io_u = td->io_ops->event(td, i); 691 692 io_completed(td, io_u, icd); 693 put_io_u(td, io_u); 694 } 695} 696 697/* 698 * Complete a single io_u for the sync engines. 699 */ 700long io_u_sync_complete(struct thread_data *td, struct io_u *io_u) 701{ 702 struct io_completion_data icd; 703 704 init_icd(&icd, 1); 705 io_completed(td, io_u, &icd); 706 put_io_u(td, io_u); 707 708 if (!icd.error) 709 return icd.bytes_done[0] + icd.bytes_done[1]; 710 711 td_verror(td, icd.error, "io_u_sync_complete"); 712 return -1; 713} 714 715/* 716 * Called to complete min_events number of io for the async engines. 717 */ 718long io_u_queued_complete(struct thread_data *td, int min_events) 719{ 720 struct io_completion_data icd; 721 struct timespec *tvp = NULL; 722 int ret; 723 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, }; 724 725 if (!min_events) 726 tvp = &ts; 727 728 ret = td_io_getevents(td, min_events, td->cur_depth, tvp); 729 if (ret < 0) { 730 td_verror(td, -ret, "td_io_getevents"); 731 return ret; 732 } else if (!ret) 733 return ret; 734 735 init_icd(&icd, ret); 736 ios_completed(td, &icd); 737 if (!icd.error) 738 return icd.bytes_done[0] + icd.bytes_done[1]; 739 740 td_verror(td, icd.error, "io_u_queued_complete"); 741 return -1; 742} 743 744/* 745 * Call when io_u is really queued, to update the submission latency. 746 */ 747void io_u_queued(struct thread_data *td, struct io_u *io_u) 748{ 749 unsigned long slat_time; 750 751 slat_time = mtime_since(&io_u->start_time, &io_u->issue_time); 752 add_slat_sample(td, io_u->ddir, slat_time); 753} 754 755#ifdef FIO_USE_TIMEOUT 756void io_u_set_timeout(struct thread_data *td) 757{ 758 assert(td->cur_depth); 759 760 td->timer.it_interval.tv_sec = 0; 761 td->timer.it_interval.tv_usec = 0; 762 td->timer.it_value.tv_sec = IO_U_TIMEOUT + IO_U_TIMEOUT_INC; 763 td->timer.it_value.tv_usec = 0; 764 setitimer(ITIMER_REAL, &td->timer, NULL); 765 fio_gettime(&td->timeout_end, NULL); 766} 767 768static void io_u_dump(struct io_u *io_u) 769{ 770 unsigned long t_start = mtime_since_now(&io_u->start_time); 771 unsigned long t_issue = mtime_since_now(&io_u->issue_time); 772 773 log_err("io_u=%p, t_start=%lu, t_issue=%lu\n", io_u, t_start, t_issue); 774 log_err(" buf=%p/%p, len=%lu/%lu, offset=%llu\n", io_u->buf, io_u->xfer_buf, io_u->buflen, io_u->xfer_buflen, io_u->offset); 775 log_err(" ddir=%d, fname=%s\n", io_u->ddir, io_u->file->file_name); 776} 777#else 778void io_u_set_timeout(struct thread_data fio_unused *td) 779{ 780} 781#endif 782 783#ifdef FIO_USE_TIMEOUT 784static void io_u_timeout_handler(int fio_unused sig) 785{ 786 struct thread_data *td, *__td; 787 pid_t pid = getpid(); 788 struct list_head *entry; 789 struct io_u *io_u; 790 int i; 791 792 log_err("fio: io_u timeout\n"); 793 794 /* 795 * TLS would be nice... 796 */ 797 td = NULL; 798 for_each_td(__td, i) { 799 if (__td->pid == pid) { 800 td = __td; 801 break; 802 } 803 } 804 805 if (!td) { 806 log_err("fio: io_u timeout, can't find job\n"); 807 exit(1); 808 } 809 810 if (!td->cur_depth) { 811 log_err("fio: timeout without pending work?\n"); 812 return; 813 } 814 815 log_err("fio: io_u timeout: job=%s, pid=%d\n", td->o.name, td->pid); 816 817 list_for_each(entry, &td->io_u_busylist) { 818 io_u = list_entry(entry, struct io_u, list); 819 820 io_u_dump(io_u); 821 } 822 823 td_verror(td, ETIMEDOUT, "io_u timeout"); 824 exit(1); 825} 826#endif 827 828void io_u_init_timeout(void) 829{ 830#ifdef FIO_USE_TIMEOUT 831 signal(SIGALRM, io_u_timeout_handler); 832#endif 833} 834