io_u.c revision d7762cf829fd3e44e50e5e2e9889b6449772097c
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <assert.h>

#include "fio.h"
#include "os.h"

/*
 * Change this define to play with the timeout handling
 */
#undef FIO_USE_TIMEOUT

struct io_completion_data {
	int nr;				/* input */

	int error;			/* output */
	unsigned long bytes_done[2];	/* output */
	struct timeval time;		/* output */
};

/*
 * The ->file_map[] contains a map of blocks we have or have not done io
 * to yet. Used to make sure we cover the entire range in a fair fashion.
 */
static int random_map_free(struct thread_data *td, struct fio_file *f,
			   unsigned long long block)
{
	unsigned int idx = RAND_MAP_IDX(td, f, block);
	unsigned int bit = RAND_MAP_BIT(td, f, block);

	return (f->file_map[idx] & (1UL << bit)) == 0;
}

/*
 * Mark a given offset as used in the map.
 */
static void mark_random_map(struct thread_data *td, struct fio_file *f,
			    struct io_u *io_u)
{
	unsigned int min_bs = td->rw_min_bs;
	unsigned long long block;
	unsigned int blocks;
	unsigned int nr_blocks;

	block = io_u->offset / (unsigned long long) min_bs;
	blocks = 0;
	nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;

	while (blocks < nr_blocks) {
		unsigned int idx, bit;

		if (!random_map_free(td, f, block))
			break;

		idx = RAND_MAP_IDX(td, f, block);
		bit = RAND_MAP_BIT(td, f, block);

		fio_assert(td, idx < f->num_maps);

		f->file_map[idx] |= (1UL << bit);
		block++;
		blocks++;
	}

	if ((blocks * min_bs) < io_u->buflen)
		io_u->buflen = blocks * min_bs;
}

/*
 * Return the next free block in the map.
 */
static int get_next_free_block(struct thread_data *td, struct fio_file *f,
			       unsigned long long *b)
{
	int i;

	i = f->last_free_lookup;
	*b = (i * BLOCKS_PER_MAP);
	while ((*b) * td->rw_min_bs < f->real_file_size) {
		if (f->file_map[i] != -1UL) {
			*b += ffz(f->file_map[i]);
			f->last_free_lookup = i;
			return 0;
		}

		*b += BLOCKS_PER_MAP;
		i++;
	}

	return 1;
}
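
/*
 * Note (illustrative, not part of this revision): file_map[] is treated as a
 * flat bitmap with BLOCKS_PER_MAP bits per word, so RAND_MAP_IDX() and
 * RAND_MAP_BIT() presumably reduce to something like:
 *
 *	idx = block / BLOCKS_PER_MAP;
 *	bit = block % BLOCKS_PER_MAP;
 *
 * which makes random_map_free() a single bit test and get_next_free_block()
 * a scan for the first map word that still has a zero bit (via ffz()).
 */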

/*
 * For random io, generate a random new block and see if it's used. Repeat
 * until we find a free one. For sequential io, just return the end of
 * the last io issued.
 */
static int get_next_offset(struct thread_data *td, struct fio_file *f,
			   struct io_u *io_u)
{
	const int ddir = io_u->ddir;
	unsigned long long b, rb;
	long r;

	if (td_random(td)) {
		unsigned long long max_blocks = f->file_size / td->min_bs[ddir];
		int loops = 5;

		do {
			r = os_random_long(&td->random_state);
			b = ((max_blocks - 1) * r / (unsigned long long) (RAND_MAX+1.0));
			if (td->norandommap)
				break;
			rb = b + (f->file_offset / td->min_bs[ddir]);
			loops--;
		} while (!random_map_free(td, f, rb) && loops);

		/*
		 * if we failed to retrieve a truly random offset within
		 * the loops assigned, see if there are free ones left at all
		 */
		if (!loops && get_next_free_block(td, f, &b))
			return 1;
	} else
		b = f->last_pos / td->min_bs[ddir];

	io_u->offset = (b * td->min_bs[ddir]) + f->file_offset;
	if (io_u->offset >= f->real_file_size)
		return 1;

	return 0;
}

static unsigned int get_next_buflen(struct thread_data *td, struct fio_file *f,
				    struct io_u *io_u)
{
	const int ddir = io_u->ddir;
	unsigned int buflen;
	long r;

	if (td->min_bs[ddir] == td->max_bs[ddir])
		buflen = td->min_bs[ddir];
	else {
		r = os_random_long(&td->bsrange_state);
		buflen = (unsigned int) (1 + (double) (td->max_bs[ddir] - 1) * r / (RAND_MAX + 1.0));
		if (!td->bs_unaligned)
			buflen = (buflen + td->min_bs[ddir] - 1) & ~(td->min_bs[ddir] - 1);
	}

	while (buflen + io_u->offset > f->real_file_size) {
		if (buflen == td->min_bs[ddir])
			return 0;

		buflen = td->min_bs[ddir];
	}

	return buflen;
}
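
/*
 * Worked example (illustrative, not part of this revision): when bs_unaligned
 * is not set, the mask in get_next_buflen() rounds the random length up to a
 * multiple of min_bs, assuming min_bs is a power of two. With min_bs = 4096
 * and a raw buflen of 5000:
 *
 *	(5000 + 4095) & ~4095 = 9095 & ~4095 = 8192
 *
 * i.e. two full 4k blocks. If the rounded length would run past
 * real_file_size, the trailing while loop falls back to a single min_bs
 * block, or returns 0 if even that does not fit.
 */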

/*
 * Return the data direction for the next io_u. If the job is a
 * mixed read/write workload, check the rwmix cycle and switch if
 * necessary.
 */
static enum fio_ddir get_rw_ddir(struct thread_data *td)
{
	if (td_rw(td)) {
		struct timeval now;
		unsigned long elapsed;

		fio_gettime(&now, NULL);
		elapsed = mtime_since_now(&td->rwmix_switch);

		/*
		 * Check if it's time to seed a new data direction.
		 */
		if (elapsed >= td->rwmixcycle) {
			unsigned int v;
			long r;

			r = os_random_long(&td->rwmix_state);
			v = 1 + (int) (100.0 * (r / (RAND_MAX + 1.0)));
			if (v < td->rwmixread)
				td->rwmix_ddir = DDIR_READ;
			else
				td->rwmix_ddir = DDIR_WRITE;
			memcpy(&td->rwmix_switch, &now, sizeof(now));
		}
		return td->rwmix_ddir;
	} else if (td_read(td))
		return DDIR_READ;
	else
		return DDIR_WRITE;
}

void put_io_u(struct thread_data *td, struct io_u *io_u)
{
	assert((io_u->flags & IO_U_F_FREE) == 0);
	io_u->flags |= IO_U_F_FREE;

	io_u->file = NULL;
	list_del(&io_u->list);
	list_add(&io_u->list, &td->io_u_freelist);
	td->cur_depth--;
}

void requeue_io_u(struct thread_data *td, struct io_u **io_u)
{
	struct io_u *__io_u = *io_u;

	list_del(&__io_u->list);
	list_add_tail(&__io_u->list, &td->io_u_requeues);
	td->cur_depth--;
	*io_u = NULL;
}

static int fill_io_u(struct thread_data *td, struct fio_file *f,
		     struct io_u *io_u)
{
	/*
	 * If using an iolog, grab next piece if any available.
	 */
	if (td->read_iolog)
		return read_iolog_get(td, io_u);

	/*
	 * see if it's time to sync
	 */
	if (td->fsync_blocks && !(td->io_issues[DDIR_WRITE] % td->fsync_blocks)
	    && td->io_issues[DDIR_WRITE] && should_fsync(td)) {
		io_u->ddir = DDIR_SYNC;
		io_u->file = f;
		return 0;
	}

	io_u->ddir = get_rw_ddir(td);

	/*
	 * No log, let the seq/rand engine retrieve the next buflen and
	 * position.
	 */
	if (get_next_offset(td, f, io_u))
		return 1;

	io_u->buflen = get_next_buflen(td, f, io_u);
	if (!io_u->buflen)
		return 1;

	/*
	 * mark entry before potentially trimming io_u
	 */
	if (!td->read_iolog && td_random(td) && !td->norandommap)
		mark_random_map(td, f, io_u);

	/*
	 * If using a write iolog, store this entry.
	 */
	if (td->write_iolog_file)
		write_iolog_put(td, io_u);

	io_u->file = f;
	return 0;
}

static void io_u_mark_depth(struct thread_data *td)
{
	int index = 0;

	switch (td->cur_depth) {
	default:
		index++;
	case 32 ... 63:
		index++;
	case 16 ... 31:
		index++;
	case 8 ... 15:
		index++;
	case 4 ... 7:
		index++;
	case 2 ... 3:
		index++;
	case 1:
		break;
	}

	td->io_u_map[index]++;
	td->total_io_u++;
}

static void io_u_mark_latency(struct thread_data *td, unsigned long msec)
{
	int index = 0;

	switch (msec) {
	default:
		index++;
	case 1000 ... 1999:
		index++;
	case 750 ... 999:
		index++;
	case 500 ... 749:
		index++;
	case 250 ... 499:
		index++;
	case 100 ... 249:
		index++;
	case 50 ... 99:
		index++;
	case 20 ... 49:
		index++;
	case 10 ... 19:
		index++;
	case 4 ... 9:
		index++;
	case 2 ... 3:
		index++;
	case 0 ... 1:
		break;
	}

	td->io_u_lat[index]++;
}
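
/*
 * Note (illustrative, not part of this revision): the two switch statements
 * above rely on deliberate case fall-through to turn a value into a
 * power-of-two histogram bucket. The matching case label only picks the
 * entry point; every index++ below it still executes. For example, in
 * io_u_mark_depth() a cur_depth of 12 enters at "8 ... 15" and runs three
 * index++ statements on the way down to "case 1", so it is counted in
 * io_u_map[3]. The depth buckets are therefore 1, 2-3, 4-7, 8-15, 16-31,
 * 32-63 and 64+, at indices 0 through 6; io_u_mark_latency() bins
 * completion latency in milliseconds the same way.
 */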

/*
 * Get next file to service by choosing one at random
 */
static struct fio_file *get_next_file_rand(struct thread_data *td)
{
	long r = os_random_long(&td->next_file_state);
	unsigned int fileno;
	struct fio_file *f;

	do {
		fileno = (unsigned int) ((double) (td->nr_files - 1) * r / (RAND_MAX + 1.0));
		f = &td->files[fileno];
		if (f->fd != -1)
			return f;
	} while (1);
}

/*
 * Get next file to service by doing round robin between all available ones
 */
static struct fio_file *get_next_file_rr(struct thread_data *td)
{
	unsigned int old_next_file = td->next_file;
	struct fio_file *f;

	do {
		f = &td->files[td->next_file];

		td->next_file++;
		if (td->next_file >= td->nr_files)
			td->next_file = 0;

		if (f->fd != -1)
			break;

		f = NULL;
	} while (td->next_file != old_next_file);

	return f;
}

struct io_u *__get_io_u(struct thread_data *td)
{
	struct io_u *io_u = NULL;

	if (!list_empty(&td->io_u_requeues))
		io_u = list_entry(td->io_u_requeues.next, struct io_u, list);
	else if (!queue_full(td)) {
		io_u = list_entry(td->io_u_freelist.next, struct io_u, list);

		io_u->buflen = 0;
		io_u->resid = 0;
		io_u->file = NULL;
		io_u->end_io = NULL;
	}

	if (io_u) {
		assert(io_u->flags & IO_U_F_FREE);
		io_u->flags &= ~IO_U_F_FREE;

		io_u->error = 0;
		list_del(&io_u->list);
		list_add(&io_u->list, &td->io_u_busylist);
		td->cur_depth++;
		io_u_mark_depth(td);
	}

	return io_u;
}

/*
 * Return an io_u to be processed. Gets a buflen and offset, sets direction,
 * etc. The returned io_u is fully ready to be prepped and submitted.
 */
struct io_u *get_io_u(struct thread_data *td)
{
	struct fio_file *f;
	struct io_u *io_u;

	io_u = __get_io_u(td);
	if (!io_u)
		return NULL;

	/*
	 * from a requeue, io_u already setup
	 */
	if (io_u->file)
		goto out;

	if (td->file_service_type == FIO_FSERVICE_RR)
		f = get_next_file_rr(td);
	else
		f = get_next_file_rand(td);

	if (!f) {
		put_io_u(td, io_u);
		return NULL;
	}

	io_u->file = f;

	if (td->zone_bytes >= td->zone_size) {
		td->zone_bytes = 0;
		f->last_pos += td->zone_skip;
	}

	if (fill_io_u(td, f, io_u)) {
		put_io_u(td, io_u);
		return NULL;
	}

	if (io_u->buflen + io_u->offset > f->real_file_size) {
		if (td->io_ops->flags & FIO_RAWIO) {
			put_io_u(td, io_u);
			return NULL;
		}

		io_u->buflen = f->real_file_size - io_u->offset;
	}

	if (io_u->ddir != DDIR_SYNC) {
		if (!io_u->buflen) {
			put_io_u(td, io_u);
			return NULL;
		}

		f->last_pos = io_u->offset + io_u->buflen;

		if (td->verify != VERIFY_NONE)
			populate_verify_io_u(td, io_u);
	}

	/*
	 * Set io data pointers.
	 */
out:
	io_u->xfer_buf = io_u->buf;
	io_u->xfer_buflen = io_u->buflen;

	if (td_io_prep(td, io_u)) {
		put_io_u(td, io_u);
		return NULL;
	}

	fio_gettime(&io_u->start_time, NULL);
	return io_u;
}
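
/*
 * Illustrative caller sketch (hypothetical, not part of this file): the
 * job's main io loop is assumed to drive this API roughly as follows,
 * pairing each successful get_io_u() with a sync or queued completion:
 *
 *	struct io_u *io_u = get_io_u(td);
 *	if (!io_u)
 *		break;
 *
 *	ret = td_io_queue(td, io_u);
 *	switch (ret) {
 *	case FIO_Q_COMPLETED:
 *		bytes = io_u_sync_complete(td, io_u);
 *		break;
 *	case FIO_Q_QUEUED:
 *		bytes = io_u_queued_complete(td, 1);
 *		break;
 *	case FIO_Q_BUSY:
 *		requeue_io_u(td, &io_u);
 *		break;
 *	}
 *
 * Error handling and rate/thinktime bookkeeping are omitted here.
 */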

static void io_completed(struct thread_data *td, struct io_u *io_u,
			 struct io_completion_data *icd)
{
	unsigned long msec;

	assert(io_u->flags & IO_U_F_FLIGHT);
	io_u->flags &= ~IO_U_F_FLIGHT;

	if (io_u->ddir == DDIR_SYNC) {
		td->last_was_sync = 1;
		return;
	}

	td->last_was_sync = 0;

	if (!io_u->error) {
		unsigned int bytes = io_u->buflen - io_u->resid;
		const enum fio_ddir idx = io_u->ddir;
		int ret;

		td->io_blocks[idx]++;
		td->io_bytes[idx] += bytes;
		td->zone_bytes += bytes;
		td->this_io_bytes[idx] += bytes;

		io_u->file->last_completed_pos = io_u->offset + io_u->buflen;

		msec = mtime_since(&io_u->issue_time, &icd->time);

		add_clat_sample(td, idx, msec);
		add_bw_sample(td, idx, &icd->time);
		io_u_mark_latency(td, msec);

		if ((td_rw(td) || td_write(td)) && idx == DDIR_WRITE)
			log_io_piece(td, io_u);

		icd->bytes_done[idx] += bytes;

		if (io_u->end_io) {
			ret = io_u->end_io(io_u);
			if (ret && !icd->error)
				icd->error = ret;
		}
	} else
		icd->error = io_u->error;
}

static void init_icd(struct io_completion_data *icd, int nr)
{
	fio_gettime(&icd->time, NULL);

	icd->nr = nr;

	icd->error = 0;
	icd->bytes_done[0] = icd->bytes_done[1] = 0;
}

static void ios_completed(struct thread_data *td,
			  struct io_completion_data *icd)
{
	struct io_u *io_u;
	int i;

	for (i = 0; i < icd->nr; i++) {
		io_u = td->io_ops->event(td, i);

		io_completed(td, io_u, icd);
		put_io_u(td, io_u);
	}
}

/*
 * Complete a single io_u for the sync engines.
 */
long io_u_sync_complete(struct thread_data *td, struct io_u *io_u)
{
	struct io_completion_data icd;

	init_icd(&icd, 1);
	io_completed(td, io_u, &icd);
	put_io_u(td, io_u);

	if (!icd.error)
		return icd.bytes_done[0] + icd.bytes_done[1];

	return -1;
}

/*
 * Called to complete min_events number of io for the async engines.
 */
long io_u_queued_complete(struct thread_data *td, int min_events)
{
	struct io_completion_data icd;
	struct timespec *tvp = NULL;
	int ret;

	if (min_events > 0) {
		ret = td_io_commit(td);
		if (ret < 0) {
			td_verror(td, -ret, "td_io_commit");
			return ret;
		}
	} else {
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };

		tvp = &ts;
	}

	ret = td_io_getevents(td, min_events, td->cur_depth, tvp);
	if (ret < 0) {
		td_verror(td, -ret, "td_io_getevents");
		return ret;
	} else if (!ret)
		return ret;

	init_icd(&icd, ret);
	ios_completed(td, &icd);
	if (!icd.error)
		return icd.bytes_done[0] + icd.bytes_done[1];

	return -1;
}

/*
 * Call when io_u is really queued, to update the submission latency.
 */
void io_u_queued(struct thread_data *td, struct io_u *io_u)
{
	unsigned long slat_time;

	slat_time = mtime_since(&io_u->start_time, &io_u->issue_time);
	add_slat_sample(td, io_u->ddir, slat_time);
}

#ifdef FIO_USE_TIMEOUT
void io_u_set_timeout(struct thread_data *td)
{
	assert(td->cur_depth);

	td->timer.it_interval.tv_sec = 0;
	td->timer.it_interval.tv_usec = 0;
	td->timer.it_value.tv_sec = IO_U_TIMEOUT + IO_U_TIMEOUT_INC;
	td->timer.it_value.tv_usec = 0;
	setitimer(ITIMER_REAL, &td->timer, NULL);
	fio_gettime(&td->timeout_end, NULL);
}

static void io_u_dump(struct io_u *io_u)
{
	unsigned long t_start = mtime_since_now(&io_u->start_time);
	unsigned long t_issue = mtime_since_now(&io_u->issue_time);

	log_err("io_u=%p, t_start=%lu, t_issue=%lu\n", io_u, t_start, t_issue);
	log_err("  buf=%p/%p, len=%lu/%lu, offset=%llu\n", io_u->buf, io_u->xfer_buf, io_u->buflen, io_u->xfer_buflen, io_u->offset);
	log_err("  ddir=%d, fname=%s\n", io_u->ddir, io_u->file->file_name);
}
#else
void io_u_set_timeout(struct thread_data fio_unused *td)
{
}
#endif

#ifdef FIO_USE_TIMEOUT
static void io_u_timeout_handler(int fio_unused sig)
{
	struct thread_data *td, *__td;
	pid_t pid = getpid();
	struct list_head *entry;
	struct io_u *io_u;
	int i;

	log_err("fio: io_u timeout\n");

	/*
	 * TLS would be nice...
	 */
	td = NULL;
	for_each_td(__td, i) {
		if (__td->pid == pid) {
			td = __td;
			break;
		}
	}

	if (!td) {
		log_err("fio: io_u timeout, can't find job\n");
		exit(1);
	}

	if (!td->cur_depth) {
		log_err("fio: timeout without pending work?\n");
		return;
	}

	log_err("fio: io_u timeout: job=%s, pid=%d\n", td->name, td->pid);

	list_for_each(entry, &td->io_u_busylist) {
		io_u = list_entry(entry, struct io_u, list);

		io_u_dump(io_u);
	}

	td_verror(td, ETIMEDOUT, "io_u timeout");
	exit(1);
}
#endif

void io_u_init_timeout(void)
{
#ifdef FIO_USE_TIMEOUT
	signal(SIGALRM, io_u_timeout_handler);
#endif
}
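
/*
 * Usage note (illustrative, not part of this revision): io_u_queued_complete()
 * above has two modes. With min_events > 0 it first commits pending io and
 * then waits in td_io_getevents() with a NULL timeout until at least that
 * many events are available; with min_events == 0 it passes a zero timespec,
 * so e.g.
 *
 *	io_u_queued_complete(td, 0);
 *
 * is assumed to be the non-blocking "reap whatever has finished" form, while
 *
 *	io_u_queued_complete(td, td->cur_depth);
 *
 * would drain everything currently in flight. Both forms return the total
 * bytes completed across both data directions, or -1 if any io_u reported
 * an error.
 */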