cfq-iosched.c revision b217a903ab6581cba04f88c44284dcdd2a752561
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/blkdev.h> 11#include <linux/elevator.h> 12#include <linux/rbtree.h> 13#include <linux/ioprio.h> 14#include <linux/blktrace_api.h> 15 16/* 17 * tunables 18 */ 19/* max queue in one round of service */ 20static const int cfq_quantum = 4; 21static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 22/* maximum backwards seek, in KiB */ 23static const int cfq_back_max = 16 * 1024; 24/* penalty of a backwards seek */ 25static const int cfq_back_penalty = 2; 26static const int cfq_slice_sync = HZ / 10; 27static int cfq_slice_async = HZ / 25; 28static const int cfq_slice_async_rq = 2; 29static int cfq_slice_idle = HZ / 125; 30 31/* 32 * offset from end of service tree 33 */ 34#define CFQ_IDLE_DELAY (HZ / 5) 35 36/* 37 * below this threshold, we consider thinktime immediate 38 */ 39#define CFQ_MIN_TT (2) 40 41#define CFQ_SLICE_SCALE (5) 42#define CFQ_HW_QUEUE_MIN (5) 43 44#define RQ_CIC(rq) \ 45 ((struct cfq_io_context *) (rq)->elevator_private) 46#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 47 48static struct kmem_cache *cfq_pool; 49static struct kmem_cache *cfq_ioc_pool; 50 51static DEFINE_PER_CPU(unsigned long, ioc_count); 52static struct completion *ioc_gone; 53static DEFINE_SPINLOCK(ioc_gone_lock); 54 55#define CFQ_PRIO_LISTS IOPRIO_BE_NR 56#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 58 59#define sample_valid(samples) ((samples) > 80) 60 61/* 62 * Most of our rbtree usage is for sorting with min extraction, so 63 * if we cache the leftmost node we don't have to walk down the tree 64 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should 65 * move this into the elevator for the rq sorting as well.
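 * The cached pointer lives in cfq_rb_root->left: cfq_service_tree_add()
 * updates it when a queue is inserted at the leftmost position,
 * cfq_rb_erase() clears it when that node is removed, and cfq_rb_first()
 * lazily refills it via rb_first() when it is NULL.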
66 */ 67struct cfq_rb_root { 68 struct rb_root rb; 69 struct rb_node *left; 70}; 71#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } 72 73/* 74 * Per process-grouping structure 75 */ 76struct cfq_queue { 77 /* reference count */ 78 atomic_t ref; 79 /* various state flags, see below */ 80 unsigned int flags; 81 /* parent cfq_data */ 82 struct cfq_data *cfqd; 83 /* service_tree member */ 84 struct rb_node rb_node; 85 /* service_tree key */ 86 unsigned long rb_key; 87 /* prio tree member */ 88 struct rb_node p_node; 89 /* prio tree root we belong to, if any */ 90 struct rb_root *p_root; 91 /* sorted list of pending requests */ 92 struct rb_root sort_list; 93 /* if fifo isn't expired, next request to serve */ 94 struct request *next_rq; 95 /* requests queued in sort_list */ 96 int queued[2]; 97 /* currently allocated requests */ 98 int allocated[2]; 99 /* fifo list of requests in sort_list */ 100 struct list_head fifo; 101 102 unsigned long slice_end; 103 long slice_resid; 104 unsigned int slice_dispatch; 105 106 /* pending metadata requests */ 107 int meta_pending; 108 /* number of requests that are on the dispatch list or inside driver */ 109 int dispatched; 110 111 /* io prio of this group */ 112 unsigned short ioprio, org_ioprio; 113 unsigned short ioprio_class, org_ioprio_class; 114 115 pid_t pid; 116}; 117 118/* 119 * Per block device queue structure 120 */ 121struct cfq_data { 122 struct request_queue *queue; 123 124 /* 125 * rr list of queues with requests and the count of them 126 */ 127 struct cfq_rb_root service_tree; 128 129 /* 130 * Each priority tree is sorted by next_request position. These 131 * trees are used when determining if two or more queues are 132 * interleaving requests (see cfq_close_cooperator). 133 */ 134 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 135 136 unsigned int busy_queues; 137 138 int rq_in_driver[2]; 139 int sync_flight; 140 141 /* 142 * queue-depth detection 143 */ 144 int rq_queued; 145 int hw_tag; 146 int hw_tag_samples; 147 int rq_in_driver_peak; 148 149 /* 150 * idle window management 151 */ 152 struct timer_list idle_slice_timer; 153 struct work_struct unplug_work; 154 155 struct cfq_queue *active_queue; 156 struct cfq_io_context *active_cic; 157 158 /* 159 * async queue for each priority case 160 */ 161 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 162 struct cfq_queue *async_idle_cfqq; 163 164 sector_t last_position; 165 166 /* 167 * tunables, see top of file 168 */ 169 unsigned int cfq_quantum; 170 unsigned int cfq_fifo_expire[2]; 171 unsigned int cfq_back_penalty; 172 unsigned int cfq_back_max; 173 unsigned int cfq_slice[2]; 174 unsigned int cfq_slice_async_rq; 175 unsigned int cfq_slice_idle; 176 177 struct list_head cic_list; 178 179 /* 180 * Fallback dummy cfqq for extreme OOM conditions 181 */ 182 struct cfq_queue oom_cfqq; 183}; 184 185enum cfqq_state_flags { 186 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 187 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 188 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 189 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 190 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 191 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 192 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 193 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 194 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 195 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 196}; 197 198#define CFQ_CFQQ_FNS(name) \ 199static inline void 
cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 200{ \ 201 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 202} \ 203static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 204{ \ 205 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 206} \ 207static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 208{ \ 209 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 210} 211 212CFQ_CFQQ_FNS(on_rr); 213CFQ_CFQQ_FNS(wait_request); 214CFQ_CFQQ_FNS(must_dispatch); 215CFQ_CFQQ_FNS(must_alloc_slice); 216CFQ_CFQQ_FNS(fifo_expire); 217CFQ_CFQQ_FNS(idle_window); 218CFQ_CFQQ_FNS(prio_changed); 219CFQ_CFQQ_FNS(slice_new); 220CFQ_CFQQ_FNS(sync); 221CFQ_CFQQ_FNS(coop); 222#undef CFQ_CFQQ_FNS 223 224#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 225 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 226#define cfq_log(cfqd, fmt, args...) \ 227 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 228 229static void cfq_dispatch_insert(struct request_queue *, struct request *); 230static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, 231 struct io_context *, gfp_t); 232static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 233 struct io_context *); 234 235static inline int rq_in_driver(struct cfq_data *cfqd) 236{ 237 return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; 238} 239 240static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 241 int is_sync) 242{ 243 return cic->cfqq[!!is_sync]; 244} 245 246static inline void cic_set_cfqq(struct cfq_io_context *cic, 247 struct cfq_queue *cfqq, int is_sync) 248{ 249 cic->cfqq[!!is_sync] = cfqq; 250} 251 252/* 253 * We regard a request as SYNC, if it's either a read or has the SYNC bit 254 * set (in which case it could also be direct WRITE). 255 */ 256static inline int cfq_bio_sync(struct bio *bio) 257{ 258 if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO)) 259 return 1; 260 261 return 0; 262} 263 264/* 265 * scheduler run of queue, if there are requests pending and no one in the 266 * driver that will restart queueing 267 */ 268static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 269{ 270 if (cfqd->busy_queues) { 271 cfq_log(cfqd, "schedule dispatch"); 272 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 273 } 274} 275 276static int cfq_queue_empty(struct request_queue *q) 277{ 278 struct cfq_data *cfqd = q->elevator->elevator_data; 279 280 return !cfqd->busy_queues; 281} 282 283/* 284 * Scale schedule slice based on io priority. Use the sync time slice only 285 * if a queue is marked sync and has sync io queued. A sync queue with async 286 * io only, should not get full sync slice length. 287 */ 288static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync, 289 unsigned short prio) 290{ 291 const int base_slice = cfqd->cfq_slice[sync]; 292 293 WARN_ON(prio >= IOPRIO_BE_NR); 294 295 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 296} 297 298static inline int 299cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 300{ 301 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 302} 303 304static inline void 305cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 306{ 307 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 308 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 309} 310 311/* 312 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 313 * isn't valid until the first request from the dispatch is activated 314 * and the slice time set. 
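 * As long as cfq_cfqq_slice_new() is set, the queue is never reported as
 * having used up its slice, whatever ->slice_end happens to contain.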
315 */ 316static inline int cfq_slice_used(struct cfq_queue *cfqq) 317{ 318 if (cfq_cfqq_slice_new(cfqq)) 319 return 0; 320 if (time_before(jiffies, cfqq->slice_end)) 321 return 0; 322 323 return 1; 324} 325 326/* 327 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 328 * We choose the request that is closest to the head right now. Distance 329 * behind the head is penalized and only allowed to a certain extent. 330 */ 331static struct request * 332cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 333{ 334 sector_t last, s1, s2, d1 = 0, d2 = 0; 335 unsigned long back_max; 336#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 337#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 338 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 339 340 if (rq1 == NULL || rq1 == rq2) 341 return rq2; 342 if (rq2 == NULL) 343 return rq1; 344 345 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 346 return rq1; 347 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 348 return rq2; 349 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 350 return rq1; 351 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 352 return rq2; 353 354 s1 = blk_rq_pos(rq1); 355 s2 = blk_rq_pos(rq2); 356 357 last = cfqd->last_position; 358 359 /* 360 * by definition, 1KiB is 2 sectors 361 */ 362 back_max = cfqd->cfq_back_max * 2; 363 364 /* 365 * Strict one way elevator _except_ in the case where we allow 366 * short backward seeks which are biased as twice the cost of a 367 * similar forward seek. 368 */ 369 if (s1 >= last) 370 d1 = s1 - last; 371 else if (s1 + back_max >= last) 372 d1 = (last - s1) * cfqd->cfq_back_penalty; 373 else 374 wrap |= CFQ_RQ1_WRAP; 375 376 if (s2 >= last) 377 d2 = s2 - last; 378 else if (s2 + back_max >= last) 379 d2 = (last - s2) * cfqd->cfq_back_penalty; 380 else 381 wrap |= CFQ_RQ2_WRAP; 382 383 /* Found required data */ 384 385 /* 386 * By doing switch() on the bit mask "wrap" we avoid having to 387 * check two variables for all permutations: --> faster! 388 */ 389 switch (wrap) { 390 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 391 if (d1 < d2) 392 return rq1; 393 else if (d2 < d1) 394 return rq2; 395 else { 396 if (s1 >= s2) 397 return rq1; 398 else 399 return rq2; 400 } 401 402 case CFQ_RQ2_WRAP: 403 return rq1; 404 case CFQ_RQ1_WRAP: 405 return rq2; 406 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 407 default: 408 /* 409 * Since both rqs are wrapped, 410 * start with the one that's further behind head 411 * (--> only *one* back seek required), 412 * since back seek takes more time than forward. 
413 */ 414 if (s1 <= s2) 415 return rq1; 416 else 417 return rq2; 418 } 419} 420 421/* 422 * The below is leftmost cache rbtree addon 423 */ 424static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 425{ 426 if (!root->left) 427 root->left = rb_first(&root->rb); 428 429 if (root->left) 430 return rb_entry(root->left, struct cfq_queue, rb_node); 431 432 return NULL; 433} 434 435static void rb_erase_init(struct rb_node *n, struct rb_root *root) 436{ 437 rb_erase(n, root); 438 RB_CLEAR_NODE(n); 439} 440 441static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 442{ 443 if (root->left == n) 444 root->left = NULL; 445 rb_erase_init(n, &root->rb); 446} 447 448/* 449 * would be nice to take fifo expire time into account as well 450 */ 451static struct request * 452cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 453 struct request *last) 454{ 455 struct rb_node *rbnext = rb_next(&last->rb_node); 456 struct rb_node *rbprev = rb_prev(&last->rb_node); 457 struct request *next = NULL, *prev = NULL; 458 459 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 460 461 if (rbprev) 462 prev = rb_entry_rq(rbprev); 463 464 if (rbnext) 465 next = rb_entry_rq(rbnext); 466 else { 467 rbnext = rb_first(&cfqq->sort_list); 468 if (rbnext && rbnext != &last->rb_node) 469 next = rb_entry_rq(rbnext); 470 } 471 472 return cfq_choose_req(cfqd, next, prev); 473} 474 475static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 476 struct cfq_queue *cfqq) 477{ 478 /* 479 * just an approximation, should be ok. 480 */ 481 return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - 482 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 483} 484 485/* 486 * The cfqd->service_tree holds all pending cfq_queue's that have 487 * requests waiting to be processed. It is sorted in the order that 488 * we will service the queues. 489 */ 490static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 491 int add_front) 492{ 493 struct rb_node **p, *parent; 494 struct cfq_queue *__cfqq; 495 unsigned long rb_key; 496 int left; 497 498 if (cfq_class_idle(cfqq)) { 499 rb_key = CFQ_IDLE_DELAY; 500 parent = rb_last(&cfqd->service_tree.rb); 501 if (parent && parent != &cfqq->rb_node) { 502 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 503 rb_key += __cfqq->rb_key; 504 } else 505 rb_key += jiffies; 506 } else if (!add_front) { 507 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 508 rb_key += cfqq->slice_resid; 509 cfqq->slice_resid = 0; 510 } else 511 rb_key = 0; 512 513 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 514 /* 515 * same position, nothing more to do 516 */ 517 if (rb_key == cfqq->rb_key) 518 return; 519 520 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 521 } 522 523 left = 1; 524 parent = NULL; 525 p = &cfqd->service_tree.rb.rb_node; 526 while (*p) { 527 struct rb_node **n; 528 529 parent = *p; 530 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 531 532 /* 533 * sort RT queues first, we always want to give 534 * preference to them. IDLE queues goes to the back. 535 * after that, sort on the next service time. 
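 * The resulting order is: all RT queues, then BE, then IDLE, and within
 * a class ascending rb_key (earliest service time first).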
536 */ 537 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) 538 n = &(*p)->rb_left; 539 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) 540 n = &(*p)->rb_right; 541 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) 542 n = &(*p)->rb_left; 543 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) 544 n = &(*p)->rb_right; 545 else if (rb_key < __cfqq->rb_key) 546 n = &(*p)->rb_left; 547 else 548 n = &(*p)->rb_right; 549 550 if (n == &(*p)->rb_right) 551 left = 0; 552 553 p = n; 554 } 555 556 if (left) 557 cfqd->service_tree.left = &cfqq->rb_node; 558 559 cfqq->rb_key = rb_key; 560 rb_link_node(&cfqq->rb_node, parent, p); 561 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 562} 563 564static struct cfq_queue * 565cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 566 sector_t sector, struct rb_node **ret_parent, 567 struct rb_node ***rb_link) 568{ 569 struct rb_node **p, *parent; 570 struct cfq_queue *cfqq = NULL; 571 572 parent = NULL; 573 p = &root->rb_node; 574 while (*p) { 575 struct rb_node **n; 576 577 parent = *p; 578 cfqq = rb_entry(parent, struct cfq_queue, p_node); 579 580 /* 581 * Sort strictly based on sector. Smallest to the left, 582 * largest to the right. 583 */ 584 if (sector > blk_rq_pos(cfqq->next_rq)) 585 n = &(*p)->rb_right; 586 else if (sector < blk_rq_pos(cfqq->next_rq)) 587 n = &(*p)->rb_left; 588 else 589 break; 590 p = n; 591 cfqq = NULL; 592 } 593 594 *ret_parent = parent; 595 if (rb_link) 596 *rb_link = p; 597 return cfqq; 598} 599 600static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 601{ 602 struct rb_node **p, *parent; 603 struct cfq_queue *__cfqq; 604 605 if (cfqq->p_root) { 606 rb_erase(&cfqq->p_node, cfqq->p_root); 607 cfqq->p_root = NULL; 608 } 609 610 if (cfq_class_idle(cfqq)) 611 return; 612 if (!cfqq->next_rq) 613 return; 614 615 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 616 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 617 blk_rq_pos(cfqq->next_rq), &parent, &p); 618 if (!__cfqq) { 619 rb_link_node(&cfqq->p_node, parent, p); 620 rb_insert_color(&cfqq->p_node, cfqq->p_root); 621 } else 622 cfqq->p_root = NULL; 623} 624 625/* 626 * Update cfqq's position in the service tree. 627 */ 628static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 629{ 630 /* 631 * Resorting requires the cfqq to be on the RR list already. 632 */ 633 if (cfq_cfqq_on_rr(cfqq)) { 634 cfq_service_tree_add(cfqd, cfqq, 0); 635 cfq_prio_tree_add(cfqd, cfqq); 636 } 637} 638 639/* 640 * add to busy list of queues for service, trying to be fair in ordering 641 * the pending list according to last request service 642 */ 643static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 644{ 645 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 646 BUG_ON(cfq_cfqq_on_rr(cfqq)); 647 cfq_mark_cfqq_on_rr(cfqq); 648 cfqd->busy_queues++; 649 650 cfq_resort_rr_list(cfqd, cfqq); 651} 652 653/* 654 * Called when the cfqq no longer has requests pending, remove it from 655 * the service tree. 
656 */ 657static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 658{ 659 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 660 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 661 cfq_clear_cfqq_on_rr(cfqq); 662 663 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 664 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 665 if (cfqq->p_root) { 666 rb_erase(&cfqq->p_node, cfqq->p_root); 667 cfqq->p_root = NULL; 668 } 669 670 BUG_ON(!cfqd->busy_queues); 671 cfqd->busy_queues--; 672} 673 674/* 675 * rb tree support functions 676 */ 677static void cfq_del_rq_rb(struct request *rq) 678{ 679 struct cfq_queue *cfqq = RQ_CFQQ(rq); 680 struct cfq_data *cfqd = cfqq->cfqd; 681 const int sync = rq_is_sync(rq); 682 683 BUG_ON(!cfqq->queued[sync]); 684 cfqq->queued[sync]--; 685 686 elv_rb_del(&cfqq->sort_list, rq); 687 688 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 689 cfq_del_cfqq_rr(cfqd, cfqq); 690} 691 692static void cfq_add_rq_rb(struct request *rq) 693{ 694 struct cfq_queue *cfqq = RQ_CFQQ(rq); 695 struct cfq_data *cfqd = cfqq->cfqd; 696 struct request *__alias, *prev; 697 698 cfqq->queued[rq_is_sync(rq)]++; 699 700 /* 701 * looks a little odd, but the first insert might return an alias. 702 * if that happens, put the alias on the dispatch list 703 */ 704 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 705 cfq_dispatch_insert(cfqd->queue, __alias); 706 707 if (!cfq_cfqq_on_rr(cfqq)) 708 cfq_add_cfqq_rr(cfqd, cfqq); 709 710 /* 711 * check if this request is a better next-serve candidate 712 */ 713 prev = cfqq->next_rq; 714 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 715 716 /* 717 * adjust priority tree position, if ->next_rq changes 718 */ 719 if (prev != cfqq->next_rq) 720 cfq_prio_tree_add(cfqd, cfqq); 721 722 BUG_ON(!cfqq->next_rq); 723} 724 725static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 726{ 727 elv_rb_del(&cfqq->sort_list, rq); 728 cfqq->queued[rq_is_sync(rq)]--; 729 cfq_add_rq_rb(rq); 730} 731 732static struct request * 733cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 734{ 735 struct task_struct *tsk = current; 736 struct cfq_io_context *cic; 737 struct cfq_queue *cfqq; 738 739 cic = cfq_cic_lookup(cfqd, tsk->io_context); 740 if (!cic) 741 return NULL; 742 743 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 744 if (cfqq) { 745 sector_t sector = bio->bi_sector + bio_sectors(bio); 746 747 return elv_rb_find(&cfqq->sort_list, sector); 748 } 749 750 return NULL; 751} 752 753static void cfq_activate_request(struct request_queue *q, struct request *rq) 754{ 755 struct cfq_data *cfqd = q->elevator->elevator_data; 756 757 cfqd->rq_in_driver[rq_is_sync(rq)]++; 758 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 759 rq_in_driver(cfqd)); 760 761 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 762} 763 764static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 765{ 766 struct cfq_data *cfqd = q->elevator->elevator_data; 767 const int sync = rq_is_sync(rq); 768 769 WARN_ON(!cfqd->rq_in_driver[sync]); 770 cfqd->rq_in_driver[sync]--; 771 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 772 rq_in_driver(cfqd)); 773} 774 775static void cfq_remove_request(struct request *rq) 776{ 777 struct cfq_queue *cfqq = RQ_CFQQ(rq); 778 779 if (cfqq->next_rq == rq) 780 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 781 782 list_del_init(&rq->queuelist); 783 cfq_del_rq_rb(rq); 784 785 cfqq->cfqd->rq_queued--; 786 if (rq_is_meta(rq)) { 787 WARN_ON(!cfqq->meta_pending); 788 cfqq->meta_pending--; 789 } 
790} 791 792static int cfq_merge(struct request_queue *q, struct request **req, 793 struct bio *bio) 794{ 795 struct cfq_data *cfqd = q->elevator->elevator_data; 796 struct request *__rq; 797 798 __rq = cfq_find_rq_fmerge(cfqd, bio); 799 if (__rq && elv_rq_merge_ok(__rq, bio)) { 800 *req = __rq; 801 return ELEVATOR_FRONT_MERGE; 802 } 803 804 return ELEVATOR_NO_MERGE; 805} 806 807static void cfq_merged_request(struct request_queue *q, struct request *req, 808 int type) 809{ 810 if (type == ELEVATOR_FRONT_MERGE) { 811 struct cfq_queue *cfqq = RQ_CFQQ(req); 812 813 cfq_reposition_rq_rb(cfqq, req); 814 } 815} 816 817static void 818cfq_merged_requests(struct request_queue *q, struct request *rq, 819 struct request *next) 820{ 821 /* 822 * reposition in fifo if next is older than rq 823 */ 824 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 825 time_before(next->start_time, rq->start_time)) 826 list_move(&rq->queuelist, &next->queuelist); 827 828 cfq_remove_request(next); 829} 830 831static int cfq_allow_merge(struct request_queue *q, struct request *rq, 832 struct bio *bio) 833{ 834 struct cfq_data *cfqd = q->elevator->elevator_data; 835 struct cfq_io_context *cic; 836 struct cfq_queue *cfqq; 837 838 /* 839 * Disallow merge of a sync bio into an async request. 840 */ 841 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 842 return 0; 843 844 /* 845 * Lookup the cfqq that this bio will be queued with. Allow 846 * merge only if rq is queued there. 847 */ 848 cic = cfq_cic_lookup(cfqd, current->io_context); 849 if (!cic) 850 return 0; 851 852 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 853 if (cfqq == RQ_CFQQ(rq)) 854 return 1; 855 856 return 0; 857} 858 859static void __cfq_set_active_queue(struct cfq_data *cfqd, 860 struct cfq_queue *cfqq) 861{ 862 if (cfqq) { 863 cfq_log_cfqq(cfqd, cfqq, "set_active"); 864 cfqq->slice_end = 0; 865 cfqq->slice_dispatch = 0; 866 867 cfq_clear_cfqq_wait_request(cfqq); 868 cfq_clear_cfqq_must_dispatch(cfqq); 869 cfq_clear_cfqq_must_alloc_slice(cfqq); 870 cfq_clear_cfqq_fifo_expire(cfqq); 871 cfq_mark_cfqq_slice_new(cfqq); 872 873 del_timer(&cfqd->idle_slice_timer); 874 } 875 876 cfqd->active_queue = cfqq; 877} 878 879/* 880 * current cfqq expired its slice (or was too idle), select new one 881 */ 882static void 883__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 884 int timed_out) 885{ 886 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 887 888 if (cfq_cfqq_wait_request(cfqq)) 889 del_timer(&cfqd->idle_slice_timer); 890 891 cfq_clear_cfqq_wait_request(cfqq); 892 893 /* 894 * store what was left of this slice, if the queue idled/timed out 895 */ 896 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 897 cfqq->slice_resid = cfqq->slice_end - jiffies; 898 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 899 } 900 901 cfq_resort_rr_list(cfqd, cfqq); 902 903 if (cfqq == cfqd->active_queue) 904 cfqd->active_queue = NULL; 905 906 if (cfqd->active_cic) { 907 put_io_context(cfqd->active_cic->ioc); 908 cfqd->active_cic = NULL; 909 } 910} 911 912static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) 913{ 914 struct cfq_queue *cfqq = cfqd->active_queue; 915 916 if (cfqq) 917 __cfq_slice_expired(cfqd, cfqq, timed_out); 918} 919 920/* 921 * Get next queue for service. Unless we have a queue preemption, 922 * we'll simply select the first cfqq in the service tree. 
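 * "First" means the leftmost queue in the rbtree, i.e. the one with the
 * smallest rb_key; cfq_rb_first() returns it from the cached left pointer.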
923 */ 924static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 925{ 926 if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) 927 return NULL; 928 929 return cfq_rb_first(&cfqd->service_tree); 930} 931 932/* 933 * Get and set a new active queue for service. 934 */ 935static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 936 struct cfq_queue *cfqq) 937{ 938 if (!cfqq) { 939 cfqq = cfq_get_next_queue(cfqd); 940 if (cfqq) 941 cfq_clear_cfqq_coop(cfqq); 942 } 943 944 __cfq_set_active_queue(cfqd, cfqq); 945 return cfqq; 946} 947 948static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 949 struct request *rq) 950{ 951 if (blk_rq_pos(rq) >= cfqd->last_position) 952 return blk_rq_pos(rq) - cfqd->last_position; 953 else 954 return cfqd->last_position - blk_rq_pos(rq); 955} 956 957#define CIC_SEEK_THR 8 * 1024 958#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) 959 960static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) 961{ 962 struct cfq_io_context *cic = cfqd->active_cic; 963 sector_t sdist = cic->seek_mean; 964 965 if (!sample_valid(cic->seek_samples)) 966 sdist = CIC_SEEK_THR; 967 968 return cfq_dist_from_last(cfqd, rq) <= sdist; 969} 970 971static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 972 struct cfq_queue *cur_cfqq) 973{ 974 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 975 struct rb_node *parent, *node; 976 struct cfq_queue *__cfqq; 977 sector_t sector = cfqd->last_position; 978 979 if (RB_EMPTY_ROOT(root)) 980 return NULL; 981 982 /* 983 * First, if we find a request starting at the end of the last 984 * request, choose it. 985 */ 986 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 987 if (__cfqq) 988 return __cfqq; 989 990 /* 991 * If the exact sector wasn't found, the parent of the NULL leaf 992 * will contain the closest sector. 993 */ 994 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 995 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 996 return __cfqq; 997 998 if (blk_rq_pos(__cfqq->next_rq) < sector) 999 node = rb_next(&__cfqq->p_node); 1000 else 1001 node = rb_prev(&__cfqq->p_node); 1002 if (!node) 1003 return NULL; 1004 1005 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1006 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1007 return __cfqq; 1008 1009 return NULL; 1010} 1011 1012/* 1013 * cfqd - obvious 1014 * cur_cfqq - passed in so that we don't decide that the current queue is 1015 * closely cooperating with itself. 1016 * 1017 * So, basically we're assuming that that cur_cfqq has dispatched at least 1018 * one request, and that cfqd->last_position reflects a position on the disk 1019 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 1020 * assumption. 1021 */ 1022static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1023 struct cfq_queue *cur_cfqq, 1024 int probe) 1025{ 1026 struct cfq_queue *cfqq; 1027 1028 /* 1029 * A valid cfq_io_context is necessary to compare requests against 1030 * the seek_mean of the current cfqq. 1031 */ 1032 if (!cfqd->active_cic) 1033 return NULL; 1034 1035 /* 1036 * We should notice if some of the queues are cooperating, eg 1037 * working closely on the same area of the disk. In that case, 1038 * we can group them together and don't waste time idling. 
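 * cfqq_close() does the lookup: it searches the per-priority rbtree
 * (sorted by next_rq position) around cfqd->last_position and only
 * accepts a queue whose next request lies within the mean seek
 * distance, see cfq_rq_close().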
1039 */ 1040 cfqq = cfqq_close(cfqd, cur_cfqq); 1041 if (!cfqq) 1042 return NULL; 1043 1044 if (cfq_cfqq_coop(cfqq)) 1045 return NULL; 1046 1047 if (!probe) 1048 cfq_mark_cfqq_coop(cfqq); 1049 return cfqq; 1050} 1051 1052static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1053{ 1054 struct cfq_queue *cfqq = cfqd->active_queue; 1055 struct cfq_io_context *cic; 1056 unsigned long sl; 1057 1058 /* 1059 * SSD device without seek penalty, disable idling. But only do so 1060 * for devices that support queuing, otherwise we still have a problem 1061 * with sync vs async workloads. 1062 */ 1063 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1064 return; 1065 1066 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1067 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1068 1069 /* 1070 * idle is disabled, either manually or by past process history 1071 */ 1072 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 1073 return; 1074 1075 /* 1076 * still requests with the driver, don't idle 1077 */ 1078 if (rq_in_driver(cfqd)) 1079 return; 1080 1081 /* 1082 * task has exited, don't wait 1083 */ 1084 cic = cfqd->active_cic; 1085 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1086 return; 1087 1088 cfq_mark_cfqq_wait_request(cfqq); 1089 1090 /* 1091 * we don't want to idle for seeks, but we do want to allow 1092 * fair distribution of slice time for a process doing back-to-back 1093 * seeks. so allow a little bit of time for him to submit a new rq 1094 */ 1095 sl = cfqd->cfq_slice_idle; 1096 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 1097 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1098 1099 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1100 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1101} 1102 1103/* 1104 * Move request from internal lists to the request queue dispatch list. 1105 */ 1106static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1107{ 1108 struct cfq_data *cfqd = q->elevator->elevator_data; 1109 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1110 1111 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1112 1113 cfq_remove_request(rq); 1114 cfqq->dispatched++; 1115 elv_dispatch_sort(q, rq); 1116 1117 if (cfq_cfqq_sync(cfqq)) 1118 cfqd->sync_flight++; 1119} 1120 1121/* 1122 * return expired entry, or NULL to just start from scratch in rbtree 1123 */ 1124static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1125{ 1126 struct cfq_data *cfqd = cfqq->cfqd; 1127 struct request *rq; 1128 int fifo; 1129 1130 if (cfq_cfqq_fifo_expire(cfqq)) 1131 return NULL; 1132 1133 cfq_mark_cfqq_fifo_expire(cfqq); 1134 1135 if (list_empty(&cfqq->fifo)) 1136 return NULL; 1137 1138 fifo = cfq_cfqq_sync(cfqq); 1139 rq = rq_entry_fifo(cfqq->fifo.next); 1140 1141 if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) 1142 rq = NULL; 1143 1144 cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq); 1145 return rq; 1146} 1147 1148static inline int 1149cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1150{ 1151 const int base_rq = cfqd->cfq_slice_async_rq; 1152 1153 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1154 1155 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1156} 1157 1158/* 1159 * Select a queue for service. If we have a current active queue, 1160 * check whether to continue servicing it, or retrieve and set a new one. 
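 * The active queue is kept while its slice has time left and it still
 * has requests in its sort_list. Once it runs dry we either keep idling
 * for it, switch to a close cooperator, or expire it and take the
 * leftmost queue from the service tree.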
1161 */ 1162static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1163{ 1164 struct cfq_queue *cfqq, *new_cfqq = NULL; 1165 1166 cfqq = cfqd->active_queue; 1167 if (!cfqq) 1168 goto new_queue; 1169 1170 /* 1171 * The active queue has run out of time, expire it and select new. 1172 */ 1173 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 1174 goto expire; 1175 1176 /* 1177 * The active queue has requests and isn't expired, allow it to 1178 * dispatch. 1179 */ 1180 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 1181 goto keep_queue; 1182 1183 /* 1184 * If another queue has a request waiting within our mean seek 1185 * distance, let it run. The expire code will check for close 1186 * cooperators and put the close queue at the front of the service 1187 * tree. 1188 */ 1189 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 1190 if (new_cfqq) 1191 goto expire; 1192 1193 /* 1194 * No requests pending. If the active queue still has requests in 1195 * flight or is idling for a new request, allow either of these 1196 * conditions to happen (or time out) before selecting a new queue. 1197 */ 1198 if (timer_pending(&cfqd->idle_slice_timer) || 1199 (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { 1200 cfqq = NULL; 1201 goto keep_queue; 1202 } 1203 1204expire: 1205 cfq_slice_expired(cfqd, 0); 1206new_queue: 1207 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 1208keep_queue: 1209 return cfqq; 1210} 1211 1212static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 1213{ 1214 int dispatched = 0; 1215 1216 while (cfqq->next_rq) { 1217 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 1218 dispatched++; 1219 } 1220 1221 BUG_ON(!list_empty(&cfqq->fifo)); 1222 return dispatched; 1223} 1224 1225/* 1226 * Drain our current requests. Used for barriers and when switching 1227 * io schedulers on-the-fly. 1228 */ 1229static int cfq_forced_dispatch(struct cfq_data *cfqd) 1230{ 1231 struct cfq_queue *cfqq; 1232 int dispatched = 0; 1233 1234 while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) 1235 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 1236 1237 cfq_slice_expired(cfqd, 0); 1238 1239 BUG_ON(cfqd->busy_queues); 1240 1241 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 1242 return dispatched; 1243} 1244 1245/* 1246 * Dispatch a request from cfqq, moving them to the request queue 1247 * dispatch list. 
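 * The FIFO is checked first, so a request that has exceeded its
 * cfq_fifo_expire deadline is serviced before the sector-sorted ->next_rq.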
1248 */ 1249static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1250{ 1251 struct request *rq; 1252 1253 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 1254 1255 /* 1256 * follow expired path, else get first next available 1257 */ 1258 rq = cfq_check_fifo(cfqq); 1259 if (!rq) 1260 rq = cfqq->next_rq; 1261 1262 /* 1263 * insert request into driver dispatch list 1264 */ 1265 cfq_dispatch_insert(cfqd->queue, rq); 1266 1267 if (!cfqd->active_cic) { 1268 struct cfq_io_context *cic = RQ_CIC(rq); 1269 1270 atomic_long_inc(&cic->ioc->refcount); 1271 cfqd->active_cic = cic; 1272 } 1273} 1274 1275/* 1276 * Find the cfqq that we need to service and move a request from that to the 1277 * dispatch list 1278 */ 1279static int cfq_dispatch_requests(struct request_queue *q, int force) 1280{ 1281 struct cfq_data *cfqd = q->elevator->elevator_data; 1282 struct cfq_queue *cfqq; 1283 unsigned int max_dispatch; 1284 1285 if (!cfqd->busy_queues) 1286 return 0; 1287 1288 if (unlikely(force)) 1289 return cfq_forced_dispatch(cfqd); 1290 1291 cfqq = cfq_select_queue(cfqd); 1292 if (!cfqq) 1293 return 0; 1294 1295 /* 1296 * Drain async requests before we start sync IO 1297 */ 1298 if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 1299 return 0; 1300 1301 /* 1302 * If this is an async queue and we have sync IO in flight, let it wait 1303 */ 1304 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 1305 return 0; 1306 1307 max_dispatch = cfqd->cfq_quantum; 1308 if (cfq_class_idle(cfqq)) 1309 max_dispatch = 1; 1310 1311 /* 1312 * Does this cfqq already have too much IO in flight? 1313 */ 1314 if (cfqq->dispatched >= max_dispatch) { 1315 /* 1316 * idle queue must always only have a single IO in flight 1317 */ 1318 if (cfq_class_idle(cfqq)) 1319 return 0; 1320 1321 /* 1322 * We have other queues, don't allow more IO from this one 1323 */ 1324 if (cfqd->busy_queues > 1) 1325 return 0; 1326 1327 /* 1328 * we are the only queue, allow up to 4 times of 'quantum' 1329 */ 1330 if (cfqq->dispatched >= 4 * max_dispatch) 1331 return 0; 1332 } 1333 1334 /* 1335 * Dispatch a request from this cfqq 1336 */ 1337 cfq_dispatch_request(cfqd, cfqq); 1338 cfqq->slice_dispatch++; 1339 cfq_clear_cfqq_must_dispatch(cfqq); 1340 1341 /* 1342 * expire an async queue immediately if it has used up its slice. idle 1343 * queue always expire after 1 dispatch round. 1344 */ 1345 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 1346 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 1347 cfq_class_idle(cfqq))) { 1348 cfqq->slice_end = jiffies + 1; 1349 cfq_slice_expired(cfqd, 0); 1350 } 1351 1352 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 1353 return 1; 1354} 1355 1356/* 1357 * task holds one reference to the queue, dropped when task exits. each rq 1358 * in-flight on this queue also holds a reference, dropped when rq is freed. 1359 * 1360 * queue lock must be held here. 
1361 */ 1362static void cfq_put_queue(struct cfq_queue *cfqq) 1363{ 1364 struct cfq_data *cfqd = cfqq->cfqd; 1365 1366 BUG_ON(atomic_read(&cfqq->ref) <= 0); 1367 1368 if (!atomic_dec_and_test(&cfqq->ref)) 1369 return; 1370 1371 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 1372 BUG_ON(rb_first(&cfqq->sort_list)); 1373 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 1374 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1375 1376 if (unlikely(cfqd->active_queue == cfqq)) { 1377 __cfq_slice_expired(cfqd, cfqq, 0); 1378 cfq_schedule_dispatch(cfqd); 1379 } 1380 1381 kmem_cache_free(cfq_pool, cfqq); 1382} 1383 1384/* 1385 * Must always be called with the rcu_read_lock() held 1386 */ 1387static void 1388__call_for_each_cic(struct io_context *ioc, 1389 void (*func)(struct io_context *, struct cfq_io_context *)) 1390{ 1391 struct cfq_io_context *cic; 1392 struct hlist_node *n; 1393 1394 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 1395 func(ioc, cic); 1396} 1397 1398/* 1399 * Call func for each cic attached to this ioc. 1400 */ 1401static void 1402call_for_each_cic(struct io_context *ioc, 1403 void (*func)(struct io_context *, struct cfq_io_context *)) 1404{ 1405 rcu_read_lock(); 1406 __call_for_each_cic(ioc, func); 1407 rcu_read_unlock(); 1408} 1409 1410static void cfq_cic_free_rcu(struct rcu_head *head) 1411{ 1412 struct cfq_io_context *cic; 1413 1414 cic = container_of(head, struct cfq_io_context, rcu_head); 1415 1416 kmem_cache_free(cfq_ioc_pool, cic); 1417 elv_ioc_count_dec(ioc_count); 1418 1419 if (ioc_gone) { 1420 /* 1421 * CFQ scheduler is exiting, grab exit lock and check 1422 * the pending io context count. If it hits zero, 1423 * complete ioc_gone and set it back to NULL 1424 */ 1425 spin_lock(&ioc_gone_lock); 1426 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 1427 complete(ioc_gone); 1428 ioc_gone = NULL; 1429 } 1430 spin_unlock(&ioc_gone_lock); 1431 } 1432} 1433 1434static void cfq_cic_free(struct cfq_io_context *cic) 1435{ 1436 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 1437} 1438 1439static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 1440{ 1441 unsigned long flags; 1442 1443 BUG_ON(!cic->dead_key); 1444 1445 spin_lock_irqsave(&ioc->lock, flags); 1446 radix_tree_delete(&ioc->radix_root, cic->dead_key); 1447 hlist_del_rcu(&cic->cic_list); 1448 spin_unlock_irqrestore(&ioc->lock, flags); 1449 1450 cfq_cic_free(cic); 1451} 1452 1453/* 1454 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 1455 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 1456 * and ->trim() which is called with the task lock held 1457 */ 1458static void cfq_free_io_context(struct io_context *ioc) 1459{ 1460 /* 1461 * ioc->refcount is zero here, or we are called from elv_unregister(), 1462 * so no more cic's are allowed to be linked into this ioc. So it 1463 * should be ok to iterate over the known list, we will see all cic's 1464 * since no new ones are added. 
1465 */ 1466 __call_for_each_cic(ioc, cic_free_func); 1467} 1468 1469static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1470{ 1471 if (unlikely(cfqq == cfqd->active_queue)) { 1472 __cfq_slice_expired(cfqd, cfqq, 0); 1473 cfq_schedule_dispatch(cfqd); 1474 } 1475 1476 cfq_put_queue(cfqq); 1477} 1478 1479static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 1480 struct cfq_io_context *cic) 1481{ 1482 struct io_context *ioc = cic->ioc; 1483 1484 list_del_init(&cic->queue_list); 1485 1486 /* 1487 * Make sure key == NULL is seen for dead queues 1488 */ 1489 smp_wmb(); 1490 cic->dead_key = (unsigned long) cic->key; 1491 cic->key = NULL; 1492 1493 if (ioc->ioc_data == cic) 1494 rcu_assign_pointer(ioc->ioc_data, NULL); 1495 1496 if (cic->cfqq[BLK_RW_ASYNC]) { 1497 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 1498 cic->cfqq[BLK_RW_ASYNC] = NULL; 1499 } 1500 1501 if (cic->cfqq[BLK_RW_SYNC]) { 1502 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 1503 cic->cfqq[BLK_RW_SYNC] = NULL; 1504 } 1505} 1506 1507static void cfq_exit_single_io_context(struct io_context *ioc, 1508 struct cfq_io_context *cic) 1509{ 1510 struct cfq_data *cfqd = cic->key; 1511 1512 if (cfqd) { 1513 struct request_queue *q = cfqd->queue; 1514 unsigned long flags; 1515 1516 spin_lock_irqsave(q->queue_lock, flags); 1517 1518 /* 1519 * Ensure we get a fresh copy of the ->key to prevent 1520 * race between exiting task and queue 1521 */ 1522 smp_read_barrier_depends(); 1523 if (cic->key) 1524 __cfq_exit_single_io_context(cfqd, cic); 1525 1526 spin_unlock_irqrestore(q->queue_lock, flags); 1527 } 1528} 1529 1530/* 1531 * The process that ioc belongs to has exited, we need to clean up 1532 * and put the internal structures we have that belongs to that process. 1533 */ 1534static void cfq_exit_io_context(struct io_context *ioc) 1535{ 1536 call_for_each_cic(ioc, cfq_exit_single_io_context); 1537} 1538 1539static struct cfq_io_context * 1540cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1541{ 1542 struct cfq_io_context *cic; 1543 1544 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 1545 cfqd->queue->node); 1546 if (cic) { 1547 cic->last_end_request = jiffies; 1548 INIT_LIST_HEAD(&cic->queue_list); 1549 INIT_HLIST_NODE(&cic->cic_list); 1550 cic->dtor = cfq_free_io_context; 1551 cic->exit = cfq_exit_io_context; 1552 elv_ioc_count_inc(ioc_count); 1553 } 1554 1555 return cic; 1556} 1557 1558static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 1559{ 1560 struct task_struct *tsk = current; 1561 int ioprio_class; 1562 1563 if (!cfq_cfqq_prio_changed(cfqq)) 1564 return; 1565 1566 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 1567 switch (ioprio_class) { 1568 default: 1569 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 1570 case IOPRIO_CLASS_NONE: 1571 /* 1572 * no prio set, inherit CPU scheduling settings 1573 */ 1574 cfqq->ioprio = task_nice_ioprio(tsk); 1575 cfqq->ioprio_class = task_nice_ioclass(tsk); 1576 break; 1577 case IOPRIO_CLASS_RT: 1578 cfqq->ioprio = task_ioprio(ioc); 1579 cfqq->ioprio_class = IOPRIO_CLASS_RT; 1580 break; 1581 case IOPRIO_CLASS_BE: 1582 cfqq->ioprio = task_ioprio(ioc); 1583 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1584 break; 1585 case IOPRIO_CLASS_IDLE: 1586 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 1587 cfqq->ioprio = 7; 1588 cfq_clear_cfqq_idle_window(cfqq); 1589 break; 1590 } 1591 1592 /* 1593 * keep track of original prio settings in case we have to temporarily 1594 * elevate the priority of this queue 1595 */ 1596 cfqq->org_ioprio 
= cfqq->ioprio; 1597 cfqq->org_ioprio_class = cfqq->ioprio_class; 1598 cfq_clear_cfqq_prio_changed(cfqq); 1599} 1600 1601static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 1602{ 1603 struct cfq_data *cfqd = cic->key; 1604 struct cfq_queue *cfqq; 1605 unsigned long flags; 1606 1607 if (unlikely(!cfqd)) 1608 return; 1609 1610 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1611 1612 cfqq = cic->cfqq[BLK_RW_ASYNC]; 1613 if (cfqq) { 1614 struct cfq_queue *new_cfqq; 1615 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 1616 GFP_ATOMIC); 1617 if (new_cfqq) { 1618 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 1619 cfq_put_queue(cfqq); 1620 } 1621 } 1622 1623 cfqq = cic->cfqq[BLK_RW_SYNC]; 1624 if (cfqq) 1625 cfq_mark_cfqq_prio_changed(cfqq); 1626 1627 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1628} 1629 1630static void cfq_ioc_set_ioprio(struct io_context *ioc) 1631{ 1632 call_for_each_cic(ioc, changed_ioprio); 1633 ioc->ioprio_changed = 0; 1634} 1635 1636static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1637 pid_t pid, int is_sync) 1638{ 1639 RB_CLEAR_NODE(&cfqq->rb_node); 1640 RB_CLEAR_NODE(&cfqq->p_node); 1641 INIT_LIST_HEAD(&cfqq->fifo); 1642 1643 atomic_set(&cfqq->ref, 0); 1644 cfqq->cfqd = cfqd; 1645 1646 cfq_mark_cfqq_prio_changed(cfqq); 1647 1648 if (is_sync) { 1649 if (!cfq_class_idle(cfqq)) 1650 cfq_mark_cfqq_idle_window(cfqq); 1651 cfq_mark_cfqq_sync(cfqq); 1652 } 1653 cfqq->pid = pid; 1654} 1655 1656static struct cfq_queue * 1657cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, 1658 struct io_context *ioc, gfp_t gfp_mask) 1659{ 1660 struct cfq_queue *cfqq, *new_cfqq = NULL; 1661 struct cfq_io_context *cic; 1662 1663retry: 1664 cic = cfq_cic_lookup(cfqd, ioc); 1665 /* cic always exists here */ 1666 cfqq = cic_to_cfqq(cic, is_sync); 1667 1668 /* 1669 * Always try a new alloc if we fell back to the OOM cfqq 1670 * originally, since it should just be a temporary situation. 
1671 */ 1672 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 1673 cfqq = NULL; 1674 if (new_cfqq) { 1675 cfqq = new_cfqq; 1676 new_cfqq = NULL; 1677 } else if (gfp_mask & __GFP_WAIT) { 1678 spin_unlock_irq(cfqd->queue->queue_lock); 1679 new_cfqq = kmem_cache_alloc_node(cfq_pool, 1680 gfp_mask | __GFP_ZERO, 1681 cfqd->queue->node); 1682 spin_lock_irq(cfqd->queue->queue_lock); 1683 if (new_cfqq) 1684 goto retry; 1685 } else { 1686 cfqq = kmem_cache_alloc_node(cfq_pool, 1687 gfp_mask | __GFP_ZERO, 1688 cfqd->queue->node); 1689 } 1690 1691 if (cfqq) { 1692 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 1693 cfq_init_prio_data(cfqq, ioc); 1694 cfq_log_cfqq(cfqd, cfqq, "alloced"); 1695 } else 1696 cfqq = &cfqd->oom_cfqq; 1697 } 1698 1699 if (new_cfqq) 1700 kmem_cache_free(cfq_pool, new_cfqq); 1701 1702 return cfqq; 1703} 1704 1705static struct cfq_queue ** 1706cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 1707{ 1708 switch (ioprio_class) { 1709 case IOPRIO_CLASS_RT: 1710 return &cfqd->async_cfqq[0][ioprio]; 1711 case IOPRIO_CLASS_BE: 1712 return &cfqd->async_cfqq[1][ioprio]; 1713 case IOPRIO_CLASS_IDLE: 1714 return &cfqd->async_idle_cfqq; 1715 default: 1716 BUG(); 1717 } 1718} 1719 1720static struct cfq_queue * 1721cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, 1722 gfp_t gfp_mask) 1723{ 1724 const int ioprio = task_ioprio(ioc); 1725 const int ioprio_class = task_ioprio_class(ioc); 1726 struct cfq_queue **async_cfqq = NULL; 1727 struct cfq_queue *cfqq = NULL; 1728 1729 if (!is_sync) { 1730 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 1731 cfqq = *async_cfqq; 1732 } 1733 1734 if (!cfqq) 1735 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 1736 1737 /* 1738 * pin the queue now that it's allocated, scheduler exit will prune it 1739 */ 1740 if (!is_sync && !(*async_cfqq)) { 1741 atomic_inc(&cfqq->ref); 1742 *async_cfqq = cfqq; 1743 } 1744 1745 atomic_inc(&cfqq->ref); 1746 return cfqq; 1747} 1748 1749/* 1750 * We drop cfq io contexts lazily, so we may find a dead one. 
1751 */ 1752static void 1753cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 1754 struct cfq_io_context *cic) 1755{ 1756 unsigned long flags; 1757 1758 WARN_ON(!list_empty(&cic->queue_list)); 1759 1760 spin_lock_irqsave(&ioc->lock, flags); 1761 1762 BUG_ON(ioc->ioc_data == cic); 1763 1764 radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); 1765 hlist_del_rcu(&cic->cic_list); 1766 spin_unlock_irqrestore(&ioc->lock, flags); 1767 1768 cfq_cic_free(cic); 1769} 1770 1771static struct cfq_io_context * 1772cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 1773{ 1774 struct cfq_io_context *cic; 1775 unsigned long flags; 1776 void *k; 1777 1778 if (unlikely(!ioc)) 1779 return NULL; 1780 1781 rcu_read_lock(); 1782 1783 /* 1784 * we maintain a last-hit cache, to avoid browsing over the tree 1785 */ 1786 cic = rcu_dereference(ioc->ioc_data); 1787 if (cic && cic->key == cfqd) { 1788 rcu_read_unlock(); 1789 return cic; 1790 } 1791 1792 do { 1793 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); 1794 rcu_read_unlock(); 1795 if (!cic) 1796 break; 1797 /* ->key must be copied to avoid race with cfq_exit_queue() */ 1798 k = cic->key; 1799 if (unlikely(!k)) { 1800 cfq_drop_dead_cic(cfqd, ioc, cic); 1801 rcu_read_lock(); 1802 continue; 1803 } 1804 1805 spin_lock_irqsave(&ioc->lock, flags); 1806 rcu_assign_pointer(ioc->ioc_data, cic); 1807 spin_unlock_irqrestore(&ioc->lock, flags); 1808 break; 1809 } while (1); 1810 1811 return cic; 1812} 1813 1814/* 1815 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 1816 * the process specific cfq io context when entered from the block layer. 1817 * Also adds the cic to a per-cfqd list, used when this queue is removed. 1818 */ 1819static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 1820 struct cfq_io_context *cic, gfp_t gfp_mask) 1821{ 1822 unsigned long flags; 1823 int ret; 1824 1825 ret = radix_tree_preload(gfp_mask); 1826 if (!ret) { 1827 cic->ioc = ioc; 1828 cic->key = cfqd; 1829 1830 spin_lock_irqsave(&ioc->lock, flags); 1831 ret = radix_tree_insert(&ioc->radix_root, 1832 (unsigned long) cfqd, cic); 1833 if (!ret) 1834 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 1835 spin_unlock_irqrestore(&ioc->lock, flags); 1836 1837 radix_tree_preload_end(); 1838 1839 if (!ret) { 1840 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1841 list_add(&cic->queue_list, &cfqd->cic_list); 1842 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1843 } 1844 } 1845 1846 if (ret) 1847 printk(KERN_ERR "cfq: cic link failed!\n"); 1848 1849 return ret; 1850} 1851 1852/* 1853 * Setup general io context and cfq io context. There can be several cfq 1854 * io contexts per general io context, if this process is doing io to more 1855 * than one device managed by cfq. 
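 * Each per-device cic is stored in the io_context's radix tree, keyed by
 * the cfq_data pointer: cfq_cic_lookup() finds an existing one and
 * cfq_cic_link() inserts a freshly allocated one.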
1856 */ 1857static struct cfq_io_context * 1858cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1859{ 1860 struct io_context *ioc = NULL; 1861 struct cfq_io_context *cic; 1862 1863 might_sleep_if(gfp_mask & __GFP_WAIT); 1864 1865 ioc = get_io_context(gfp_mask, cfqd->queue->node); 1866 if (!ioc) 1867 return NULL; 1868 1869 cic = cfq_cic_lookup(cfqd, ioc); 1870 if (cic) 1871 goto out; 1872 1873 cic = cfq_alloc_io_context(cfqd, gfp_mask); 1874 if (cic == NULL) 1875 goto err; 1876 1877 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 1878 goto err_free; 1879 1880out: 1881 smp_read_barrier_depends(); 1882 if (unlikely(ioc->ioprio_changed)) 1883 cfq_ioc_set_ioprio(ioc); 1884 1885 return cic; 1886err_free: 1887 cfq_cic_free(cic); 1888err: 1889 put_io_context(ioc); 1890 return NULL; 1891} 1892 1893static void 1894cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 1895{ 1896 unsigned long elapsed = jiffies - cic->last_end_request; 1897 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 1898 1899 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 1900 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 1901 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 1902} 1903 1904static void 1905cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 1906 struct request *rq) 1907{ 1908 sector_t sdist; 1909 u64 total; 1910 1911 if (!cic->last_request_pos) 1912 sdist = 0; 1913 else if (cic->last_request_pos < blk_rq_pos(rq)) 1914 sdist = blk_rq_pos(rq) - cic->last_request_pos; 1915 else 1916 sdist = cic->last_request_pos - blk_rq_pos(rq); 1917 1918 /* 1919 * Don't allow the seek distance to get too large from the 1920 * odd fragment, pagein, etc 1921 */ 1922 if (cic->seek_samples <= 60) /* second&third seek */ 1923 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 1924 else 1925 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 1926 1927 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 1928 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 1929 total = cic->seek_total + (cic->seek_samples/2); 1930 do_div(total, cic->seek_samples); 1931 cic->seek_mean = (sector_t)total; 1932} 1933 1934/* 1935 * Disable idle window if the process thinks too long or seeks so much that 1936 * it doesn't matter 1937 */ 1938static void 1939cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1940 struct cfq_io_context *cic) 1941{ 1942 int old_idle, enable_idle; 1943 1944 /* 1945 * Don't idle for async or idle io prio class 1946 */ 1947 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 1948 return; 1949 1950 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 1951 1952 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 1953 (cfqd->hw_tag && CIC_SEEKY(cic))) 1954 enable_idle = 0; 1955 else if (sample_valid(cic->ttime_samples)) { 1956 if (cic->ttime_mean > cfqd->cfq_slice_idle) 1957 enable_idle = 0; 1958 else 1959 enable_idle = 1; 1960 } 1961 1962 if (old_idle != enable_idle) { 1963 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 1964 if (enable_idle) 1965 cfq_mark_cfqq_idle_window(cfqq); 1966 else 1967 cfq_clear_cfqq_idle_window(cfqq); 1968 } 1969} 1970 1971/* 1972 * Check if new_cfqq should preempt the currently active queue. Return 0 for 1973 * no or if we aren't sure, a 1 will cause a preempt. 
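 * Preemption is granted when the active slice is used up, when the active
 * queue is idle class, when a sync request meets an async queue, when a
 * metadata request meets a queue with no metadata pending, when an RT
 * request meets a non-RT queue, or when the new request is as close to
 * the disk head as the idling (wait_request) queue's own requests.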
1974 */ 1975static int 1976cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 1977 struct request *rq) 1978{ 1979 struct cfq_queue *cfqq; 1980 1981 cfqq = cfqd->active_queue; 1982 if (!cfqq) 1983 return 0; 1984 1985 if (cfq_slice_used(cfqq)) 1986 return 1; 1987 1988 if (cfq_class_idle(new_cfqq)) 1989 return 0; 1990 1991 if (cfq_class_idle(cfqq)) 1992 return 1; 1993 1994 /* 1995 * if the new request is sync, but the currently running queue is 1996 * not, let the sync request have priority. 1997 */ 1998 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 1999 return 1; 2000 2001 /* 2002 * So both queues are sync. Let the new request get disk time if 2003 * it's a metadata request and the current queue is doing regular IO. 2004 */ 2005 if (rq_is_meta(rq) && !cfqq->meta_pending) 2006 return 1; 2007 2008 /* 2009 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 2010 */ 2011 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 2012 return 1; 2013 2014 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 2015 return 0; 2016 2017 /* 2018 * if this request is as-good as one we would expect from the 2019 * current cfqq, let it preempt 2020 */ 2021 if (cfq_rq_close(cfqd, rq)) 2022 return 1; 2023 2024 return 0; 2025} 2026 2027/* 2028 * cfqq preempts the active queue. if we allowed preempt with no slice left, 2029 * let it have half of its nominal slice. 2030 */ 2031static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2032{ 2033 cfq_log_cfqq(cfqd, cfqq, "preempt"); 2034 cfq_slice_expired(cfqd, 1); 2035 2036 /* 2037 * Put the new queue at the front of the of the current list, 2038 * so we know that it will be selected next. 2039 */ 2040 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 2041 2042 cfq_service_tree_add(cfqd, cfqq, 1); 2043 2044 cfqq->slice_end = 0; 2045 cfq_mark_cfqq_slice_new(cfqq); 2046} 2047 2048/* 2049 * Called when a new fs request (rq) is added (to cfqq). Check if there's 2050 * something we should do about it 2051 */ 2052static void 2053cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2054 struct request *rq) 2055{ 2056 struct cfq_io_context *cic = RQ_CIC(rq); 2057 2058 cfqd->rq_queued++; 2059 if (rq_is_meta(rq)) 2060 cfqq->meta_pending++; 2061 2062 cfq_update_io_thinktime(cfqd, cic); 2063 cfq_update_io_seektime(cfqd, cic, rq); 2064 cfq_update_idle_window(cfqd, cfqq, cic); 2065 2066 cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 2067 2068 if (cfqq == cfqd->active_queue) { 2069 /* 2070 * Remember that we saw a request from this process, but 2071 * don't start queuing just yet. Otherwise we risk seeing lots 2072 * of tiny requests, because we disrupt the normal plugging 2073 * and merging. If the request is already larger than a single 2074 * page, let it rip immediately. For that case we assume that 2075 * merging is already done. Ditto for a busy system that 2076 * has other work pending, don't risk delaying until the 2077 * idle timer unplug to continue working. 
2078 */ 2079 if (cfq_cfqq_wait_request(cfqq)) { 2080 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 2081 cfqd->busy_queues > 1) { 2082 del_timer(&cfqd->idle_slice_timer); 2083 __blk_run_queue(cfqd->queue); 2084 } 2085 cfq_mark_cfqq_must_dispatch(cfqq); 2086 } 2087 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 2088 /* 2089 * not the active queue - expire current slice if it is 2090 * idle and has expired it's mean thinktime or this new queue 2091 * has some old slice time left and is of higher priority or 2092 * this new queue is RT and the current one is BE 2093 */ 2094 cfq_preempt_queue(cfqd, cfqq); 2095 __blk_run_queue(cfqd->queue); 2096 } 2097} 2098 2099static void cfq_insert_request(struct request_queue *q, struct request *rq) 2100{ 2101 struct cfq_data *cfqd = q->elevator->elevator_data; 2102 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2103 2104 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 2105 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 2106 2107 cfq_add_rq_rb(rq); 2108 2109 list_add_tail(&rq->queuelist, &cfqq->fifo); 2110 2111 cfq_rq_enqueued(cfqd, cfqq, rq); 2112} 2113 2114/* 2115 * Update hw_tag based on peak queue depth over 50 samples under 2116 * sufficient load. 2117 */ 2118static void cfq_update_hw_tag(struct cfq_data *cfqd) 2119{ 2120 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) 2121 cfqd->rq_in_driver_peak = rq_in_driver(cfqd); 2122 2123 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 2124 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 2125 return; 2126 2127 if (cfqd->hw_tag_samples++ < 50) 2128 return; 2129 2130 if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) 2131 cfqd->hw_tag = 1; 2132 else 2133 cfqd->hw_tag = 0; 2134 2135 cfqd->hw_tag_samples = 0; 2136 cfqd->rq_in_driver_peak = 0; 2137} 2138 2139static void cfq_completed_request(struct request_queue *q, struct request *rq) 2140{ 2141 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2142 struct cfq_data *cfqd = cfqq->cfqd; 2143 const int sync = rq_is_sync(rq); 2144 unsigned long now; 2145 2146 now = jiffies; 2147 cfq_log_cfqq(cfqd, cfqq, "complete"); 2148 2149 cfq_update_hw_tag(cfqd); 2150 2151 WARN_ON(!cfqd->rq_in_driver[sync]); 2152 WARN_ON(!cfqq->dispatched); 2153 cfqd->rq_in_driver[sync]--; 2154 cfqq->dispatched--; 2155 2156 if (cfq_cfqq_sync(cfqq)) 2157 cfqd->sync_flight--; 2158 2159 if (sync) 2160 RQ_CIC(rq)->last_end_request = now; 2161 2162 /* 2163 * If this is the active queue, check if it needs to be expired, 2164 * or if we want to idle in case it has no pending requests. 2165 */ 2166 if (cfqd->active_queue == cfqq) { 2167 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 2168 2169 if (cfq_cfqq_slice_new(cfqq)) { 2170 cfq_set_prio_slice(cfqd, cfqq); 2171 cfq_clear_cfqq_slice_new(cfqq); 2172 } 2173 /* 2174 * If there are no requests waiting in this queue, and 2175 * there are other queues ready to issue requests, AND 2176 * those other queues are issuing requests within our 2177 * mean seek distance, give them a chance to run instead 2178 * of idling. 2179 */ 2180 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2181 cfq_slice_expired(cfqd, 1); 2182 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 2183 sync && !rq_noidle(rq)) 2184 cfq_arm_slice_timer(cfqd); 2185 } 2186 2187 if (!rq_in_driver(cfqd)) 2188 cfq_schedule_dispatch(cfqd); 2189} 2190 2191/* 2192 * we temporarily boost lower priority queues if they are holding fs exclusive 2193 * resources. 
 * They are boosted to normal prio (CLASS_BE/4).
 */
static void cfq_prio_boost(struct cfq_queue *cfqq)
{
        if (has_fs_excl()) {
                /*
                 * boost idle prio on transactions that would lock out other
                 * users of the filesystem
                 */
                if (cfq_class_idle(cfqq))
                        cfqq->ioprio_class = IOPRIO_CLASS_BE;
                if (cfqq->ioprio > IOPRIO_NORM)
                        cfqq->ioprio = IOPRIO_NORM;
        } else {
                /*
                 * check if we need to unboost the queue
                 */
                if (cfqq->ioprio_class != cfqq->org_ioprio_class)
                        cfqq->ioprio_class = cfqq->org_ioprio_class;
                if (cfqq->ioprio != cfqq->org_ioprio)
                        cfqq->ioprio = cfqq->org_ioprio;
        }
}

static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
        if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
                cfq_mark_cfqq_must_alloc_slice(cfqq);
                return ELV_MQUEUE_MUST;
        }

        return ELV_MQUEUE_MAY;
}

static int cfq_may_queue(struct request_queue *q, int rw)
{
        struct cfq_data *cfqd = q->elevator->elevator_data;
        struct task_struct *tsk = current;
        struct cfq_io_context *cic;
        struct cfq_queue *cfqq;

        /*
         * don't force setup of a queue from here, as a call to may_queue
         * does not necessarily imply that a request actually will be queued.
         * so just look up a possibly existing queue, or return 'may queue'
         * if that fails
         */
        cic = cfq_cic_lookup(cfqd, tsk->io_context);
        if (!cic)
                return ELV_MQUEUE_MAY;

        cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
        if (cfqq) {
                cfq_init_prio_data(cfqq, cic->ioc);
                cfq_prio_boost(cfqq);

                return __cfq_may_queue(cfqq);
        }

        return ELV_MQUEUE_MAY;
}

/*
 * queue lock held here
 */
static void cfq_put_request(struct request *rq)
{
        struct cfq_queue *cfqq = RQ_CFQQ(rq);

        if (cfqq) {
                const int rw = rq_data_dir(rq);

                BUG_ON(!cfqq->allocated[rw]);
                cfqq->allocated[rw]--;

                put_io_context(RQ_CIC(rq)->ioc);

                rq->elevator_private = NULL;
                rq->elevator_private2 = NULL;

                cfq_put_queue(cfqq);
        }
}

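/*
 * Added commentary (not from the original source): cfq_set_request() below
 * and cfq_put_request() above are paired.  set_request stores the
 * cfq_io_context in rq->elevator_private and the cfq_queue in
 * rq->elevator_private2 and takes a reference on the cfqq; put_request drops
 * those references and clears the pointers once the request is finished.
 */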
/*
 * Allocate cfq data structures associated with this request.
 */
static int
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
        struct cfq_data *cfqd = q->elevator->elevator_data;
        struct cfq_io_context *cic;
        const int rw = rq_data_dir(rq);
        const int is_sync = rq_is_sync(rq);
        struct cfq_queue *cfqq;
        unsigned long flags;

        might_sleep_if(gfp_mask & __GFP_WAIT);

        cic = cfq_get_io_context(cfqd, gfp_mask);

        spin_lock_irqsave(q->queue_lock, flags);

        if (!cic)
                goto queue_fail;

        cfqq = cic_to_cfqq(cic, is_sync);
        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
                cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
                cic_set_cfqq(cic, cfqq, is_sync);
        }

        cfqq->allocated[rw]++;
        atomic_inc(&cfqq->ref);

        spin_unlock_irqrestore(q->queue_lock, flags);

        rq->elevator_private = cic;
        rq->elevator_private2 = cfqq;
        return 0;

queue_fail:
        if (cic)
                put_io_context(cic->ioc);

        cfq_schedule_dispatch(cfqd);
        spin_unlock_irqrestore(q->queue_lock, flags);
        cfq_log(cfqd, "set_request fail");
        return 1;
}

static void cfq_kick_queue(struct work_struct *work)
{
        struct cfq_data *cfqd =
                container_of(work, struct cfq_data, unplug_work);
        struct request_queue *q = cfqd->queue;

        spin_lock_irq(q->queue_lock);
        __blk_run_queue(cfqd->queue);
        spin_unlock_irq(q->queue_lock);
}

/*
 * Timer running if the active_queue is currently idling inside its time slice
 */
static void cfq_idle_slice_timer(unsigned long data)
{
        struct cfq_data *cfqd = (struct cfq_data *) data;
        struct cfq_queue *cfqq;
        unsigned long flags;
        int timed_out = 1;

        cfq_log(cfqd, "idle timer fired");

        spin_lock_irqsave(cfqd->queue->queue_lock, flags);

        cfqq = cfqd->active_queue;
        if (cfqq) {
                timed_out = 0;

                /*
                 * We saw a request before the queue expired, let it through
                 */
                if (cfq_cfqq_must_dispatch(cfqq))
                        goto out_kick;

                /*
                 * expired
                 */
                if (cfq_slice_used(cfqq))
                        goto expire;

                /*
                 * only expire and reinvoke the request handler if there are
                 * other queues with pending requests
                 */
                if (!cfqd->busy_queues)
                        goto out_cont;

                /*
                 * not expired and it has a request pending, let it dispatch
                 */
                if (!RB_EMPTY_ROOT(&cfqq->sort_list))
                        goto out_kick;
        }
expire:
        cfq_slice_expired(cfqd, timed_out);
out_kick:
        cfq_schedule_dispatch(cfqd);
out_cont:
        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
{
        del_timer_sync(&cfqd->idle_slice_timer);
        cancel_work_sync(&cfqd->unplug_work);
}

static void cfq_put_async_queues(struct cfq_data *cfqd)
{
        int i;

        for (i = 0; i < IOPRIO_BE_NR; i++) {
                if (cfqd->async_cfqq[0][i])
                        cfq_put_queue(cfqd->async_cfqq[0][i]);
                if (cfqd->async_cfqq[1][i])
                        cfq_put_queue(cfqd->async_cfqq[1][i]);
        }

        if (cfqd->async_idle_cfqq)
                cfq_put_queue(cfqd->async_idle_cfqq);
}

static void cfq_exit_queue(struct elevator_queue *e)
{
        struct cfq_data *cfqd = e->elevator_data;
        struct request_queue *q = cfqd->queue;

        cfq_shutdown_timer_wq(cfqd);

        spin_lock_irq(q->queue_lock);

        if (cfqd->active_queue)
                __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
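
        /*
         * Added commentary: detach every io context still linked to this
         * queue, so that no cfq_io_context is left pointing at the cfq_data
         * we are about to free.
         */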
        while (!list_empty(&cfqd->cic_list)) {
                struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
                                                        struct cfq_io_context,
                                                        queue_list);

                __cfq_exit_single_io_context(cfqd, cic);
        }

        cfq_put_async_queues(cfqd);

        spin_unlock_irq(q->queue_lock);

        cfq_shutdown_timer_wq(cfqd);

        kfree(cfqd);
}

static void *cfq_init_queue(struct request_queue *q)
{
        struct cfq_data *cfqd;
        int i;

        cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
        if (!cfqd)
                return NULL;

        cfqd->service_tree = CFQ_RB_ROOT;

        /*
         * Not strictly needed (since RB_ROOT just clears the node and we
         * zeroed cfqd on alloc), but better be safe in case someone decides
         * to add magic to the rb code
         */
        for (i = 0; i < CFQ_PRIO_LISTS; i++)
                cfqd->prio_trees[i] = RB_ROOT;

        /*
         * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
         * Grab a permanent reference to it, so that the normal code flow
         * will not attempt to free it.
         */
        cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
        atomic_inc(&cfqd->oom_cfqq.ref);

        INIT_LIST_HEAD(&cfqd->cic_list);

        cfqd->queue = q;

        init_timer(&cfqd->idle_slice_timer);
        cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
        cfqd->idle_slice_timer.data = (unsigned long) cfqd;

        INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

        cfqd->cfq_quantum = cfq_quantum;
        cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
        cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
        cfqd->cfq_back_max = cfq_back_max;
        cfqd->cfq_back_penalty = cfq_back_penalty;
        cfqd->cfq_slice[0] = cfq_slice_async;
        cfqd->cfq_slice[1] = cfq_slice_sync;
        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
        cfqd->cfq_slice_idle = cfq_slice_idle;
        cfqd->hw_tag = 1;

        return cfqd;
}

static void cfq_slab_kill(void)
{
        /*
         * Caller already ensured that pending RCU callbacks are completed,
         * so we should have no busy allocations at this point.
         */
        if (cfq_pool)
                kmem_cache_destroy(cfq_pool);
        if (cfq_ioc_pool)
                kmem_cache_destroy(cfq_ioc_pool);
}

static int __init cfq_slab_setup(void)
{
        cfq_pool = KMEM_CACHE(cfq_queue, 0);
        if (!cfq_pool)
                goto fail;

        cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
        if (!cfq_ioc_pool)
                goto fail;

        return 0;
fail:
        cfq_slab_kill();
        return -ENOMEM;
}

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
        return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
        char *p = (char *) page;

        *var = simple_strtoul(p, &p, 10);
        return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
{                                                                       \
        struct cfq_data *cfqd = e->elevator_data;                       \
        unsigned int __data = __VAR;                                    \
        if (__CONV)                                                     \
                __data = jiffies_to_msecs(__data);                      \
        return cfq_var_show(__data, (page));                            \
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{                                                                       \
        struct cfq_data *cfqd = e->elevator_data;                       \
        unsigned int __data;                                            \
        int ret = cfq_var_store(&__data, (page), count);                \
        if (__data < (MIN))                                             \
                __data = (MIN);                                         \
        else if (__data > (MAX))                                        \
                __data = (MAX);                                         \
        if (__CONV)                                                     \
                *(__PTR) = msecs_to_jiffies(__data);                    \
        else                                                            \
                *(__PTR) = __data;                                      \
        return ret;                                                     \
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
                UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
                UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
                UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
                UINT_MAX, 0);
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
        __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
        CFQ_ATTR(quantum),
        CFQ_ATTR(fifo_expire_sync),
        CFQ_ATTR(fifo_expire_async),
        CFQ_ATTR(back_seek_max),
        CFQ_ATTR(back_seek_penalty),
        CFQ_ATTR(slice_sync),
        CFQ_ATTR(slice_async),
        CFQ_ATTR(slice_async_rq),
        CFQ_ATTR(slice_idle),
        __ATTR_NULL
};
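
/*
 * Added usage sketch (not from the original source): once cfq is the active
 * elevator for a device, the attributes above are exposed through sysfs
 * under /sys/block/<dev>/queue/iosched/.  For example, on a hypothetical
 * disk "sda":
 *
 *      cat /sys/block/sda/queue/iosched/slice_idle
 *      echo 0 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Tunables passed through __CONV above (the slice_* and fifo_expire_* knobs)
 * are shown and stored in milliseconds and converted to jiffies internally;
 * the remaining ones are stored as plain integers without unit conversion.
 */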
static struct elevator_type iosched_cfq = {
        .ops = {
                .elevator_merge_fn = cfq_merge,
                .elevator_merged_fn = cfq_merged_request,
                .elevator_merge_req_fn = cfq_merged_requests,
                .elevator_allow_merge_fn = cfq_allow_merge,
                .elevator_dispatch_fn = cfq_dispatch_requests,
                .elevator_add_req_fn = cfq_insert_request,
                .elevator_activate_req_fn = cfq_activate_request,
                .elevator_deactivate_req_fn = cfq_deactivate_request,
                .elevator_queue_empty_fn = cfq_queue_empty,
                .elevator_completed_req_fn = cfq_completed_request,
                .elevator_former_req_fn = elv_rb_former_request,
                .elevator_latter_req_fn = elv_rb_latter_request,
                .elevator_set_req_fn = cfq_set_request,
                .elevator_put_req_fn = cfq_put_request,
                .elevator_may_queue_fn = cfq_may_queue,
                .elevator_init_fn = cfq_init_queue,
                .elevator_exit_fn = cfq_exit_queue,
                .trim = cfq_free_io_context,
        },
        .elevator_attrs = cfq_attrs,
        .elevator_name = "cfq",
        .elevator_owner = THIS_MODULE,
};

static int __init cfq_init(void)
{
        /*
         * could be 0 on HZ < 1000 setups
         */
        if (!cfq_slice_async)
                cfq_slice_async = 1;
        if (!cfq_slice_idle)
                cfq_slice_idle = 1;

        if (cfq_slab_setup())
                return -ENOMEM;

        elv_register(&iosched_cfq);

        return 0;
}

static void __exit cfq_exit(void)
{
        DECLARE_COMPLETION_ONSTACK(all_gone);
        elv_unregister(&iosched_cfq);
        ioc_gone = &all_gone;
        /* ioc_gone's update must be visible before reading ioc_count */
        smp_wmb();

        /*
         * this also protects us from entering cfq_slab_kill() with
         * pending RCU callbacks
         */
        if (elv_ioc_count_read(ioc_count))
                wait_for_completion(&all_gone);
        cfq_slab_kill();
}

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
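
/*
 * Added usage note (not part of the original file): cfq can be selected per
 * device at runtime, e.g.
 *
 *      echo cfq > /sys/block/sda/queue/scheduler
 *
 * ("sda" is just an example device name), or made the boot-time default with
 * the "elevator=cfq" kernel command line parameter.
 */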