cfq-iosched.c revision f2d1f0ae7851be5ebd9613a80dac139270938809
/*
 * CFQ, or complete fairness queueing, disk scheduler.
 *
 * Based on ideas from a previously unfinished io
 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 4;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)

#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq)	(struct cfq_queue *) ((rq)->elevator_private2)

static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;

static DEFINE_PER_CPU(unsigned long, ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;

	/*
	 * rr list of queues with requests and the count of them
	 */
	struct cfq_rb_root service_tree;

	/*
	 * Each priority tree is sorted by next_request position. These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	/*
	 * Used to track any pending rt requests so we can pre-empt current
	 * non-RT cfqq in service when this value is non-zero.
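	 * (The counter is kept in sync by cfq_add_cfqq_rr()/cfq_del_cfqq_rr();
	 * cfq_select_queue() checks it and, when an RT queue is waiting while
	 * a non-RT queue is active, expires the active queue as if it had
	 * timed out, so the preempted queue banks its residual slice.)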
95 */ 96 unsigned int busy_rt_queues; 97 98 int rq_in_driver; 99 int sync_flight; 100 101 /* 102 * queue-depth detection 103 */ 104 int rq_queued; 105 int hw_tag; 106 int hw_tag_samples; 107 int rq_in_driver_peak; 108 109 /* 110 * idle window management 111 */ 112 struct timer_list idle_slice_timer; 113 struct work_struct unplug_work; 114 115 struct cfq_queue *active_queue; 116 struct cfq_io_context *active_cic; 117 118 /* 119 * async queue for each priority case 120 */ 121 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 122 struct cfq_queue *async_idle_cfqq; 123 124 sector_t last_position; 125 unsigned long last_end_request; 126 127 /* 128 * tunables, see top of file 129 */ 130 unsigned int cfq_quantum; 131 unsigned int cfq_fifo_expire[2]; 132 unsigned int cfq_back_penalty; 133 unsigned int cfq_back_max; 134 unsigned int cfq_slice[2]; 135 unsigned int cfq_slice_async_rq; 136 unsigned int cfq_slice_idle; 137 138 struct list_head cic_list; 139}; 140 141/* 142 * Per process-grouping structure 143 */ 144struct cfq_queue { 145 /* reference count */ 146 atomic_t ref; 147 /* various state flags, see below */ 148 unsigned int flags; 149 /* parent cfq_data */ 150 struct cfq_data *cfqd; 151 /* service_tree member */ 152 struct rb_node rb_node; 153 /* service_tree key */ 154 unsigned long rb_key; 155 /* prio tree member */ 156 struct rb_node p_node; 157 /* prio tree root we belong to, if any */ 158 struct rb_root *p_root; 159 /* sorted list of pending requests */ 160 struct rb_root sort_list; 161 /* if fifo isn't expired, next request to serve */ 162 struct request *next_rq; 163 /* requests queued in sort_list */ 164 int queued[2]; 165 /* currently allocated requests */ 166 int allocated[2]; 167 /* fifo list of requests in sort_list */ 168 struct list_head fifo; 169 170 unsigned long slice_end; 171 long slice_resid; 172 unsigned int slice_dispatch; 173 174 /* pending metadata requests */ 175 int meta_pending; 176 /* number of requests that are on the dispatch list or inside driver */ 177 int dispatched; 178 179 /* io prio of this group */ 180 unsigned short ioprio, org_ioprio; 181 unsigned short ioprio_class, org_ioprio_class; 182 183 pid_t pid; 184}; 185 186enum cfqq_state_flags { 187 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 188 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 189 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 190 CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ 191 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 192 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 193 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 194 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 195 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 196 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 197 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 198}; 199 200#define CFQ_CFQQ_FNS(name) \ 201static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 202{ \ 203 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 204} \ 205static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 206{ \ 207 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 208} \ 209static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 210{ \ 211 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 212} 213 214CFQ_CFQQ_FNS(on_rr); 215CFQ_CFQQ_FNS(wait_request); 216CFQ_CFQQ_FNS(must_dispatch); 217CFQ_CFQQ_FNS(must_alloc); 218CFQ_CFQQ_FNS(must_alloc_slice); 219CFQ_CFQQ_FNS(fifo_expire); 
220CFQ_CFQQ_FNS(idle_window); 221CFQ_CFQQ_FNS(prio_changed); 222CFQ_CFQQ_FNS(slice_new); 223CFQ_CFQQ_FNS(sync); 224CFQ_CFQQ_FNS(coop); 225#undef CFQ_CFQQ_FNS 226 227#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 228 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 229#define cfq_log(cfqd, fmt, args...) \ 230 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 231 232static void cfq_dispatch_insert(struct request_queue *, struct request *); 233static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, 234 struct io_context *, gfp_t); 235static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 236 struct io_context *); 237 238static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 239 int is_sync) 240{ 241 return cic->cfqq[!!is_sync]; 242} 243 244static inline void cic_set_cfqq(struct cfq_io_context *cic, 245 struct cfq_queue *cfqq, int is_sync) 246{ 247 cic->cfqq[!!is_sync] = cfqq; 248} 249 250/* 251 * We regard a request as SYNC, if it's either a read or has the SYNC bit 252 * set (in which case it could also be direct WRITE). 253 */ 254static inline int cfq_bio_sync(struct bio *bio) 255{ 256 if (bio_data_dir(bio) == READ || bio_sync(bio)) 257 return 1; 258 259 return 0; 260} 261 262/* 263 * scheduler run of queue, if there are requests pending and no one in the 264 * driver that will restart queueing 265 */ 266static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 267{ 268 if (cfqd->busy_queues) { 269 cfq_log(cfqd, "schedule dispatch"); 270 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 271 } 272} 273 274static int cfq_queue_empty(struct request_queue *q) 275{ 276 struct cfq_data *cfqd = q->elevator->elevator_data; 277 278 return !cfqd->busy_queues; 279} 280 281/* 282 * Scale schedule slice based on io priority. Use the sync time slice only 283 * if a queue is marked sync and has sync io queued. A sync queue with async 284 * io only, should not get full sync slice length. 285 */ 286static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync, 287 unsigned short prio) 288{ 289 const int base_slice = cfqd->cfq_slice[sync]; 290 291 WARN_ON(prio >= IOPRIO_BE_NR); 292 293 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 294} 295 296static inline int 297cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 298{ 299 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 300} 301 302static inline void 303cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 304{ 305 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 306 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 307} 308 309/* 310 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 311 * isn't valid until the first request from the dispatch is activated 312 * and the slice time set. 313 */ 314static inline int cfq_slice_used(struct cfq_queue *cfqq) 315{ 316 if (cfq_cfqq_slice_new(cfqq)) 317 return 0; 318 if (time_before(jiffies, cfqq->slice_end)) 319 return 0; 320 321 return 1; 322} 323 324/* 325 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 326 * We choose the request that is closest to the head right now. Distance 327 * behind the head is penalized and only allowed to a certain extent. 
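 *
 * An illustrative example with the default tunables above (cfq_back_max =
 * 16384 KiB, i.e. back_max = 32768 sectors, cfq_back_penalty = 2): with the
 * head at sector 100000, a request at 101000 gets d = 1000 while a request
 * at 99000 gets d = (100000 - 99000) * 2 = 2000, so the forward one wins; a
 * request more than 32768 sectors behind the head is treated as wrapped and
 * is only chosen if the competing request has wrapped as well.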
328 */ 329static struct request * 330cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 331{ 332 sector_t last, s1, s2, d1 = 0, d2 = 0; 333 unsigned long back_max; 334#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 335#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 336 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 337 338 if (rq1 == NULL || rq1 == rq2) 339 return rq2; 340 if (rq2 == NULL) 341 return rq1; 342 343 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 344 return rq1; 345 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 346 return rq2; 347 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 348 return rq1; 349 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 350 return rq2; 351 352 s1 = rq1->sector; 353 s2 = rq2->sector; 354 355 last = cfqd->last_position; 356 357 /* 358 * by definition, 1KiB is 2 sectors 359 */ 360 back_max = cfqd->cfq_back_max * 2; 361 362 /* 363 * Strict one way elevator _except_ in the case where we allow 364 * short backward seeks which are biased as twice the cost of a 365 * similar forward seek. 366 */ 367 if (s1 >= last) 368 d1 = s1 - last; 369 else if (s1 + back_max >= last) 370 d1 = (last - s1) * cfqd->cfq_back_penalty; 371 else 372 wrap |= CFQ_RQ1_WRAP; 373 374 if (s2 >= last) 375 d2 = s2 - last; 376 else if (s2 + back_max >= last) 377 d2 = (last - s2) * cfqd->cfq_back_penalty; 378 else 379 wrap |= CFQ_RQ2_WRAP; 380 381 /* Found required data */ 382 383 /* 384 * By doing switch() on the bit mask "wrap" we avoid having to 385 * check two variables for all permutations: --> faster! 386 */ 387 switch (wrap) { 388 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 389 if (d1 < d2) 390 return rq1; 391 else if (d2 < d1) 392 return rq2; 393 else { 394 if (s1 >= s2) 395 return rq1; 396 else 397 return rq2; 398 } 399 400 case CFQ_RQ2_WRAP: 401 return rq1; 402 case CFQ_RQ1_WRAP: 403 return rq2; 404 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 405 default: 406 /* 407 * Since both rqs are wrapped, 408 * start with the one that's further behind head 409 * (--> only *one* back seek required), 410 * since back seek takes more time than forward. 
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}

/*
 * The below is leftmost cache rbtree addon
 */
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry(root->left, struct cfq_queue, rb_node);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
}

/*
 * would be nice to take fifo expire time into account as well
 */
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		 struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next = NULL, *prev = NULL;

	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return cfq_choose_req(cfqd, next, prev);
}

static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
				      struct cfq_queue *cfqq)
{
	/*
	 * just an approximation, should be ok.
	 */
	return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
		cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}

/*
 * The cfqd->service_tree holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				 int add_front)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
	unsigned long rb_key;
	int left;

	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
		parent = rb_last(&cfqd->service_tree.rb);
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
		rb_key += cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else
		rb_key = 0;

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		/*
		 * same position, nothing more to do
		 */
		if (rb_key == cfqq->rb_key)
			return;

		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
	}

	left = 1;
	parent = NULL;
	p = &cfqd->service_tree.rb.rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

		/*
		 * sort RT queues first, we always want to give
		 * preference to them. IDLE queues go to the back.
		 * After that, sort on the next service time.
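		 * Note that equal keys fall through to the right branch, so
		 * a queue (re)inserted with a key already present is served
		 * after the queues that were there first.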
534 */ 535 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) 536 n = &(*p)->rb_left; 537 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) 538 n = &(*p)->rb_right; 539 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) 540 n = &(*p)->rb_left; 541 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) 542 n = &(*p)->rb_right; 543 else if (rb_key < __cfqq->rb_key) 544 n = &(*p)->rb_left; 545 else 546 n = &(*p)->rb_right; 547 548 if (n == &(*p)->rb_right) 549 left = 0; 550 551 p = n; 552 } 553 554 if (left) 555 cfqd->service_tree.left = &cfqq->rb_node; 556 557 cfqq->rb_key = rb_key; 558 rb_link_node(&cfqq->rb_node, parent, p); 559 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 560} 561 562static struct cfq_queue * 563cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 564 sector_t sector, struct rb_node **ret_parent, 565 struct rb_node ***rb_link) 566{ 567 struct rb_node **p, *parent; 568 struct cfq_queue *cfqq = NULL; 569 570 parent = NULL; 571 p = &root->rb_node; 572 while (*p) { 573 struct rb_node **n; 574 575 parent = *p; 576 cfqq = rb_entry(parent, struct cfq_queue, p_node); 577 578 /* 579 * Sort strictly based on sector. Smallest to the left, 580 * largest to the right. 581 */ 582 if (sector > cfqq->next_rq->sector) 583 n = &(*p)->rb_right; 584 else if (sector < cfqq->next_rq->sector) 585 n = &(*p)->rb_left; 586 else 587 break; 588 p = n; 589 cfqq = NULL; 590 } 591 592 *ret_parent = parent; 593 if (rb_link) 594 *rb_link = p; 595 return cfqq; 596} 597 598static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 599{ 600 struct rb_node **p, *parent; 601 struct cfq_queue *__cfqq; 602 603 if (cfqq->p_root) { 604 rb_erase(&cfqq->p_node, cfqq->p_root); 605 cfqq->p_root = NULL; 606 } 607 608 if (cfq_class_idle(cfqq)) 609 return; 610 if (!cfqq->next_rq) 611 return; 612 613 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 614 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector, 615 &parent, &p); 616 if (!__cfqq) { 617 rb_link_node(&cfqq->p_node, parent, p); 618 rb_insert_color(&cfqq->p_node, cfqq->p_root); 619 } else 620 cfqq->p_root = NULL; 621} 622 623/* 624 * Update cfqq's position in the service tree. 625 */ 626static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 627{ 628 /* 629 * Resorting requires the cfqq to be on the RR list already. 630 */ 631 if (cfq_cfqq_on_rr(cfqq)) { 632 cfq_service_tree_add(cfqd, cfqq, 0); 633 cfq_prio_tree_add(cfqd, cfqq); 634 } 635} 636 637/* 638 * add to busy list of queues for service, trying to be fair in ordering 639 * the pending list according to last request service 640 */ 641static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 642{ 643 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 644 BUG_ON(cfq_cfqq_on_rr(cfqq)); 645 cfq_mark_cfqq_on_rr(cfqq); 646 cfqd->busy_queues++; 647 if (cfq_class_rt(cfqq)) 648 cfqd->busy_rt_queues++; 649 650 cfq_resort_rr_list(cfqd, cfqq); 651} 652 653/* 654 * Called when the cfqq no longer has requests pending, remove it from 655 * the service tree. 
656 */ 657static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 658{ 659 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 660 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 661 cfq_clear_cfqq_on_rr(cfqq); 662 663 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 664 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 665 if (cfqq->p_root) { 666 rb_erase(&cfqq->p_node, cfqq->p_root); 667 cfqq->p_root = NULL; 668 } 669 670 BUG_ON(!cfqd->busy_queues); 671 cfqd->busy_queues--; 672 if (cfq_class_rt(cfqq)) 673 cfqd->busy_rt_queues--; 674} 675 676/* 677 * rb tree support functions 678 */ 679static void cfq_del_rq_rb(struct request *rq) 680{ 681 struct cfq_queue *cfqq = RQ_CFQQ(rq); 682 struct cfq_data *cfqd = cfqq->cfqd; 683 const int sync = rq_is_sync(rq); 684 685 BUG_ON(!cfqq->queued[sync]); 686 cfqq->queued[sync]--; 687 688 elv_rb_del(&cfqq->sort_list, rq); 689 690 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 691 cfq_del_cfqq_rr(cfqd, cfqq); 692} 693 694static void cfq_add_rq_rb(struct request *rq) 695{ 696 struct cfq_queue *cfqq = RQ_CFQQ(rq); 697 struct cfq_data *cfqd = cfqq->cfqd; 698 struct request *__alias, *prev; 699 700 cfqq->queued[rq_is_sync(rq)]++; 701 702 /* 703 * looks a little odd, but the first insert might return an alias. 704 * if that happens, put the alias on the dispatch list 705 */ 706 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 707 cfq_dispatch_insert(cfqd->queue, __alias); 708 709 if (!cfq_cfqq_on_rr(cfqq)) 710 cfq_add_cfqq_rr(cfqd, cfqq); 711 712 /* 713 * check if this request is a better next-serve candidate 714 */ 715 prev = cfqq->next_rq; 716 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 717 718 /* 719 * adjust priority tree position, if ->next_rq changes 720 */ 721 if (prev != cfqq->next_rq) 722 cfq_prio_tree_add(cfqd, cfqq); 723 724 BUG_ON(!cfqq->next_rq); 725} 726 727static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 728{ 729 elv_rb_del(&cfqq->sort_list, rq); 730 cfqq->queued[rq_is_sync(rq)]--; 731 cfq_add_rq_rb(rq); 732} 733 734static struct request * 735cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 736{ 737 struct task_struct *tsk = current; 738 struct cfq_io_context *cic; 739 struct cfq_queue *cfqq; 740 741 cic = cfq_cic_lookup(cfqd, tsk->io_context); 742 if (!cic) 743 return NULL; 744 745 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 746 if (cfqq) { 747 sector_t sector = bio->bi_sector + bio_sectors(bio); 748 749 return elv_rb_find(&cfqq->sort_list, sector); 750 } 751 752 return NULL; 753} 754 755static void cfq_activate_request(struct request_queue *q, struct request *rq) 756{ 757 struct cfq_data *cfqd = q->elevator->elevator_data; 758 759 cfqd->rq_in_driver++; 760 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 761 cfqd->rq_in_driver); 762 763 cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; 764} 765 766static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 767{ 768 struct cfq_data *cfqd = q->elevator->elevator_data; 769 770 WARN_ON(!cfqd->rq_in_driver); 771 cfqd->rq_in_driver--; 772 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 773 cfqd->rq_in_driver); 774} 775 776static void cfq_remove_request(struct request *rq) 777{ 778 struct cfq_queue *cfqq = RQ_CFQQ(rq); 779 780 if (cfqq->next_rq == rq) 781 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 782 783 list_del_init(&rq->queuelist); 784 cfq_del_rq_rb(rq); 785 786 cfqq->cfqd->rq_queued--; 787 if (rq_is_meta(rq)) { 788 WARN_ON(!cfqq->meta_pending); 789 cfqq->meta_pending--; 790 } 791} 
792 793static int cfq_merge(struct request_queue *q, struct request **req, 794 struct bio *bio) 795{ 796 struct cfq_data *cfqd = q->elevator->elevator_data; 797 struct request *__rq; 798 799 __rq = cfq_find_rq_fmerge(cfqd, bio); 800 if (__rq && elv_rq_merge_ok(__rq, bio)) { 801 *req = __rq; 802 return ELEVATOR_FRONT_MERGE; 803 } 804 805 return ELEVATOR_NO_MERGE; 806} 807 808static void cfq_merged_request(struct request_queue *q, struct request *req, 809 int type) 810{ 811 if (type == ELEVATOR_FRONT_MERGE) { 812 struct cfq_queue *cfqq = RQ_CFQQ(req); 813 814 cfq_reposition_rq_rb(cfqq, req); 815 } 816} 817 818static void 819cfq_merged_requests(struct request_queue *q, struct request *rq, 820 struct request *next) 821{ 822 /* 823 * reposition in fifo if next is older than rq 824 */ 825 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 826 time_before(next->start_time, rq->start_time)) 827 list_move(&rq->queuelist, &next->queuelist); 828 829 cfq_remove_request(next); 830} 831 832static int cfq_allow_merge(struct request_queue *q, struct request *rq, 833 struct bio *bio) 834{ 835 struct cfq_data *cfqd = q->elevator->elevator_data; 836 struct cfq_io_context *cic; 837 struct cfq_queue *cfqq; 838 839 /* 840 * Disallow merge of a sync bio into an async request. 841 */ 842 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 843 return 0; 844 845 /* 846 * Lookup the cfqq that this bio will be queued with. Allow 847 * merge only if rq is queued there. 848 */ 849 cic = cfq_cic_lookup(cfqd, current->io_context); 850 if (!cic) 851 return 0; 852 853 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 854 if (cfqq == RQ_CFQQ(rq)) 855 return 1; 856 857 return 0; 858} 859 860static void __cfq_set_active_queue(struct cfq_data *cfqd, 861 struct cfq_queue *cfqq) 862{ 863 if (cfqq) { 864 cfq_log_cfqq(cfqd, cfqq, "set_active"); 865 cfqq->slice_end = 0; 866 cfqq->slice_dispatch = 0; 867 868 cfq_clear_cfqq_wait_request(cfqq); 869 cfq_clear_cfqq_must_dispatch(cfqq); 870 cfq_clear_cfqq_must_alloc_slice(cfqq); 871 cfq_clear_cfqq_fifo_expire(cfqq); 872 cfq_mark_cfqq_slice_new(cfqq); 873 874 del_timer(&cfqd->idle_slice_timer); 875 } 876 877 cfqd->active_queue = cfqq; 878} 879 880/* 881 * current cfqq expired its slice (or was too idle), select new one 882 */ 883static void 884__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 885 int timed_out) 886{ 887 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 888 889 if (cfq_cfqq_wait_request(cfqq)) 890 del_timer(&cfqd->idle_slice_timer); 891 892 cfq_clear_cfqq_wait_request(cfqq); 893 894 /* 895 * store what was left of this slice, if the queue idled/timed out 896 */ 897 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 898 cfqq->slice_resid = cfqq->slice_end - jiffies; 899 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 900 } 901 902 cfq_resort_rr_list(cfqd, cfqq); 903 904 if (cfqq == cfqd->active_queue) 905 cfqd->active_queue = NULL; 906 907 if (cfqd->active_cic) { 908 put_io_context(cfqd->active_cic->ioc); 909 cfqd->active_cic = NULL; 910 } 911} 912 913static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) 914{ 915 struct cfq_queue *cfqq = cfqd->active_queue; 916 917 if (cfqq) 918 __cfq_slice_expired(cfqd, cfqq, timed_out); 919} 920 921/* 922 * Get next queue for service. Unless we have a queue preemption, 923 * we'll simply select the first cfqq in the service tree. 
 */
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
		return NULL;

	return cfq_rb_first(&cfqd->service_tree);
}

/*
 * Get and set a new active queue for service.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	if (!cfqq) {
		cfqq = cfq_get_next_queue(cfqd);
		if (cfqq)
			cfq_clear_cfqq_coop(cfqq);
	}

	__cfq_set_active_queue(cfqd, cfqq);
	return cfqq;
}

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
	if (rq->sector >= cfqd->last_position)
		return rq->sector - cfqd->last_position;
	else
		return cfqd->last_position - rq->sector;
}

#define CIC_SEEK_THR	8 * 1024
#define CIC_SEEKY(cic)	((cic)->seek_mean > CIC_SEEK_THR)

static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
{
	struct cfq_io_context *cic = cfqd->active_cic;
	sector_t sdist = cic->seek_mean;

	if (!sample_valid(cic->seek_samples))
		sdist = CIC_SEEK_THR;

	return cfq_dist_from_last(cfqd, rq) <= sdist;
}

static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
				    struct cfq_queue *cur_cfqq)
{
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
	struct rb_node *parent, *node;
	struct cfq_queue *__cfqq;
	sector_t sector = cfqd->last_position;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
	if (__cfqq)
		return __cfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector.
	 */
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, __cfqq->next_rq))
		return __cfqq;

	if (__cfqq->next_rq->sector < sector)
		node = rb_next(&__cfqq->p_node);
	else
		node = rb_prev(&__cfqq->p_node);
	if (!node)
		return NULL;

	__cfqq = rb_entry(node, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, __cfqq->next_rq))
		return __cfqq;

	return NULL;
}

/*
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 *            closely cooperating with itself.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
 * assumption.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
					      struct cfq_queue *cur_cfqq,
					      int probe)
{
	struct cfq_queue *cfqq;

	/*
	 * A valid cfq_io_context is necessary to compare requests against
	 * the seek_mean of the current cfqq.
	 */
	if (!cfqd->active_cic)
		return NULL;

	/*
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
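	 * An illustrative case: two processes reading the same file in an
	 * interleaved fashion end up with requests in the same region, so
	 * letting the close queue run instead of idling avoids seeking away
	 * and back again.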
1040 */ 1041 cfqq = cfqq_close(cfqd, cur_cfqq); 1042 if (!cfqq) 1043 return NULL; 1044 1045 if (cfq_cfqq_coop(cfqq)) 1046 return NULL; 1047 1048 if (!probe) 1049 cfq_mark_cfqq_coop(cfqq); 1050 return cfqq; 1051} 1052 1053static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1054{ 1055 struct cfq_queue *cfqq = cfqd->active_queue; 1056 struct cfq_io_context *cic; 1057 unsigned long sl; 1058 1059 /* 1060 * SSD device without seek penalty, disable idling. But only do so 1061 * for devices that support queuing, otherwise we still have a problem 1062 * with sync vs async workloads. 1063 */ 1064 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1065 return; 1066 1067 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1068 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1069 1070 /* 1071 * idle is disabled, either manually or by past process history 1072 */ 1073 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 1074 return; 1075 1076 /* 1077 * still requests with the driver, don't idle 1078 */ 1079 if (cfqd->rq_in_driver) 1080 return; 1081 1082 /* 1083 * task has exited, don't wait 1084 */ 1085 cic = cfqd->active_cic; 1086 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1087 return; 1088 1089 cfq_mark_cfqq_wait_request(cfqq); 1090 1091 /* 1092 * we don't want to idle for seeks, but we do want to allow 1093 * fair distribution of slice time for a process doing back-to-back 1094 * seeks. so allow a little bit of time for him to submit a new rq 1095 */ 1096 sl = cfqd->cfq_slice_idle; 1097 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 1098 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1099 1100 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1101 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1102} 1103 1104/* 1105 * Move request from internal lists to the request queue dispatch list. 1106 */ 1107static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1108{ 1109 struct cfq_data *cfqd = q->elevator->elevator_data; 1110 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1111 1112 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1113 1114 cfq_remove_request(rq); 1115 cfqq->dispatched++; 1116 elv_dispatch_sort(q, rq); 1117 1118 if (cfq_cfqq_sync(cfqq)) 1119 cfqd->sync_flight++; 1120} 1121 1122/* 1123 * return expired entry, or NULL to just start from scratch in rbtree 1124 */ 1125static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1126{ 1127 struct cfq_data *cfqd = cfqq->cfqd; 1128 struct request *rq; 1129 int fifo; 1130 1131 if (cfq_cfqq_fifo_expire(cfqq)) 1132 return NULL; 1133 1134 cfq_mark_cfqq_fifo_expire(cfqq); 1135 1136 if (list_empty(&cfqq->fifo)) 1137 return NULL; 1138 1139 fifo = cfq_cfqq_sync(cfqq); 1140 rq = rq_entry_fifo(cfqq->fifo.next); 1141 1142 if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) 1143 rq = NULL; 1144 1145 cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq); 1146 return rq; 1147} 1148 1149static inline int 1150cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1151{ 1152 const int base_rq = cfqd->cfq_slice_async_rq; 1153 1154 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1155 1156 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1157} 1158 1159/* 1160 * Select a queue for service. If we have a current active queue, 1161 * check whether to continue servicing it, or retrieve and set a new one. 
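 *
 * In rough order: expire the active queue if its slice is used up, preempt
 * it if an RT queue is waiting, keep it while it still has queued requests,
 * hand over to a close cooperator if one exists, keep waiting if we are
 * idling for a new request from it, and otherwise expire it and take the
 * leftmost queue from the service tree.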
1162 */ 1163static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1164{ 1165 struct cfq_queue *cfqq, *new_cfqq = NULL; 1166 1167 cfqq = cfqd->active_queue; 1168 if (!cfqq) 1169 goto new_queue; 1170 1171 /* 1172 * The active queue has run out of time, expire it and select new. 1173 */ 1174 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 1175 goto expire; 1176 1177 /* 1178 * If we have a RT cfqq waiting, then we pre-empt the current non-rt 1179 * cfqq. 1180 */ 1181 if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { 1182 /* 1183 * We simulate this as cfqq timed out so that it gets to bank 1184 * the remaining of its time slice. 1185 */ 1186 cfq_log_cfqq(cfqd, cfqq, "preempt"); 1187 cfq_slice_expired(cfqd, 1); 1188 goto new_queue; 1189 } 1190 1191 /* 1192 * The active queue has requests and isn't expired, allow it to 1193 * dispatch. 1194 */ 1195 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 1196 goto keep_queue; 1197 1198 /* 1199 * If another queue has a request waiting within our mean seek 1200 * distance, let it run. The expire code will check for close 1201 * cooperators and put the close queue at the front of the service 1202 * tree. 1203 */ 1204 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 1205 if (new_cfqq) 1206 goto expire; 1207 1208 /* 1209 * No requests pending. If the active queue still has requests in 1210 * flight or is idling for a new request, allow either of these 1211 * conditions to happen (or time out) before selecting a new queue. 1212 */ 1213 if (timer_pending(&cfqd->idle_slice_timer) || 1214 (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { 1215 cfqq = NULL; 1216 goto keep_queue; 1217 } 1218 1219expire: 1220 cfq_slice_expired(cfqd, 0); 1221new_queue: 1222 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 1223keep_queue: 1224 return cfqq; 1225} 1226 1227static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 1228{ 1229 int dispatched = 0; 1230 1231 while (cfqq->next_rq) { 1232 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 1233 dispatched++; 1234 } 1235 1236 BUG_ON(!list_empty(&cfqq->fifo)); 1237 return dispatched; 1238} 1239 1240/* 1241 * Drain our current requests. Used for barriers and when switching 1242 * io schedulers on-the-fly. 1243 */ 1244static int cfq_forced_dispatch(struct cfq_data *cfqd) 1245{ 1246 struct cfq_queue *cfqq; 1247 int dispatched = 0; 1248 1249 while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) 1250 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 1251 1252 cfq_slice_expired(cfqd, 0); 1253 1254 BUG_ON(cfqd->busy_queues); 1255 1256 cfq_log(cfqd, "forced_dispatch=%d\n", dispatched); 1257 return dispatched; 1258} 1259 1260/* 1261 * Dispatch a request from cfqq, moving them to the request queue 1262 * dispatch list. 
1263 */ 1264static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1265{ 1266 struct request *rq; 1267 1268 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 1269 1270 /* 1271 * follow expired path, else get first next available 1272 */ 1273 rq = cfq_check_fifo(cfqq); 1274 if (!rq) 1275 rq = cfqq->next_rq; 1276 1277 /* 1278 * insert request into driver dispatch list 1279 */ 1280 cfq_dispatch_insert(cfqd->queue, rq); 1281 1282 if (!cfqd->active_cic) { 1283 struct cfq_io_context *cic = RQ_CIC(rq); 1284 1285 atomic_inc(&cic->ioc->refcount); 1286 cfqd->active_cic = cic; 1287 } 1288} 1289 1290/* 1291 * Find the cfqq that we need to service and move a request from that to the 1292 * dispatch list 1293 */ 1294static int cfq_dispatch_requests(struct request_queue *q, int force) 1295{ 1296 struct cfq_data *cfqd = q->elevator->elevator_data; 1297 struct cfq_queue *cfqq; 1298 unsigned int max_dispatch; 1299 1300 if (!cfqd->busy_queues) 1301 return 0; 1302 1303 if (unlikely(force)) 1304 return cfq_forced_dispatch(cfqd); 1305 1306 cfqq = cfq_select_queue(cfqd); 1307 if (!cfqq) 1308 return 0; 1309 1310 /* 1311 * If this is an async queue and we have sync IO in flight, let it wait 1312 */ 1313 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 1314 return 0; 1315 1316 max_dispatch = cfqd->cfq_quantum; 1317 if (cfq_class_idle(cfqq)) 1318 max_dispatch = 1; 1319 1320 /* 1321 * Does this cfqq already have too much IO in flight? 1322 */ 1323 if (cfqq->dispatched >= max_dispatch) { 1324 /* 1325 * idle queue must always only have a single IO in flight 1326 */ 1327 if (cfq_class_idle(cfqq)) 1328 return 0; 1329 1330 /* 1331 * We have other queues, don't allow more IO from this one 1332 */ 1333 if (cfqd->busy_queues > 1) 1334 return 0; 1335 1336 /* 1337 * we are the only queue, allow up to 4 times of 'quantum' 1338 */ 1339 if (cfqq->dispatched >= 4 * max_dispatch) 1340 return 0; 1341 } 1342 1343 /* 1344 * Dispatch a request from this cfqq 1345 */ 1346 cfq_dispatch_request(cfqd, cfqq); 1347 cfqq->slice_dispatch++; 1348 cfq_clear_cfqq_must_dispatch(cfqq); 1349 1350 /* 1351 * expire an async queue immediately if it has used up its slice. idle 1352 * queue always expire after 1 dispatch round. 1353 */ 1354 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 1355 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 1356 cfq_class_idle(cfqq))) { 1357 cfqq->slice_end = jiffies + 1; 1358 cfq_slice_expired(cfqd, 0); 1359 } 1360 1361 cfq_log(cfqd, "dispatched a request"); 1362 return 1; 1363} 1364 1365/* 1366 * task holds one reference to the queue, dropped when task exits. each rq 1367 * in-flight on this queue also holds a reference, dropped when rq is freed. 1368 * 1369 * queue lock must be held here. 
1370 */ 1371static void cfq_put_queue(struct cfq_queue *cfqq) 1372{ 1373 struct cfq_data *cfqd = cfqq->cfqd; 1374 1375 BUG_ON(atomic_read(&cfqq->ref) <= 0); 1376 1377 if (!atomic_dec_and_test(&cfqq->ref)) 1378 return; 1379 1380 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 1381 BUG_ON(rb_first(&cfqq->sort_list)); 1382 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 1383 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1384 1385 if (unlikely(cfqd->active_queue == cfqq)) { 1386 __cfq_slice_expired(cfqd, cfqq, 0); 1387 cfq_schedule_dispatch(cfqd); 1388 } 1389 1390 kmem_cache_free(cfq_pool, cfqq); 1391} 1392 1393/* 1394 * Must always be called with the rcu_read_lock() held 1395 */ 1396static void 1397__call_for_each_cic(struct io_context *ioc, 1398 void (*func)(struct io_context *, struct cfq_io_context *)) 1399{ 1400 struct cfq_io_context *cic; 1401 struct hlist_node *n; 1402 1403 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 1404 func(ioc, cic); 1405} 1406 1407/* 1408 * Call func for each cic attached to this ioc. 1409 */ 1410static void 1411call_for_each_cic(struct io_context *ioc, 1412 void (*func)(struct io_context *, struct cfq_io_context *)) 1413{ 1414 rcu_read_lock(); 1415 __call_for_each_cic(ioc, func); 1416 rcu_read_unlock(); 1417} 1418 1419static void cfq_cic_free_rcu(struct rcu_head *head) 1420{ 1421 struct cfq_io_context *cic; 1422 1423 cic = container_of(head, struct cfq_io_context, rcu_head); 1424 1425 kmem_cache_free(cfq_ioc_pool, cic); 1426 elv_ioc_count_dec(ioc_count); 1427 1428 if (ioc_gone) { 1429 /* 1430 * CFQ scheduler is exiting, grab exit lock and check 1431 * the pending io context count. If it hits zero, 1432 * complete ioc_gone and set it back to NULL 1433 */ 1434 spin_lock(&ioc_gone_lock); 1435 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 1436 complete(ioc_gone); 1437 ioc_gone = NULL; 1438 } 1439 spin_unlock(&ioc_gone_lock); 1440 } 1441} 1442 1443static void cfq_cic_free(struct cfq_io_context *cic) 1444{ 1445 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 1446} 1447 1448static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 1449{ 1450 unsigned long flags; 1451 1452 BUG_ON(!cic->dead_key); 1453 1454 spin_lock_irqsave(&ioc->lock, flags); 1455 radix_tree_delete(&ioc->radix_root, cic->dead_key); 1456 hlist_del_rcu(&cic->cic_list); 1457 spin_unlock_irqrestore(&ioc->lock, flags); 1458 1459 cfq_cic_free(cic); 1460} 1461 1462/* 1463 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 1464 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 1465 * and ->trim() which is called with the task lock held 1466 */ 1467static void cfq_free_io_context(struct io_context *ioc) 1468{ 1469 /* 1470 * ioc->refcount is zero here, or we are called from elv_unregister(), 1471 * so no more cic's are allowed to be linked into this ioc. So it 1472 * should be ok to iterate over the known list, we will see all cic's 1473 * since no new ones are added. 
1474 */ 1475 __call_for_each_cic(ioc, cic_free_func); 1476} 1477 1478static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1479{ 1480 if (unlikely(cfqq == cfqd->active_queue)) { 1481 __cfq_slice_expired(cfqd, cfqq, 0); 1482 cfq_schedule_dispatch(cfqd); 1483 } 1484 1485 cfq_put_queue(cfqq); 1486} 1487 1488static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 1489 struct cfq_io_context *cic) 1490{ 1491 struct io_context *ioc = cic->ioc; 1492 1493 list_del_init(&cic->queue_list); 1494 1495 /* 1496 * Make sure key == NULL is seen for dead queues 1497 */ 1498 smp_wmb(); 1499 cic->dead_key = (unsigned long) cic->key; 1500 cic->key = NULL; 1501 1502 if (ioc->ioc_data == cic) 1503 rcu_assign_pointer(ioc->ioc_data, NULL); 1504 1505 if (cic->cfqq[BLK_RW_ASYNC]) { 1506 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 1507 cic->cfqq[BLK_RW_ASYNC] = NULL; 1508 } 1509 1510 if (cic->cfqq[BLK_RW_SYNC]) { 1511 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 1512 cic->cfqq[BLK_RW_SYNC] = NULL; 1513 } 1514} 1515 1516static void cfq_exit_single_io_context(struct io_context *ioc, 1517 struct cfq_io_context *cic) 1518{ 1519 struct cfq_data *cfqd = cic->key; 1520 1521 if (cfqd) { 1522 struct request_queue *q = cfqd->queue; 1523 unsigned long flags; 1524 1525 spin_lock_irqsave(q->queue_lock, flags); 1526 1527 /* 1528 * Ensure we get a fresh copy of the ->key to prevent 1529 * race between exiting task and queue 1530 */ 1531 smp_read_barrier_depends(); 1532 if (cic->key) 1533 __cfq_exit_single_io_context(cfqd, cic); 1534 1535 spin_unlock_irqrestore(q->queue_lock, flags); 1536 } 1537} 1538 1539/* 1540 * The process that ioc belongs to has exited, we need to clean up 1541 * and put the internal structures we have that belongs to that process. 1542 */ 1543static void cfq_exit_io_context(struct io_context *ioc) 1544{ 1545 call_for_each_cic(ioc, cfq_exit_single_io_context); 1546} 1547 1548static struct cfq_io_context * 1549cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1550{ 1551 struct cfq_io_context *cic; 1552 1553 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 1554 cfqd->queue->node); 1555 if (cic) { 1556 cic->last_end_request = jiffies; 1557 INIT_LIST_HEAD(&cic->queue_list); 1558 INIT_HLIST_NODE(&cic->cic_list); 1559 cic->dtor = cfq_free_io_context; 1560 cic->exit = cfq_exit_io_context; 1561 elv_ioc_count_inc(ioc_count); 1562 } 1563 1564 return cic; 1565} 1566 1567static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 1568{ 1569 struct task_struct *tsk = current; 1570 int ioprio_class; 1571 1572 if (!cfq_cfqq_prio_changed(cfqq)) 1573 return; 1574 1575 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 1576 switch (ioprio_class) { 1577 default: 1578 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 1579 case IOPRIO_CLASS_NONE: 1580 /* 1581 * no prio set, inherit CPU scheduling settings 1582 */ 1583 cfqq->ioprio = task_nice_ioprio(tsk); 1584 cfqq->ioprio_class = task_nice_ioclass(tsk); 1585 break; 1586 case IOPRIO_CLASS_RT: 1587 cfqq->ioprio = task_ioprio(ioc); 1588 cfqq->ioprio_class = IOPRIO_CLASS_RT; 1589 break; 1590 case IOPRIO_CLASS_BE: 1591 cfqq->ioprio = task_ioprio(ioc); 1592 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1593 break; 1594 case IOPRIO_CLASS_IDLE: 1595 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 1596 cfqq->ioprio = 7; 1597 cfq_clear_cfqq_idle_window(cfqq); 1598 break; 1599 } 1600 1601 /* 1602 * keep track of original prio settings in case we have to temporarily 1603 * elevate the priority of this queue 1604 */ 1605 cfqq->org_ioprio 
= cfqq->ioprio; 1606 cfqq->org_ioprio_class = cfqq->ioprio_class; 1607 cfq_clear_cfqq_prio_changed(cfqq); 1608} 1609 1610static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 1611{ 1612 struct cfq_data *cfqd = cic->key; 1613 struct cfq_queue *cfqq; 1614 unsigned long flags; 1615 1616 if (unlikely(!cfqd)) 1617 return; 1618 1619 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1620 1621 cfqq = cic->cfqq[BLK_RW_ASYNC]; 1622 if (cfqq) { 1623 struct cfq_queue *new_cfqq; 1624 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 1625 GFP_ATOMIC); 1626 if (new_cfqq) { 1627 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 1628 cfq_put_queue(cfqq); 1629 } 1630 } 1631 1632 cfqq = cic->cfqq[BLK_RW_SYNC]; 1633 if (cfqq) 1634 cfq_mark_cfqq_prio_changed(cfqq); 1635 1636 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1637} 1638 1639static void cfq_ioc_set_ioprio(struct io_context *ioc) 1640{ 1641 call_for_each_cic(ioc, changed_ioprio); 1642 ioc->ioprio_changed = 0; 1643} 1644 1645static struct cfq_queue * 1646cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, 1647 struct io_context *ioc, gfp_t gfp_mask) 1648{ 1649 struct cfq_queue *cfqq, *new_cfqq = NULL; 1650 struct cfq_io_context *cic; 1651 1652retry: 1653 cic = cfq_cic_lookup(cfqd, ioc); 1654 /* cic always exists here */ 1655 cfqq = cic_to_cfqq(cic, is_sync); 1656 1657 if (!cfqq) { 1658 if (new_cfqq) { 1659 cfqq = new_cfqq; 1660 new_cfqq = NULL; 1661 } else if (gfp_mask & __GFP_WAIT) { 1662 /* 1663 * Inform the allocator of the fact that we will 1664 * just repeat this allocation if it fails, to allow 1665 * the allocator to do whatever it needs to attempt to 1666 * free memory. 1667 */ 1668 spin_unlock_irq(cfqd->queue->queue_lock); 1669 new_cfqq = kmem_cache_alloc_node(cfq_pool, 1670 gfp_mask | __GFP_NOFAIL | __GFP_ZERO, 1671 cfqd->queue->node); 1672 spin_lock_irq(cfqd->queue->queue_lock); 1673 goto retry; 1674 } else { 1675 cfqq = kmem_cache_alloc_node(cfq_pool, 1676 gfp_mask | __GFP_ZERO, 1677 cfqd->queue->node); 1678 if (!cfqq) 1679 goto out; 1680 } 1681 1682 RB_CLEAR_NODE(&cfqq->rb_node); 1683 RB_CLEAR_NODE(&cfqq->p_node); 1684 INIT_LIST_HEAD(&cfqq->fifo); 1685 1686 atomic_set(&cfqq->ref, 0); 1687 cfqq->cfqd = cfqd; 1688 1689 cfq_mark_cfqq_prio_changed(cfqq); 1690 1691 cfq_init_prio_data(cfqq, ioc); 1692 1693 if (is_sync) { 1694 if (!cfq_class_idle(cfqq)) 1695 cfq_mark_cfqq_idle_window(cfqq); 1696 cfq_mark_cfqq_sync(cfqq); 1697 } 1698 cfqq->pid = current->pid; 1699 cfq_log_cfqq(cfqd, cfqq, "alloced"); 1700 } 1701 1702 if (new_cfqq) 1703 kmem_cache_free(cfq_pool, new_cfqq); 1704 1705out: 1706 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); 1707 return cfqq; 1708} 1709 1710static struct cfq_queue ** 1711cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 1712{ 1713 switch (ioprio_class) { 1714 case IOPRIO_CLASS_RT: 1715 return &cfqd->async_cfqq[0][ioprio]; 1716 case IOPRIO_CLASS_BE: 1717 return &cfqd->async_cfqq[1][ioprio]; 1718 case IOPRIO_CLASS_IDLE: 1719 return &cfqd->async_idle_cfqq; 1720 default: 1721 BUG(); 1722 } 1723} 1724 1725static struct cfq_queue * 1726cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, 1727 gfp_t gfp_mask) 1728{ 1729 const int ioprio = task_ioprio(ioc); 1730 const int ioprio_class = task_ioprio_class(ioc); 1731 struct cfq_queue **async_cfqq = NULL; 1732 struct cfq_queue *cfqq = NULL; 1733 1734 if (!is_sync) { 1735 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 1736 cfqq = *async_cfqq; 1737 } 1738 1739 if (!cfqq) { 1740 cfqq = 
cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 1741 if (!cfqq) 1742 return NULL; 1743 } 1744 1745 /* 1746 * pin the queue now that it's allocated, scheduler exit will prune it 1747 */ 1748 if (!is_sync && !(*async_cfqq)) { 1749 atomic_inc(&cfqq->ref); 1750 *async_cfqq = cfqq; 1751 } 1752 1753 atomic_inc(&cfqq->ref); 1754 return cfqq; 1755} 1756 1757/* 1758 * We drop cfq io contexts lazily, so we may find a dead one. 1759 */ 1760static void 1761cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 1762 struct cfq_io_context *cic) 1763{ 1764 unsigned long flags; 1765 1766 WARN_ON(!list_empty(&cic->queue_list)); 1767 1768 spin_lock_irqsave(&ioc->lock, flags); 1769 1770 BUG_ON(ioc->ioc_data == cic); 1771 1772 radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); 1773 hlist_del_rcu(&cic->cic_list); 1774 spin_unlock_irqrestore(&ioc->lock, flags); 1775 1776 cfq_cic_free(cic); 1777} 1778 1779static struct cfq_io_context * 1780cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 1781{ 1782 struct cfq_io_context *cic; 1783 unsigned long flags; 1784 void *k; 1785 1786 if (unlikely(!ioc)) 1787 return NULL; 1788 1789 rcu_read_lock(); 1790 1791 /* 1792 * we maintain a last-hit cache, to avoid browsing over the tree 1793 */ 1794 cic = rcu_dereference(ioc->ioc_data); 1795 if (cic && cic->key == cfqd) { 1796 rcu_read_unlock(); 1797 return cic; 1798 } 1799 1800 do { 1801 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); 1802 rcu_read_unlock(); 1803 if (!cic) 1804 break; 1805 /* ->key must be copied to avoid race with cfq_exit_queue() */ 1806 k = cic->key; 1807 if (unlikely(!k)) { 1808 cfq_drop_dead_cic(cfqd, ioc, cic); 1809 rcu_read_lock(); 1810 continue; 1811 } 1812 1813 spin_lock_irqsave(&ioc->lock, flags); 1814 rcu_assign_pointer(ioc->ioc_data, cic); 1815 spin_unlock_irqrestore(&ioc->lock, flags); 1816 break; 1817 } while (1); 1818 1819 return cic; 1820} 1821 1822/* 1823 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 1824 * the process specific cfq io context when entered from the block layer. 1825 * Also adds the cic to a per-cfqd list, used when this queue is removed. 1826 */ 1827static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 1828 struct cfq_io_context *cic, gfp_t gfp_mask) 1829{ 1830 unsigned long flags; 1831 int ret; 1832 1833 ret = radix_tree_preload(gfp_mask); 1834 if (!ret) { 1835 cic->ioc = ioc; 1836 cic->key = cfqd; 1837 1838 spin_lock_irqsave(&ioc->lock, flags); 1839 ret = radix_tree_insert(&ioc->radix_root, 1840 (unsigned long) cfqd, cic); 1841 if (!ret) 1842 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 1843 spin_unlock_irqrestore(&ioc->lock, flags); 1844 1845 radix_tree_preload_end(); 1846 1847 if (!ret) { 1848 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1849 list_add(&cic->queue_list, &cfqd->cic_list); 1850 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1851 } 1852 } 1853 1854 if (ret) 1855 printk(KERN_ERR "cfq: cic link failed!\n"); 1856 1857 return ret; 1858} 1859 1860/* 1861 * Setup general io context and cfq io context. There can be several cfq 1862 * io contexts per general io context, if this process is doing io to more 1863 * than one device managed by cfq. 
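 *
 * Each (io_context, cfqd) pair gets its own cfq_io_context: it is stored in
 * ioc->radix_root keyed by the cfqd pointer (with ioc->ioc_data as a last-hit
 * cache, see cfq_cic_lookup()) and linked on cfqd->cic_list so the queue can
 * prune it on exit.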
1864 */ 1865static struct cfq_io_context * 1866cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1867{ 1868 struct io_context *ioc = NULL; 1869 struct cfq_io_context *cic; 1870 1871 might_sleep_if(gfp_mask & __GFP_WAIT); 1872 1873 ioc = get_io_context(gfp_mask, cfqd->queue->node); 1874 if (!ioc) 1875 return NULL; 1876 1877 cic = cfq_cic_lookup(cfqd, ioc); 1878 if (cic) 1879 goto out; 1880 1881 cic = cfq_alloc_io_context(cfqd, gfp_mask); 1882 if (cic == NULL) 1883 goto err; 1884 1885 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 1886 goto err_free; 1887 1888out: 1889 smp_read_barrier_depends(); 1890 if (unlikely(ioc->ioprio_changed)) 1891 cfq_ioc_set_ioprio(ioc); 1892 1893 return cic; 1894err_free: 1895 cfq_cic_free(cic); 1896err: 1897 put_io_context(ioc); 1898 return NULL; 1899} 1900 1901static void 1902cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 1903{ 1904 unsigned long elapsed = jiffies - cic->last_end_request; 1905 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 1906 1907 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 1908 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 1909 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 1910} 1911 1912static void 1913cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 1914 struct request *rq) 1915{ 1916 sector_t sdist; 1917 u64 total; 1918 1919 if (!cic->last_request_pos) 1920 sdist = 0; 1921 else if (cic->last_request_pos < rq->sector) 1922 sdist = rq->sector - cic->last_request_pos; 1923 else 1924 sdist = cic->last_request_pos - rq->sector; 1925 1926 /* 1927 * Don't allow the seek distance to get too large from the 1928 * odd fragment, pagein, etc 1929 */ 1930 if (cic->seek_samples <= 60) /* second&third seek */ 1931 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 1932 else 1933 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 1934 1935 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 1936 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 1937 total = cic->seek_total + (cic->seek_samples/2); 1938 do_div(total, cic->seek_samples); 1939 cic->seek_mean = (sector_t)total; 1940} 1941 1942/* 1943 * Disable idle window if the process thinks too long or seeks so much that 1944 * it doesn't matter 1945 */ 1946static void 1947cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1948 struct cfq_io_context *cic) 1949{ 1950 int old_idle, enable_idle; 1951 1952 /* 1953 * Don't idle for async or idle io prio class 1954 */ 1955 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 1956 return; 1957 1958 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 1959 1960 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 1961 (cfqd->hw_tag && CIC_SEEKY(cic))) 1962 enable_idle = 0; 1963 else if (sample_valid(cic->ttime_samples)) { 1964 if (cic->ttime_mean > cfqd->cfq_slice_idle) 1965 enable_idle = 0; 1966 else 1967 enable_idle = 1; 1968 } 1969 1970 if (old_idle != enable_idle) { 1971 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 1972 if (enable_idle) 1973 cfq_mark_cfqq_idle_window(cfqq); 1974 else 1975 cfq_clear_cfqq_idle_window(cfqq); 1976 } 1977} 1978 1979/* 1980 * Check if new_cfqq should preempt the currently active queue. Return 0 for 1981 * no or if we aren't sure, a 1 will cause a preempt. 
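 *
 * For example: a sync request arriving while an async queue owns the slice,
 * a metadata request arriving while the active sync queue has none pending,
 * or an RT queue arriving while a non-RT queue is active all preempt; a new
 * idle-class queue never does.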
 */
static int
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
		   struct request *rq)
{
	struct cfq_queue *cfqq;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		return 0;

	if (cfq_slice_used(cfqq))
		return 1;

	if (cfq_class_idle(new_cfqq))
		return 0;

	if (cfq_class_idle(cfqq))
		return 1;

	/*
	 * if the new request is sync, but the currently running queue is
	 * not, let the sync request have priority.
	 */
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
		return 1;

	/*
	 * So both queues are sync. Let the new request get disk time if
	 * it's a metadata request and the current queue is doing regular IO.
	 */
	if (rq_is_meta(rq) && !cfqq->meta_pending)
		return 1;

	/*
	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
	 */
	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
		return 1;

	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
		return 0;

	/*
	 * if this request is as-good as one we would expect from the
	 * current cfqq, let it preempt
	 */
	if (cfq_rq_close(cfqd, rq))
		return 1;

	return 0;
}

/*
 * cfqq preempts the active queue. If we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "preempt");
	cfq_slice_expired(cfqd, 1);

	/*
	 * Put the new queue at the front of the current list,
	 * so we know that it will be selected next.
	 */
	BUG_ON(!cfq_cfqq_on_rr(cfqq));

	cfq_service_tree_add(cfqd, cfqq, 1);

	cfqq->slice_end = 0;
	cfq_mark_cfqq_slice_new(cfqq);
}

/*
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
 * something we should do about it.
 */
static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
{
	struct cfq_io_context *cic = RQ_CIC(rq);

	cfqd->rq_queued++;
	if (rq_is_meta(rq))
		cfqq->meta_pending++;

	cfq_update_io_thinktime(cfqd, cic);
	cfq_update_io_seektime(cfqd, cic, rq);
	cfq_update_idle_window(cfqd, cfqq, cic);

	cic->last_request_pos = rq->sector + rq->nr_sectors;

	if (cfqq == cfqd->active_queue) {
		/*
		 * Remember that we saw a request from this process, but
		 * don't start queuing just yet. Otherwise we risk seeing lots
		 * of tiny requests, because we disrupt the normal plugging
		 * and merging. If the request is already larger than a single
		 * page, let it rip immediately. For that case we assume that
		 * merging is already done. Ditto for a busy system that
		 * has other work pending, don't risk delaying until the
		 * idle timer unplug to continue working.
		 */
		if (cfq_cfqq_wait_request(cfqq)) {
			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
			    cfqd->busy_queues > 1) {
				del_timer(&cfqd->idle_slice_timer);
				blk_start_queueing(cfqd->queue);
			}
			cfq_mark_cfqq_must_dispatch(cfqq);
		}
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired its mean thinktime or this new queue
		 * has some old slice time left and is of higher priority or
		 * this new queue is RT and the current one is BE
		 */
		cfq_preempt_queue(cfqd, cfqq);
		blk_start_queueing(cfqd->queue);
	}
}

static void cfq_insert_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "insert_request");
	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);

	cfq_add_rq_rb(rq);

	list_add_tail(&rq->queuelist, &cfqq->fifo);

	cfq_rq_enqueued(cfqd, cfqq, rq);
}

/*
 * Update hw_tag based on peak queue depth over 50 samples under
 * sufficient load.
 */
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
	if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
		cfqd->rq_in_driver_peak = cfqd->rq_in_driver;

	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
		return;

	if (cfqd->hw_tag_samples++ < 50)
		return;

	if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
		cfqd->hw_tag = 1;
	else
		cfqd->hw_tag = 0;

	cfqd->hw_tag_samples = 0;
	cfqd->rq_in_driver_peak = 0;
}

static void cfq_completed_request(struct request_queue *q, struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	const int sync = rq_is_sync(rq);
	unsigned long now;

	now = jiffies;
	cfq_log_cfqq(cfqd, cfqq, "complete");

	cfq_update_hw_tag(cfqd);

	WARN_ON(!cfqd->rq_in_driver);
	WARN_ON(!cfqq->dispatched);
	cfqd->rq_in_driver--;
	cfqq->dispatched--;

	if (cfq_cfqq_sync(cfqq))
		cfqd->sync_flight--;

	if (!cfq_class_idle(cfqq))
		cfqd->last_end_request = now;

	if (sync)
		RQ_CIC(rq)->last_end_request = now;

	/*
	 * If this is the active queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (cfqd->active_queue == cfqq) {
		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

		if (cfq_cfqq_slice_new(cfqq)) {
			cfq_set_prio_slice(cfqd, cfqq);
			cfq_clear_cfqq_slice_new(cfqq);
		}
		/*
		 * If there are no requests waiting in this queue, and
		 * there are other queues ready to issue requests, AND
		 * those other queues are issuing requests within our
		 * mean seek distance, give them a chance to run instead
		 * of idling.
		 */
		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
			cfq_slice_expired(cfqd, 1);
		else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
			 sync && !rq_noidle(rq))
			cfq_arm_slice_timer(cfqd);
	}

	if (!cfqd->rq_in_driver)
		cfq_schedule_dispatch(cfqd);
}

/*
 * we temporarily boost lower priority queues if they are holding fs exclusive
 * resources.
/*
 * we temporarily boost lower priority queues if they are holding fs exclusive
 * resources. they are boosted to normal prio (CLASS_BE/4)
 */
static void cfq_prio_boost(struct cfq_queue *cfqq)
{
	if (has_fs_excl()) {
		/*
		 * boost idle prio on transactions that would lock out other
		 * users of the filesystem
		 */
		if (cfq_class_idle(cfqq))
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
		if (cfqq->ioprio > IOPRIO_NORM)
			cfqq->ioprio = IOPRIO_NORM;
	} else {
		/*
		 * check if we need to unboost the queue
		 */
		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
			cfqq->ioprio_class = cfqq->org_ioprio_class;
		if (cfqq->ioprio != cfqq->org_ioprio)
			cfqq->ioprio = cfqq->org_ioprio;
	}
}

static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
	    !cfq_cfqq_must_alloc_slice(cfqq)) {
		cfq_mark_cfqq_must_alloc_slice(cfqq);
		return ELV_MQUEUE_MUST;
	}

	return ELV_MQUEUE_MAY;
}

static int cfq_may_queue(struct request_queue *q, int rw)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	/*
	 * don't force setup of a queue from here, as a call to may_queue
	 * does not necessarily imply that a request actually will be queued.
	 * so just lookup a possibly existing queue, or return 'may queue'
	 * if that fails
	 */
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	if (!cic)
		return ELV_MQUEUE_MAY;

	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
	if (cfqq) {
		cfq_init_prio_data(cfqq, cic->ioc);
		cfq_prio_boost(cfqq);

		return __cfq_may_queue(cfqq);
	}

	return ELV_MQUEUE_MAY;
}

/*
 * queue lock held here
 */
static void cfq_put_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq) {
		const int rw = rq_data_dir(rq);

		BUG_ON(!cfqq->allocated[rw]);
		cfqq->allocated[rw]--;

		put_io_context(RQ_CIC(rq)->ioc);

		rq->elevator_private = NULL;
		rq->elevator_private2 = NULL;

		cfq_put_queue(cfqq);
	}
}
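
/*
 * cfq_put_request() above and cfq_set_request() below form a pair:
 * set_request looks up (or creates) the cfqq for this context, stores the
 * cfq_io_context in rq->elevator_private and the cfqq in
 * rq->elevator_private2, and takes a reference on the cfqq; put_request
 * drops the io_context reference and the cfqq reference again once the
 * request goes away.
 */
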
/*
 * Allocate cfq data structures associated with this request.
 */
static int
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_io_context *cic;
	const int rw = rq_data_dir(rq);
	const int is_sync = rq_is_sync(rq);
	struct cfq_queue *cfqq;
	unsigned long flags;

	might_sleep_if(gfp_mask & __GFP_WAIT);

	cic = cfq_get_io_context(cfqd, gfp_mask);

	spin_lock_irqsave(q->queue_lock, flags);

	if (!cic)
		goto queue_fail;

	cfqq = cic_to_cfqq(cic, is_sync);
	if (!cfqq) {
		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);

		if (!cfqq)
			goto queue_fail;

		cic_set_cfqq(cic, cfqq, is_sync);
	}

	cfqq->allocated[rw]++;
	cfq_clear_cfqq_must_alloc(cfqq);
	atomic_inc(&cfqq->ref);

	spin_unlock_irqrestore(q->queue_lock, flags);

	rq->elevator_private = cic;
	rq->elevator_private2 = cfqq;
	return 0;

queue_fail:
	if (cic)
		put_io_context(cic->ioc);

	cfq_schedule_dispatch(cfqd);
	spin_unlock_irqrestore(q->queue_lock, flags);
	cfq_log(cfqd, "set_request fail");
	return 1;
}

static void cfq_kick_queue(struct work_struct *work)
{
	struct cfq_data *cfqd =
		container_of(work, struct cfq_data, unplug_work);
	struct request_queue *q = cfqd->queue;

	spin_lock_irq(q->queue_lock);
	blk_start_queueing(q);
	spin_unlock_irq(q->queue_lock);
}

/*
 * Timer running if the active_queue is currently idling inside its time slice
 */
static void cfq_idle_slice_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;
	struct cfq_queue *cfqq;
	unsigned long flags;
	int timed_out = 1;

	cfq_log(cfqd, "idle timer fired");

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	cfqq = cfqd->active_queue;
	if (cfqq) {
		timed_out = 0;

		/*
		 * We saw a request before the queue expired, let it through
		 */
		if (cfq_cfqq_must_dispatch(cfqq))
			goto out_kick;

		/*
		 * expired
		 */
		if (cfq_slice_used(cfqq))
			goto expire;

		/*
		 * only expire and reinvoke request handler, if there are
		 * other queues with pending requests
		 */
		if (!cfqd->busy_queues)
			goto out_cont;

		/*
		 * not expired and it has a request pending, let it dispatch
		 */
		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
			goto out_kick;
	}
expire:
	cfq_slice_expired(cfqd, timed_out);
out_kick:
	cfq_schedule_dispatch(cfqd);
out_cont:
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
{
	del_timer_sync(&cfqd->idle_slice_timer);
	cancel_work_sync(&cfqd->unplug_work);
}

static void cfq_put_async_queues(struct cfq_data *cfqd)
{
	int i;

	for (i = 0; i < IOPRIO_BE_NR; i++) {
		if (cfqd->async_cfqq[0][i])
			cfq_put_queue(cfqd->async_cfqq[0][i]);
		if (cfqd->async_cfqq[1][i])
			cfq_put_queue(cfqd->async_cfqq[1][i]);
	}

	if (cfqd->async_idle_cfqq)
		cfq_put_queue(cfqd->async_idle_cfqq);
}
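
/*
 * Elevator teardown. cfq_exit_queue() runs the bulk of its work under the
 * queue lock: the active slice (if any) is expired, every cfq_io_context
 * still linked on cic_list is detached, and the cached async queues are
 * dropped. The idle timer and unplug work are cancelled before the lock is
 * taken and once more at the very end, in case anything re-armed them
 * while exiting.
 */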
static void cfq_exit_queue(struct elevator_queue *e)
{
	struct cfq_data *cfqd = e->elevator_data;
	struct request_queue *q = cfqd->queue;

	cfq_shutdown_timer_wq(cfqd);

	spin_lock_irq(q->queue_lock);

	if (cfqd->active_queue)
		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

	while (!list_empty(&cfqd->cic_list)) {
		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
							struct cfq_io_context,
							queue_list);

		__cfq_exit_single_io_context(cfqd, cic);
	}

	cfq_put_async_queues(cfqd);

	spin_unlock_irq(q->queue_lock);

	cfq_shutdown_timer_wq(cfqd);

	kfree(cfqd);
}

static void *cfq_init_queue(struct request_queue *q)
{
	struct cfq_data *cfqd;
	int i;

	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
	if (!cfqd)
		return NULL;

	cfqd->service_tree = CFQ_RB_ROOT;

	/*
	 * Not strictly needed (since RB_ROOT just clears the node and we
	 * zeroed cfqd on alloc), but better be safe in case someone decides
	 * to add magic to the rb code
	 */
	for (i = 0; i < CFQ_PRIO_LISTS; i++)
		cfqd->prio_trees[i] = RB_ROOT;

	INIT_LIST_HEAD(&cfqd->cic_list);

	cfqd->queue = q;

	init_timer(&cfqd->idle_slice_timer);
	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
	cfqd->idle_slice_timer.data = (unsigned long) cfqd;

	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

	cfqd->last_end_request = jiffies;
	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
	cfqd->cfq_back_max = cfq_back_max;
	cfqd->cfq_back_penalty = cfq_back_penalty;
	cfqd->cfq_slice[0] = cfq_slice_async;
	cfqd->cfq_slice[1] = cfq_slice_sync;
	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
	cfqd->cfq_slice_idle = cfq_slice_idle;
	cfqd->hw_tag = 1;

	return cfqd;
}
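
/*
 * For reference (assuming HZ == 1000), the defaults copied above work out
 * to a 100ms sync slice (HZ / 10), a 40ms async slice (HZ / 25), an 8ms
 * idle window (HZ / 125), and sync/async fifo expiry times of 125ms and
 * 250ms (HZ / 8 and HZ / 4 respectively).
 */
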
static void cfq_slab_kill(void)
{
	/*
	 * Caller already ensured that pending RCU callbacks are completed,
	 * so we should have no busy allocations at this point.
	 */
	if (cfq_pool)
		kmem_cache_destroy(cfq_pool);
	if (cfq_ioc_pool)
		kmem_cache_destroy(cfq_ioc_pool);
}

static int __init cfq_slab_setup(void)
{
	cfq_pool = KMEM_CACHE(cfq_queue, 0);
	if (!cfq_pool)
		goto fail;

	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
	if (!cfq_ioc_pool)
		goto fail;

	return 0;
fail:
	cfq_slab_kill();
	return -ENOMEM;
}

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data = __VAR;					\
	if (__CONV)							\
		__data = jiffies_to_msecs(__data);			\
	return cfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data;						\
	int ret = cfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV)							\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
		UINT_MAX, 0);
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	__ATTR_NULL
};
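
/*
 * These attributes show up under /sys/block/<dev>/queue/iosched/ while cfq
 * is the active elevator. Illustrative usage from userspace:
 *
 *	cat /sys/block/sda/queue/iosched/quantum
 *	echo 0 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Time-based tunables are converted between jiffies and milliseconds by the
 * SHOW_FUNCTION/STORE_FUNCTION helpers above (__CONV == 1), so they are
 * read and written in milliseconds.
 */
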
static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn =		cfq_merge,
		.elevator_merged_fn =		cfq_merged_request,
		.elevator_merge_req_fn =	cfq_merged_requests,
		.elevator_allow_merge_fn =	cfq_allow_merge,
		.elevator_dispatch_fn =		cfq_dispatch_requests,
		.elevator_add_req_fn =		cfq_insert_request,
		.elevator_activate_req_fn =	cfq_activate_request,
		.elevator_deactivate_req_fn =	cfq_deactivate_request,
		.elevator_queue_empty_fn =	cfq_queue_empty,
		.elevator_completed_req_fn =	cfq_completed_request,
		.elevator_former_req_fn =	elv_rb_former_request,
		.elevator_latter_req_fn =	elv_rb_latter_request,
		.elevator_set_req_fn =		cfq_set_request,
		.elevator_put_req_fn =		cfq_put_request,
		.elevator_may_queue_fn =	cfq_may_queue,
		.elevator_init_fn =		cfq_init_queue,
		.elevator_exit_fn =		cfq_exit_queue,
		.trim =				cfq_free_io_context,
	},
	.elevator_attrs =	cfq_attrs,
	.elevator_name =	"cfq",
	.elevator_owner =	THIS_MODULE,
};

static int __init cfq_init(void)
{
	/*
	 * could be 0 on HZ < 1000 setups
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

	if (cfq_slab_setup())
		return -ENOMEM;

	elv_register(&iosched_cfq);

	return 0;
}

static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();

	/*
	 * this also protects us from entering cfq_slab_kill() with
	 * pending RCU callbacks
	 */
	if (elv_ioc_count_read(ioc_count))
		wait_for_completion(&all_gone);
	cfq_slab_kill();
}

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
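
/*
 * Illustrative runtime usage (not part of the scheduler itself): once the
 * elevator is registered, cfq can be selected per block device via sysfs,
 * e.g.
 *
 *	echo cfq > /sys/block/sda/queue/scheduler
 */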