cfq-iosched.c revision 6d048f5310aa2dda2b5acd947eab3598c25e269f
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/blkdev.h> 11#include <linux/elevator.h> 12#include <linux/hash.h> 13#include <linux/rbtree.h> 14#include <linux/ioprio.h> 15 16/* 17 * tunables 18 */ 19static const int cfq_quantum = 4; /* max queue in one round of service */ 20static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 21static const int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ 22static const int cfq_back_penalty = 2; /* penalty of a backwards seek */ 23 24static const int cfq_slice_sync = HZ / 10; 25static int cfq_slice_async = HZ / 25; 26static const int cfq_slice_async_rq = 2; 27static int cfq_slice_idle = HZ / 125; 28 29#define CFQ_IDLE_GRACE (HZ / 10) 30#define CFQ_SLICE_SCALE (5) 31 32#define CFQ_KEY_ASYNC (0) 33 34/* 35 * for the hash of cfqq inside the cfqd 36 */ 37#define CFQ_QHASH_SHIFT 6 38#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) 39#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) 40 41#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) 42 43#define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) 44#define RQ_CFQQ(rq) ((rq)->elevator_private2) 45 46static struct kmem_cache *cfq_pool; 47static struct kmem_cache *cfq_ioc_pool; 48 49static DEFINE_PER_CPU(unsigned long, ioc_count); 50static struct completion *ioc_gone; 51 52#define CFQ_PRIO_LISTS IOPRIO_BE_NR 53#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 54#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 55 56#define ASYNC (0) 57#define SYNC (1) 58 59#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) 60 61#define sample_valid(samples) ((samples) > 80) 62 63/* 64 * Per block device queue structure 65 */ 66struct cfq_data { 67 request_queue_t *queue; 68 69 /* 70 * rr list of queues with requests and the count of them 71 */ 72 struct list_head rr_list[CFQ_PRIO_LISTS]; 73 struct list_head busy_rr; 74 struct list_head cur_rr; 75 struct list_head idle_rr; 76 unsigned long cur_rr_tick; 77 unsigned int busy_queues; 78 79 /* 80 * cfqq lookup hash 81 */ 82 struct hlist_head *cfq_hash; 83 84 int rq_in_driver; 85 int hw_tag; 86 87 /* 88 * idle window management 89 */ 90 struct timer_list idle_slice_timer; 91 struct work_struct unplug_work; 92 93 struct cfq_queue *active_queue; 94 struct cfq_io_context *active_cic; 95 int cur_prio, cur_end_prio; 96 unsigned long prio_time; 97 unsigned int dispatch_slice; 98 99 struct timer_list idle_class_timer; 100 101 sector_t last_position; 102 unsigned long last_end_request; 103 104 /* 105 * tunables, see top of file 106 */ 107 unsigned int cfq_quantum; 108 unsigned int cfq_fifo_expire[2]; 109 unsigned int cfq_back_penalty; 110 unsigned int cfq_back_max; 111 unsigned int cfq_slice[2]; 112 unsigned int cfq_slice_async_rq; 113 unsigned int cfq_slice_idle; 114 115 struct list_head cic_list; 116 117 sector_t new_seek_mean; 118 u64 new_seek_total; 119}; 120 121/* 122 * Per process-grouping structure 123 */ 124struct cfq_queue { 125 /* reference count */ 126 atomic_t ref; 127 /* parent cfq_data */ 128 struct cfq_data *cfqd; 129 /* cfqq lookup hash */ 130 struct hlist_node cfq_hash; 131 /* hash key */ 132 unsigned int key; 133 /* member of the rr/busy/cur/idle cfqd list */ 134 struct list_head cfq_list; 135 /* in what tick we were last serviced */ 136 unsigned long rr_tick; 137 /* sorted list of pending requests */ 138 struct rb_root sort_list; 139 /* if fifo isn't expired, next request to serve */ 140 struct request *next_rq; 141 /* requests queued in sort_list */ 142 int queued[2]; 143 /* currently allocated requests */ 144 int allocated[2]; 145 /* pending metadata requests */ 146 int meta_pending; 147 /* fifo list of requests in sort_list */ 148 struct list_head fifo; 149 150 unsigned long slice_end; 151 unsigned long service_last; 152 unsigned long slice_start; 153 long slice_resid; 154 155 /* number of requests that are on the dispatch list or inside driver */ 156 int dispatched; 157 158 /* io prio of this group */ 159 unsigned short ioprio, org_ioprio; 160 unsigned short ioprio_class, org_ioprio_class; 161 162 /* various state flags, see below */ 163 unsigned int flags; 164 165 sector_t last_request_pos; 166}; 167 168enum cfqq_state_flags { 169 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 170 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 171 CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ 172 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 173 CFQ_CFQQ_FLAG_must_dispatch, /* must dispatch, even if expired */ 174 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 175 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 176 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 177 CFQ_CFQQ_FLAG_queue_new, /* queue never been serviced */ 178 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 179}; 180 181#define CFQ_CFQQ_FNS(name) \ 182static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 183{ \ 184 cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 185} \ 186static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 187{ \ 188 cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 189} \ 190static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 191{ \ 192 return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 193} 194 195CFQ_CFQQ_FNS(on_rr); 196CFQ_CFQQ_FNS(wait_request); 197CFQ_CFQQ_FNS(must_alloc); 198CFQ_CFQQ_FNS(must_alloc_slice); 199CFQ_CFQQ_FNS(must_dispatch); 200CFQ_CFQQ_FNS(fifo_expire); 201CFQ_CFQQ_FNS(idle_window); 202CFQ_CFQQ_FNS(prio_changed); 203CFQ_CFQQ_FNS(queue_new); 204CFQ_CFQQ_FNS(slice_new); 205#undef CFQ_CFQQ_FNS 206 207static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); 208static void cfq_dispatch_insert(request_queue_t *, struct request *); 209static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); 210 211/* 212 * scheduler run of queue, if there are requests pending and no one in the 213 * driver that will restart queueing 214 */ 215static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 216{ 217 if (cfqd->busy_queues) 218 kblockd_schedule_work(&cfqd->unplug_work); 219} 220 221static int cfq_queue_empty(request_queue_t *q) 222{ 223 struct cfq_data *cfqd = q->elevator->elevator_data; 224 225 return !cfqd->busy_queues; 226} 227 228static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync) 229{ 230 /* 231 * Use the per-process queue, for read requests and syncronous writes 232 */ 233 if (!(rw & REQ_RW) || is_sync) 234 return task->pid; 235 236 return CFQ_KEY_ASYNC; 237} 238 239/* 240 * Scale schedule slice based on io priority. Use the sync time slice only 241 * if a queue is marked sync and has sync io queued. A sync queue with async 242 * io only, should not get full sync slice length. 243 */ 244static inline int 245cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 246{ 247 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; 248 249 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 250 251 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); 252} 253 254static inline void 255cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 256{ 257 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 258 cfqq->slice_end += cfqq->slice_resid; 259 260 /* 261 * Don't carry over residual for more than one slice, we only want 262 * to slightly correct the fairness. Carrying over forever would 263 * easily introduce oscillations. 264 */ 265 cfqq->slice_resid = 0; 266 267 cfqq->slice_start = jiffies; 268} 269 270/* 271 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 272 * isn't valid until the first request from the dispatch is activated 273 * and the slice time set. 274 */ 275static inline int cfq_slice_used(struct cfq_queue *cfqq) 276{ 277 if (cfq_cfqq_slice_new(cfqq)) 278 return 0; 279 if (time_before(jiffies, cfqq->slice_end)) 280 return 0; 281 282 return 1; 283} 284 285/* 286 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 287 * We choose the request that is closest to the head right now. Distance 288 * behind the head is penalized and only allowed to a certain extent. 289 */ 290static struct request * 291cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 292{ 293 sector_t last, s1, s2, d1 = 0, d2 = 0; 294 unsigned long back_max; 295#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 296#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 297 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 298 299 if (rq1 == NULL || rq1 == rq2) 300 return rq2; 301 if (rq2 == NULL) 302 return rq1; 303 304 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 305 return rq1; 306 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 307 return rq2; 308 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 309 return rq1; 310 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 311 return rq2; 312 313 s1 = rq1->sector; 314 s2 = rq2->sector; 315 316 last = cfqd->last_position; 317 318 /* 319 * by definition, 1KiB is 2 sectors 320 */ 321 back_max = cfqd->cfq_back_max * 2; 322 323 /* 324 * Strict one way elevator _except_ in the case where we allow 325 * short backward seeks which are biased as twice the cost of a 326 * similar forward seek. 327 */ 328 if (s1 >= last) 329 d1 = s1 - last; 330 else if (s1 + back_max >= last) 331 d1 = (last - s1) * cfqd->cfq_back_penalty; 332 else 333 wrap |= CFQ_RQ1_WRAP; 334 335 if (s2 >= last) 336 d2 = s2 - last; 337 else if (s2 + back_max >= last) 338 d2 = (last - s2) * cfqd->cfq_back_penalty; 339 else 340 wrap |= CFQ_RQ2_WRAP; 341 342 /* Found required data */ 343 344 /* 345 * By doing switch() on the bit mask "wrap" we avoid having to 346 * check two variables for all permutations: --> faster! 347 */ 348 switch (wrap) { 349 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 350 if (d1 < d2) 351 return rq1; 352 else if (d2 < d1) 353 return rq2; 354 else { 355 if (s1 >= s2) 356 return rq1; 357 else 358 return rq2; 359 } 360 361 case CFQ_RQ2_WRAP: 362 return rq1; 363 case CFQ_RQ1_WRAP: 364 return rq2; 365 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 366 default: 367 /* 368 * Since both rqs are wrapped, 369 * start with the one that's further behind head 370 * (--> only *one* back seek required), 371 * since back seek takes more time than forward. 372 */ 373 if (s1 <= s2) 374 return rq1; 375 else 376 return rq2; 377 } 378} 379 380/* 381 * would be nice to take fifo expire time into account as well 382 */ 383static struct request * 384cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 385 struct request *last) 386{ 387 struct rb_node *rbnext = rb_next(&last->rb_node); 388 struct rb_node *rbprev = rb_prev(&last->rb_node); 389 struct request *next = NULL, *prev = NULL; 390 391 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 392 393 if (rbprev) 394 prev = rb_entry_rq(rbprev); 395 396 if (rbnext) 397 next = rb_entry_rq(rbnext); 398 else { 399 rbnext = rb_first(&cfqq->sort_list); 400 if (rbnext && rbnext != &last->rb_node) 401 next = rb_entry_rq(rbnext); 402 } 403 404 return cfq_choose_req(cfqd, next, prev); 405} 406 407/* 408 * This function finds out where to insert a BE queue in the service hierarchy 409 */ 410static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, 411 int preempted) 412{ 413 struct list_head *list, *n; 414 struct cfq_queue *__cfqq; 415 int add_tail = 0; 416 417 /* 418 * if cfqq has requests in flight, don't allow it to be 419 * found in cfq_set_active_queue before it has finished them. 420 * this is done to increase fairness between a process that 421 * has lots of io pending vs one that only generates one 422 * sporadically or synchronously 423 */ 424 if (cfqq->dispatched) 425 list = &cfqd->busy_rr; 426 else if (cfqq->ioprio == (cfqd->cur_prio + 1) && 427 cfq_cfqq_sync(cfqq) && 428 (time_before(cfqd->prio_time, cfqq->service_last) || 429 cfq_cfqq_queue_new(cfqq) || preempted)) { 430 list = &cfqd->cur_rr; 431 add_tail = 1; 432 } else 433 list = &cfqd->rr_list[cfqq->ioprio]; 434 435 if (!cfq_cfqq_sync(cfqq) || add_tail) { 436 /* 437 * async queue always goes to the end. this wont be overly 438 * unfair to writes, as the sort of the sync queue wont be 439 * allowed to pass the async queue again. 440 */ 441 list_add_tail(&cfqq->cfq_list, list); 442 } else if (preempted || cfq_cfqq_queue_new(cfqq)) { 443 /* 444 * If this queue was preempted or is new (never been serviced), 445 * let it be added first for fairness but beind other new 446 * queues. 447 */ 448 n = list; 449 while (n->next != list) { 450 __cfqq = list_entry_cfqq(n->next); 451 if (!cfq_cfqq_queue_new(__cfqq)) 452 break; 453 454 n = n->next; 455 } 456 list_add(&cfqq->cfq_list, n); 457 } else { 458 /* 459 * sort by last service, but don't cross a new or async 460 * queue. we don't cross a new queue because it hasn't been 461 * service before, and we don't cross an async queue because 462 * it gets added to the end on expire. 463 */ 464 n = list; 465 while ((n = n->prev) != list) { 466 struct cfq_queue *__c = list_entry_cfqq(n); 467 468 if (!cfq_cfqq_sync(__c) || !__c->service_last) 469 break; 470 if (time_before(__c->service_last, cfqq->service_last)) 471 break; 472 } 473 list_add(&cfqq->cfq_list, n); 474 } 475} 476 477static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) 478{ 479 struct cfq_data *cfqd = cfqq->cfqd; 480 struct list_head *n; 481 482 /* 483 * Resorting requires the cfqq to be on the RR list already. 484 */ 485 if (!cfq_cfqq_on_rr(cfqq)) 486 return; 487 488 list_del(&cfqq->cfq_list); 489 490 if (cfq_class_rt(cfqq)) { 491 /* 492 * At to the front of the current list, but behind other 493 * RT queues. 494 */ 495 n = &cfqd->cur_rr; 496 while (n->next != &cfqd->cur_rr) 497 if (!cfq_class_rt(cfqq)) 498 break; 499 500 list_add(&cfqq->cfq_list, n); 501 } else if (cfq_class_idle(cfqq)) { 502 /* 503 * IDLE goes to the tail of the idle list 504 */ 505 list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr); 506 } else { 507 /* 508 * So we get here, ergo the queue is a regular best-effort queue 509 */ 510 cfq_resort_be_queue(cfqd, cfqq, preempted); 511 } 512} 513 514/* 515 * add to busy list of queues for service, trying to be fair in ordering 516 * the pending list according to last request service 517 */ 518static inline void 519cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 520{ 521 BUG_ON(cfq_cfqq_on_rr(cfqq)); 522 cfq_mark_cfqq_on_rr(cfqq); 523 cfqd->busy_queues++; 524 525 cfq_resort_rr_list(cfqq, 0); 526} 527 528static inline void 529cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 530{ 531 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 532 cfq_clear_cfqq_on_rr(cfqq); 533 list_del_init(&cfqq->cfq_list); 534 535 BUG_ON(!cfqd->busy_queues); 536 cfqd->busy_queues--; 537} 538 539/* 540 * rb tree support functions 541 */ 542static inline void cfq_del_rq_rb(struct request *rq) 543{ 544 struct cfq_queue *cfqq = RQ_CFQQ(rq); 545 struct cfq_data *cfqd = cfqq->cfqd; 546 const int sync = rq_is_sync(rq); 547 548 BUG_ON(!cfqq->queued[sync]); 549 cfqq->queued[sync]--; 550 551 elv_rb_del(&cfqq->sort_list, rq); 552 553 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 554 cfq_del_cfqq_rr(cfqd, cfqq); 555} 556 557static void cfq_add_rq_rb(struct request *rq) 558{ 559 struct cfq_queue *cfqq = RQ_CFQQ(rq); 560 struct cfq_data *cfqd = cfqq->cfqd; 561 struct request *__alias; 562 563 cfqq->queued[rq_is_sync(rq)]++; 564 565 /* 566 * looks a little odd, but the first insert might return an alias. 567 * if that happens, put the alias on the dispatch list 568 */ 569 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 570 cfq_dispatch_insert(cfqd->queue, __alias); 571 572 if (!cfq_cfqq_on_rr(cfqq)) 573 cfq_add_cfqq_rr(cfqd, cfqq); 574 575 /* 576 * check if this request is a better next-serve candidate 577 */ 578 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 579 BUG_ON(!cfqq->next_rq); 580} 581 582static inline void 583cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 584{ 585 elv_rb_del(&cfqq->sort_list, rq); 586 cfqq->queued[rq_is_sync(rq)]--; 587 cfq_add_rq_rb(rq); 588} 589 590static struct request * 591cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 592{ 593 struct task_struct *tsk = current; 594 pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio)); 595 struct cfq_queue *cfqq; 596 597 cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); 598 if (cfqq) { 599 sector_t sector = bio->bi_sector + bio_sectors(bio); 600 601 return elv_rb_find(&cfqq->sort_list, sector); 602 } 603 604 return NULL; 605} 606 607static void cfq_activate_request(request_queue_t *q, struct request *rq) 608{ 609 struct cfq_data *cfqd = q->elevator->elevator_data; 610 611 cfqd->rq_in_driver++; 612 613 /* 614 * If the depth is larger 1, it really could be queueing. But lets 615 * make the mark a little higher - idling could still be good for 616 * low queueing, and a low queueing number could also just indicate 617 * a SCSI mid layer like behaviour where limit+1 is often seen. 618 */ 619 if (!cfqd->hw_tag && cfqd->rq_in_driver > 4) 620 cfqd->hw_tag = 1; 621 622 cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; 623} 624 625static void cfq_deactivate_request(request_queue_t *q, struct request *rq) 626{ 627 struct cfq_data *cfqd = q->elevator->elevator_data; 628 629 WARN_ON(!cfqd->rq_in_driver); 630 cfqd->rq_in_driver--; 631} 632 633static void cfq_remove_request(struct request *rq) 634{ 635 struct cfq_queue *cfqq = RQ_CFQQ(rq); 636 637 if (cfqq->next_rq == rq) 638 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 639 640 list_del_init(&rq->queuelist); 641 cfq_del_rq_rb(rq); 642 643 if (rq_is_meta(rq)) { 644 WARN_ON(!cfqq->meta_pending); 645 cfqq->meta_pending--; 646 } 647} 648 649static int 650cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) 651{ 652 struct cfq_data *cfqd = q->elevator->elevator_data; 653 struct request *__rq; 654 655 __rq = cfq_find_rq_fmerge(cfqd, bio); 656 if (__rq && elv_rq_merge_ok(__rq, bio)) { 657 *req = __rq; 658 return ELEVATOR_FRONT_MERGE; 659 } 660 661 return ELEVATOR_NO_MERGE; 662} 663 664static void cfq_merged_request(request_queue_t *q, struct request *req, 665 int type) 666{ 667 if (type == ELEVATOR_FRONT_MERGE) { 668 struct cfq_queue *cfqq = RQ_CFQQ(req); 669 670 cfq_reposition_rq_rb(cfqq, req); 671 } 672} 673 674static void 675cfq_merged_requests(request_queue_t *q, struct request *rq, 676 struct request *next) 677{ 678 /* 679 * reposition in fifo if next is older than rq 680 */ 681 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 682 time_before(next->start_time, rq->start_time)) 683 list_move(&rq->queuelist, &next->queuelist); 684 685 cfq_remove_request(next); 686} 687 688static int cfq_allow_merge(request_queue_t *q, struct request *rq, 689 struct bio *bio) 690{ 691 struct cfq_data *cfqd = q->elevator->elevator_data; 692 const int rw = bio_data_dir(bio); 693 struct cfq_queue *cfqq; 694 pid_t key; 695 696 /* 697 * Disallow merge of a sync bio into an async request. 698 */ 699 if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq)) 700 return 0; 701 702 /* 703 * Lookup the cfqq that this bio will be queued with. Allow 704 * merge only if rq is queued there. 705 */ 706 key = cfq_queue_pid(current, rw, bio_sync(bio)); 707 cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio); 708 709 if (cfqq == RQ_CFQQ(rq)) 710 return 1; 711 712 return 0; 713} 714 715static inline void 716__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 717{ 718 if (cfqq) { 719 /* 720 * stop potential idle class queues waiting service 721 */ 722 del_timer(&cfqd->idle_class_timer); 723 724 cfqq->slice_end = 0; 725 cfq_clear_cfqq_must_alloc_slice(cfqq); 726 cfq_clear_cfqq_fifo_expire(cfqq); 727 cfq_mark_cfqq_slice_new(cfqq); 728 cfqq->rr_tick = cfqd->cur_rr_tick; 729 } 730 731 cfqd->active_queue = cfqq; 732} 733 734/* 735 * current cfqq expired its slice (or was too idle), select new one 736 */ 737static void 738__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 739 int preempted, int timed_out) 740{ 741 if (cfq_cfqq_wait_request(cfqq)) 742 del_timer(&cfqd->idle_slice_timer); 743 744 cfq_clear_cfqq_must_dispatch(cfqq); 745 cfq_clear_cfqq_wait_request(cfqq); 746 cfq_clear_cfqq_queue_new(cfqq); 747 748 /* 749 * store what was left of this slice, if the queue idled out 750 * or was preempted 751 */ 752 if (timed_out && !cfq_cfqq_slice_new(cfqq)) 753 cfqq->slice_resid = cfqq->slice_end - jiffies; 754 755 cfq_resort_rr_list(cfqq, preempted); 756 757 if (cfqq == cfqd->active_queue) 758 cfqd->active_queue = NULL; 759 760 if (cfqd->active_cic) { 761 put_io_context(cfqd->active_cic->ioc); 762 cfqd->active_cic = NULL; 763 } 764 765 cfqd->dispatch_slice = 0; 766} 767 768static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted, 769 int timed_out) 770{ 771 struct cfq_queue *cfqq = cfqd->active_queue; 772 773 if (cfqq) 774 __cfq_slice_expired(cfqd, cfqq, preempted, timed_out); 775} 776 777/* 778 * 0 779 * 0,1 780 * 0,1,2 781 * 0,1,2,3 782 * 0,1,2,3,4 783 * 0,1,2,3,4,5 784 * 0,1,2,3,4,5,6 785 * 0,1,2,3,4,5,6,7 786 */ 787static int cfq_get_next_prio_level(struct cfq_data *cfqd) 788{ 789 int prio, wrap; 790 791 prio = -1; 792 wrap = 0; 793 do { 794 int p; 795 796 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { 797 if (!list_empty(&cfqd->rr_list[p])) { 798 prio = p; 799 break; 800 } 801 } 802 803 if (prio != -1) 804 break; 805 cfqd->cur_prio = 0; 806 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { 807 cfqd->cur_end_prio = 0; 808 if (wrap) 809 break; 810 wrap = 1; 811 } 812 } while (1); 813 814 if (unlikely(prio == -1)) 815 return -1; 816 817 BUG_ON(prio >= CFQ_PRIO_LISTS); 818 819 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); 820 821 cfqd->cur_prio = prio + 1; 822 if (cfqd->cur_prio > cfqd->cur_end_prio) { 823 cfqd->cur_end_prio = cfqd->cur_prio; 824 cfqd->cur_prio = 0; 825 } 826 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { 827 cfqd->cur_prio = 0; 828 cfqd->cur_end_prio = 0; 829 } 830 831 cfqd->cur_rr_tick++; 832 cfqd->prio_time = jiffies; 833 return prio; 834} 835 836static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 837 struct request *rq) 838{ 839 if (rq->sector >= cfqd->last_position) 840 return rq->sector - cfqd->last_position; 841 else 842 return cfqd->last_position - rq->sector; 843} 844 845static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd) 846{ 847 struct cfq_queue *cfqq = NULL, *__cfqq; 848 sector_t best = -1, dist; 849 850 list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) { 851 if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq)) 852 continue; 853 854 dist = cfq_dist_from_last(cfqd, __cfqq->next_rq); 855 if (dist < best) { 856 best = dist; 857 cfqq = __cfqq; 858 } 859 } 860 861 /* 862 * Only async queue(s) available, grab first entry 863 */ 864 if (!cfqq) 865 cfqq = list_entry_cfqq(cfqd->cur_rr.next); 866 867 return cfqq; 868} 869 870static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 871{ 872 struct cfq_queue *cfqq = NULL; 873 874 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) { 875 /* 876 * if current list is non-empty, grab first entry. if it is 877 * empty, get next prio level and grab first entry then if any 878 * are spliced 879 */ 880 cfqq = cfq_get_best_queue(cfqd); 881 } else if (!list_empty(&cfqd->busy_rr)) { 882 /* 883 * If no new queues are available, check if the busy list has 884 * some before falling back to idle io. 885 */ 886 cfqq = list_entry_cfqq(cfqd->busy_rr.next); 887 } else if (!list_empty(&cfqd->idle_rr)) { 888 /* 889 * if we have idle queues and no rt or be queues had pending 890 * requests, either allow immediate service if the grace period 891 * has passed or arm the idle grace timer 892 */ 893 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; 894 895 if (time_after_eq(jiffies, end)) 896 cfqq = list_entry_cfqq(cfqd->idle_rr.next); 897 else 898 mod_timer(&cfqd->idle_class_timer, end); 899 } 900 901 return cfqq; 902} 903 904static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 905{ 906 struct cfq_queue *cfqq; 907 908 do { 909 long prio; 910 911 cfqq = cfq_get_next_queue(cfqd); 912 if (!cfqq) 913 break; 914 915 prio = cfq_prio_to_slice(cfqd, cfqq); 916 if (cfqq->slice_resid > -prio) 917 break; 918 919 cfqq->slice_resid += prio; 920 list_del_init(&cfqq->cfq_list); 921 list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]); 922 cfqq = NULL; 923 } while (1); 924 925 __cfq_set_active_queue(cfqd, cfqq); 926 return cfqq; 927} 928 929static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) 930{ 931 struct cfq_io_context *cic = cfqd->active_cic; 932 933 if (!sample_valid(cic->seek_samples)) 934 return 0; 935 936 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; 937} 938 939static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd, 940 struct cfq_queue *cur_cfqq, 941 struct list_head *list) 942{ 943 struct cfq_queue *cfqq; 944 945 list_for_each_entry(cfqq, list, cfq_list) { 946 if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq)) 947 continue; 948 949 BUG_ON(!cfqq->next_rq); 950 951 if (cfq_rq_close(cfqd, cfqq->next_rq)) 952 return cfqq; 953 } 954 955 return NULL; 956} 957 958static int cfq_close_cooperator(struct cfq_data *cfqd, 959 struct cfq_queue *cur_cfqq) 960{ 961 struct cfq_queue *cfqq; 962 963 if (!cfqd->busy_queues) 964 return 0; 965 966 /* 967 * check cur_rr and same-prio rr_list for candidates 968 */ 969 cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr); 970 if (cfqq) 971 return 1; 972 973 cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]); 974 if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick)) 975 cfqq = NULL; 976 977 return cfqq != NULL; 978} 979 980#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) 981 982static void cfq_arm_slice_timer(struct cfq_data *cfqd) 983{ 984 struct cfq_queue *cfqq = cfqd->active_queue; 985 struct cfq_io_context *cic; 986 unsigned long sl; 987 988 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 989 WARN_ON(cfq_cfqq_slice_new(cfqq)); 990 991 /* 992 * idle is disabled, either manually or by past process history 993 */ 994 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 995 return; 996 997 /* 998 * task has exited, don't wait 999 */ 1000 cic = cfqd->active_cic; 1001 if (!cic || !cic->ioc->task) 1002 return; 1003 1004 /* 1005 * See if this prio level has a good candidate 1006 */ 1007 if (cfq_close_cooperator(cfqd, cfqq)) 1008 return; 1009 1010 cfq_mark_cfqq_must_dispatch(cfqq); 1011 cfq_mark_cfqq_wait_request(cfqq); 1012 1013 /* 1014 * we don't want to idle for seeks, but we do want to allow 1015 * fair distribution of slice time for a process doing back-to-back 1016 * seeks. so allow a little bit of time for him to submit a new rq 1017 */ 1018 sl = cfqd->cfq_slice_idle; 1019 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 1020 sl = min(sl, msecs_to_jiffies(2)); 1021 1022 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1023} 1024 1025static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) 1026{ 1027 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1028 1029 cfq_remove_request(rq); 1030 cfqq->dispatched++; 1031 elv_dispatch_sort(q, rq); 1032} 1033 1034/* 1035 * return expired entry, or NULL to just start from scratch in rbtree 1036 */ 1037static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1038{ 1039 struct cfq_data *cfqd = cfqq->cfqd; 1040 struct request *rq; 1041 int fifo; 1042 1043 if (cfq_cfqq_fifo_expire(cfqq)) 1044 return NULL; 1045 1046 cfq_mark_cfqq_fifo_expire(cfqq); 1047 1048 if (list_empty(&cfqq->fifo)) 1049 return NULL; 1050 1051 fifo = cfq_cfqq_sync(cfqq); 1052 rq = rq_entry_fifo(cfqq->fifo.next); 1053 1054 if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) 1055 return NULL; 1056 1057 return rq; 1058} 1059 1060static inline int 1061cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1062{ 1063 const int base_rq = cfqd->cfq_slice_async_rq; 1064 1065 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1066 1067 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1068} 1069 1070/* 1071 * get next queue for service 1072 */ 1073static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1074{ 1075 struct cfq_queue *cfqq; 1076 1077 cfqq = cfqd->active_queue; 1078 if (!cfqq) 1079 goto new_queue; 1080 1081 /* 1082 * The active queue has run out of time, expire it and select new. 1083 */ 1084 if (cfq_slice_used(cfqq)) 1085 goto expire; 1086 1087 /* 1088 * The active queue has requests and isn't expired, allow it to 1089 * dispatch. 1090 */ 1091 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 1092 goto keep_queue; 1093 1094 /* 1095 * No requests pending. If the active queue still has requests in 1096 * flight or is idling for a new request, allow either of these 1097 * conditions to happen (or time out) before selecting a new queue. 1098 */ 1099 if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) { 1100 cfqq = NULL; 1101 goto keep_queue; 1102 } 1103 1104expire: 1105 cfq_slice_expired(cfqd, 0, 0); 1106new_queue: 1107 cfqq = cfq_set_active_queue(cfqd); 1108keep_queue: 1109 return cfqq; 1110} 1111 1112static int 1113__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1114 int max_dispatch) 1115{ 1116 int dispatched = 0; 1117 1118 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 1119 1120 do { 1121 struct request *rq; 1122 1123 /* 1124 * follow expired path, else get first next available 1125 */ 1126 if ((rq = cfq_check_fifo(cfqq)) == NULL) 1127 rq = cfqq->next_rq; 1128 1129 /* 1130 * finally, insert request into driver dispatch list 1131 */ 1132 cfq_dispatch_insert(cfqd->queue, rq); 1133 1134 cfqd->dispatch_slice++; 1135 dispatched++; 1136 1137 if (!cfqd->active_cic) { 1138 atomic_inc(&RQ_CIC(rq)->ioc->refcount); 1139 cfqd->active_cic = RQ_CIC(rq); 1140 } 1141 1142 if (RB_EMPTY_ROOT(&cfqq->sort_list)) 1143 break; 1144 1145 } while (dispatched < max_dispatch); 1146 1147 /* 1148 * expire an async queue immediately if it has used up its slice. idle 1149 * queue always expire after 1 dispatch round. 1150 */ 1151 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 1152 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || 1153 cfq_class_idle(cfqq))) { 1154 cfqq->slice_end = jiffies + 1; 1155 cfq_slice_expired(cfqd, 0, 0); 1156 } 1157 1158 return dispatched; 1159} 1160 1161static int 1162cfq_forced_dispatch_cfqqs(struct list_head *list) 1163{ 1164 struct cfq_queue *cfqq, *next; 1165 int dispatched; 1166 1167 dispatched = 0; 1168 list_for_each_entry_safe(cfqq, next, list, cfq_list) { 1169 while (cfqq->next_rq) { 1170 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 1171 dispatched++; 1172 } 1173 BUG_ON(!list_empty(&cfqq->fifo)); 1174 } 1175 1176 return dispatched; 1177} 1178 1179static int 1180cfq_forced_dispatch(struct cfq_data *cfqd) 1181{ 1182 int i, dispatched = 0; 1183 1184 for (i = 0; i < CFQ_PRIO_LISTS; i++) 1185 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]); 1186 1187 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr); 1188 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); 1189 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr); 1190 1191 cfq_slice_expired(cfqd, 0, 0); 1192 1193 BUG_ON(cfqd->busy_queues); 1194 1195 return dispatched; 1196} 1197 1198static int 1199cfq_dispatch_requests(request_queue_t *q, int force) 1200{ 1201 struct cfq_data *cfqd = q->elevator->elevator_data; 1202 struct cfq_queue *cfqq; 1203 int dispatched; 1204 1205 if (!cfqd->busy_queues) 1206 return 0; 1207 1208 if (unlikely(force)) 1209 return cfq_forced_dispatch(cfqd); 1210 1211 dispatched = 0; 1212 while ((cfqq = cfq_select_queue(cfqd)) != NULL) { 1213 int max_dispatch; 1214 1215 if (cfqd->busy_queues > 1) { 1216 /* 1217 * So we have dispatched before in this round, if the 1218 * next queue has idling enabled (must be sync), don't 1219 * allow it service until the previous have completed. 1220 */ 1221 if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) && 1222 dispatched) 1223 break; 1224 if (cfqq->dispatched >= cfqd->cfq_quantum) 1225 break; 1226 } 1227 1228 cfq_clear_cfqq_must_dispatch(cfqq); 1229 cfq_clear_cfqq_wait_request(cfqq); 1230 del_timer(&cfqd->idle_slice_timer); 1231 1232 max_dispatch = cfqd->cfq_quantum; 1233 if (cfq_class_idle(cfqq)) 1234 max_dispatch = 1; 1235 1236 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); 1237 } 1238 1239 return dispatched; 1240} 1241 1242/* 1243 * task holds one reference to the queue, dropped when task exits. each rq 1244 * in-flight on this queue also holds a reference, dropped when rq is freed. 1245 * 1246 * queue lock must be held here. 1247 */ 1248static void cfq_put_queue(struct cfq_queue *cfqq) 1249{ 1250 struct cfq_data *cfqd = cfqq->cfqd; 1251 1252 BUG_ON(atomic_read(&cfqq->ref) <= 0); 1253 1254 if (!atomic_dec_and_test(&cfqq->ref)) 1255 return; 1256 1257 BUG_ON(rb_first(&cfqq->sort_list)); 1258 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 1259 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1260 1261 if (unlikely(cfqd->active_queue == cfqq)) { 1262 __cfq_slice_expired(cfqd, cfqq, 0, 0); 1263 cfq_schedule_dispatch(cfqd); 1264 } 1265 1266 /* 1267 * it's on the empty list and still hashed 1268 */ 1269 list_del(&cfqq->cfq_list); 1270 hlist_del(&cfqq->cfq_hash); 1271 kmem_cache_free(cfq_pool, cfqq); 1272} 1273 1274static struct cfq_queue * 1275__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, 1276 const int hashval) 1277{ 1278 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; 1279 struct hlist_node *entry; 1280 struct cfq_queue *__cfqq; 1281 1282 hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) { 1283 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio); 1284 1285 if (__cfqq->key == key && (__p == prio || !prio)) 1286 return __cfqq; 1287 } 1288 1289 return NULL; 1290} 1291 1292static struct cfq_queue * 1293cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) 1294{ 1295 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); 1296} 1297 1298static void cfq_free_io_context(struct io_context *ioc) 1299{ 1300 struct cfq_io_context *__cic; 1301 struct rb_node *n; 1302 int freed = 0; 1303 1304 while ((n = rb_first(&ioc->cic_root)) != NULL) { 1305 __cic = rb_entry(n, struct cfq_io_context, rb_node); 1306 rb_erase(&__cic->rb_node, &ioc->cic_root); 1307 kmem_cache_free(cfq_ioc_pool, __cic); 1308 freed++; 1309 } 1310 1311 elv_ioc_count_mod(ioc_count, -freed); 1312 1313 if (ioc_gone && !elv_ioc_count_read(ioc_count)) 1314 complete(ioc_gone); 1315} 1316 1317static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1318{ 1319 if (unlikely(cfqq == cfqd->active_queue)) { 1320 __cfq_slice_expired(cfqd, cfqq, 0, 0); 1321 cfq_schedule_dispatch(cfqd); 1322 } 1323 1324 cfq_put_queue(cfqq); 1325} 1326 1327static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 1328 struct cfq_io_context *cic) 1329{ 1330 list_del_init(&cic->queue_list); 1331 smp_wmb(); 1332 cic->key = NULL; 1333 1334 if (cic->cfqq[ASYNC]) { 1335 cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); 1336 cic->cfqq[ASYNC] = NULL; 1337 } 1338 1339 if (cic->cfqq[SYNC]) { 1340 cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); 1341 cic->cfqq[SYNC] = NULL; 1342 } 1343} 1344 1345 1346/* 1347 * Called with interrupts disabled 1348 */ 1349static void cfq_exit_single_io_context(struct cfq_io_context *cic) 1350{ 1351 struct cfq_data *cfqd = cic->key; 1352 1353 if (cfqd) { 1354 request_queue_t *q = cfqd->queue; 1355 1356 spin_lock_irq(q->queue_lock); 1357 __cfq_exit_single_io_context(cfqd, cic); 1358 spin_unlock_irq(q->queue_lock); 1359 } 1360} 1361 1362static void cfq_exit_io_context(struct io_context *ioc) 1363{ 1364 struct cfq_io_context *__cic; 1365 struct rb_node *n; 1366 1367 /* 1368 * put the reference this task is holding to the various queues 1369 */ 1370 1371 n = rb_first(&ioc->cic_root); 1372 while (n != NULL) { 1373 __cic = rb_entry(n, struct cfq_io_context, rb_node); 1374 1375 cfq_exit_single_io_context(__cic); 1376 n = rb_next(n); 1377 } 1378} 1379 1380static struct cfq_io_context * 1381cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1382{ 1383 struct cfq_io_context *cic; 1384 1385 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node); 1386 if (cic) { 1387 memset(cic, 0, sizeof(*cic)); 1388 cic->last_end_request = jiffies; 1389 INIT_LIST_HEAD(&cic->queue_list); 1390 cic->dtor = cfq_free_io_context; 1391 cic->exit = cfq_exit_io_context; 1392 elv_ioc_count_inc(ioc_count); 1393 } 1394 1395 return cic; 1396} 1397 1398static void cfq_init_prio_data(struct cfq_queue *cfqq) 1399{ 1400 struct task_struct *tsk = current; 1401 int ioprio_class; 1402 1403 if (!cfq_cfqq_prio_changed(cfqq)) 1404 return; 1405 1406 ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); 1407 switch (ioprio_class) { 1408 default: 1409 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 1410 case IOPRIO_CLASS_NONE: 1411 /* 1412 * no prio set, place us in the middle of the BE classes 1413 */ 1414 cfqq->ioprio = task_nice_ioprio(tsk); 1415 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1416 break; 1417 case IOPRIO_CLASS_RT: 1418 cfqq->ioprio = task_ioprio(tsk); 1419 cfqq->ioprio_class = IOPRIO_CLASS_RT; 1420 break; 1421 case IOPRIO_CLASS_BE: 1422 cfqq->ioprio = task_ioprio(tsk); 1423 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1424 break; 1425 case IOPRIO_CLASS_IDLE: 1426 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 1427 cfqq->ioprio = 7; 1428 cfq_clear_cfqq_idle_window(cfqq); 1429 break; 1430 } 1431 1432 /* 1433 * keep track of original prio settings in case we have to temporarily 1434 * elevate the priority of this queue 1435 */ 1436 cfqq->org_ioprio = cfqq->ioprio; 1437 cfqq->org_ioprio_class = cfqq->ioprio_class; 1438 1439 cfq_resort_rr_list(cfqq, 0); 1440 cfq_clear_cfqq_prio_changed(cfqq); 1441} 1442 1443static inline void changed_ioprio(struct cfq_io_context *cic) 1444{ 1445 struct cfq_data *cfqd = cic->key; 1446 struct cfq_queue *cfqq; 1447 unsigned long flags; 1448 1449 if (unlikely(!cfqd)) 1450 return; 1451 1452 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1453 1454 cfqq = cic->cfqq[ASYNC]; 1455 if (cfqq) { 1456 struct cfq_queue *new_cfqq; 1457 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task, 1458 GFP_ATOMIC); 1459 if (new_cfqq) { 1460 cic->cfqq[ASYNC] = new_cfqq; 1461 cfq_put_queue(cfqq); 1462 } 1463 } 1464 1465 cfqq = cic->cfqq[SYNC]; 1466 if (cfqq) 1467 cfq_mark_cfqq_prio_changed(cfqq); 1468 1469 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1470} 1471 1472static void cfq_ioc_set_ioprio(struct io_context *ioc) 1473{ 1474 struct cfq_io_context *cic; 1475 struct rb_node *n; 1476 1477 ioc->ioprio_changed = 0; 1478 1479 n = rb_first(&ioc->cic_root); 1480 while (n != NULL) { 1481 cic = rb_entry(n, struct cfq_io_context, rb_node); 1482 1483 changed_ioprio(cic); 1484 n = rb_next(n); 1485 } 1486} 1487 1488static struct cfq_queue * 1489cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, 1490 gfp_t gfp_mask) 1491{ 1492 const int hashval = hash_long(key, CFQ_QHASH_SHIFT); 1493 struct cfq_queue *cfqq, *new_cfqq = NULL; 1494 unsigned short ioprio; 1495 1496retry: 1497 ioprio = tsk->ioprio; 1498 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); 1499 1500 if (!cfqq) { 1501 if (new_cfqq) { 1502 cfqq = new_cfqq; 1503 new_cfqq = NULL; 1504 } else if (gfp_mask & __GFP_WAIT) { 1505 /* 1506 * Inform the allocator of the fact that we will 1507 * just repeat this allocation if it fails, to allow 1508 * the allocator to do whatever it needs to attempt to 1509 * free memory. 1510 */ 1511 spin_unlock_irq(cfqd->queue->queue_lock); 1512 new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node); 1513 spin_lock_irq(cfqd->queue->queue_lock); 1514 goto retry; 1515 } else { 1516 cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node); 1517 if (!cfqq) 1518 goto out; 1519 } 1520 1521 memset(cfqq, 0, sizeof(*cfqq)); 1522 1523 INIT_HLIST_NODE(&cfqq->cfq_hash); 1524 INIT_LIST_HEAD(&cfqq->cfq_list); 1525 INIT_LIST_HEAD(&cfqq->fifo); 1526 1527 cfqq->key = key; 1528 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); 1529 atomic_set(&cfqq->ref, 0); 1530 cfqq->cfqd = cfqd; 1531 1532 if (key != CFQ_KEY_ASYNC) 1533 cfq_mark_cfqq_idle_window(cfqq); 1534 1535 cfq_mark_cfqq_prio_changed(cfqq); 1536 cfq_mark_cfqq_queue_new(cfqq); 1537 cfq_init_prio_data(cfqq); 1538 } 1539 1540 if (new_cfqq) 1541 kmem_cache_free(cfq_pool, new_cfqq); 1542 1543 atomic_inc(&cfqq->ref); 1544out: 1545 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); 1546 return cfqq; 1547} 1548 1549static void 1550cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) 1551{ 1552 WARN_ON(!list_empty(&cic->queue_list)); 1553 rb_erase(&cic->rb_node, &ioc->cic_root); 1554 kmem_cache_free(cfq_ioc_pool, cic); 1555 elv_ioc_count_dec(ioc_count); 1556} 1557 1558static struct cfq_io_context * 1559cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) 1560{ 1561 struct rb_node *n; 1562 struct cfq_io_context *cic; 1563 void *k, *key = cfqd; 1564 1565restart: 1566 n = ioc->cic_root.rb_node; 1567 while (n) { 1568 cic = rb_entry(n, struct cfq_io_context, rb_node); 1569 /* ->key must be copied to avoid race with cfq_exit_queue() */ 1570 k = cic->key; 1571 if (unlikely(!k)) { 1572 cfq_drop_dead_cic(ioc, cic); 1573 goto restart; 1574 } 1575 1576 if (key < k) 1577 n = n->rb_left; 1578 else if (key > k) 1579 n = n->rb_right; 1580 else 1581 return cic; 1582 } 1583 1584 return NULL; 1585} 1586 1587static inline void 1588cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 1589 struct cfq_io_context *cic) 1590{ 1591 struct rb_node **p; 1592 struct rb_node *parent; 1593 struct cfq_io_context *__cic; 1594 unsigned long flags; 1595 void *k; 1596 1597 cic->ioc = ioc; 1598 cic->key = cfqd; 1599 1600restart: 1601 parent = NULL; 1602 p = &ioc->cic_root.rb_node; 1603 while (*p) { 1604 parent = *p; 1605 __cic = rb_entry(parent, struct cfq_io_context, rb_node); 1606 /* ->key must be copied to avoid race with cfq_exit_queue() */ 1607 k = __cic->key; 1608 if (unlikely(!k)) { 1609 cfq_drop_dead_cic(ioc, __cic); 1610 goto restart; 1611 } 1612 1613 if (cic->key < k) 1614 p = &(*p)->rb_left; 1615 else if (cic->key > k) 1616 p = &(*p)->rb_right; 1617 else 1618 BUG(); 1619 } 1620 1621 rb_link_node(&cic->rb_node, parent, p); 1622 rb_insert_color(&cic->rb_node, &ioc->cic_root); 1623 1624 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1625 list_add(&cic->queue_list, &cfqd->cic_list); 1626 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1627} 1628 1629/* 1630 * Setup general io context and cfq io context. There can be several cfq 1631 * io contexts per general io context, if this process is doing io to more 1632 * than one device managed by cfq. 1633 */ 1634static struct cfq_io_context * 1635cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1636{ 1637 struct io_context *ioc = NULL; 1638 struct cfq_io_context *cic; 1639 1640 might_sleep_if(gfp_mask & __GFP_WAIT); 1641 1642 ioc = get_io_context(gfp_mask, cfqd->queue->node); 1643 if (!ioc) 1644 return NULL; 1645 1646 cic = cfq_cic_rb_lookup(cfqd, ioc); 1647 if (cic) 1648 goto out; 1649 1650 cic = cfq_alloc_io_context(cfqd, gfp_mask); 1651 if (cic == NULL) 1652 goto err; 1653 1654 cfq_cic_link(cfqd, ioc, cic); 1655out: 1656 smp_read_barrier_depends(); 1657 if (unlikely(ioc->ioprio_changed)) 1658 cfq_ioc_set_ioprio(ioc); 1659 1660 return cic; 1661err: 1662 put_io_context(ioc); 1663 return NULL; 1664} 1665 1666static void 1667cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 1668{ 1669 unsigned long elapsed = jiffies - cic->last_end_request; 1670 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 1671 1672 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 1673 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 1674 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 1675} 1676 1677static void 1678cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 1679 struct request *rq) 1680{ 1681 sector_t sdist; 1682 u64 total; 1683 1684 if (cic->last_request_pos < rq->sector) 1685 sdist = rq->sector - cic->last_request_pos; 1686 else 1687 sdist = cic->last_request_pos - rq->sector; 1688 1689 if (!cic->seek_samples) { 1690 cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 1691 cfqd->new_seek_mean = cfqd->new_seek_total / 256; 1692 } 1693 1694 /* 1695 * Don't allow the seek distance to get too large from the 1696 * odd fragment, pagein, etc 1697 */ 1698 if (cic->seek_samples <= 60) /* second&third seek */ 1699 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 1700 else 1701 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 1702 1703 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 1704 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 1705 total = cic->seek_total + (cic->seek_samples/2); 1706 do_div(total, cic->seek_samples); 1707 cic->seek_mean = (sector_t)total; 1708} 1709 1710/* 1711 * Disable idle window if the process thinks too long or seeks so much that 1712 * it doesn't matter 1713 */ 1714static void 1715cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1716 struct cfq_io_context *cic) 1717{ 1718 int enable_idle = cfq_cfqq_idle_window(cfqq); 1719 1720 if (!cic->ioc->task || !cfqd->cfq_slice_idle || 1721 (cfqd->hw_tag && CIC_SEEKY(cic))) 1722 enable_idle = 0; 1723 else if (sample_valid(cic->ttime_samples)) { 1724 if (cic->ttime_mean > cfqd->cfq_slice_idle) 1725 enable_idle = 0; 1726 else 1727 enable_idle = 1; 1728 } 1729 1730 if (enable_idle) 1731 cfq_mark_cfqq_idle_window(cfqq); 1732 else 1733 cfq_clear_cfqq_idle_window(cfqq); 1734} 1735 1736/* 1737 * Check if new_cfqq should preempt the currently active queue. Return 0 for 1738 * no or if we aren't sure, a 1 will cause a preempt. 1739 */ 1740static int 1741cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 1742 struct request *rq) 1743{ 1744 struct cfq_queue *cfqq; 1745 1746 cfqq = cfqd->active_queue; 1747 if (!cfqq) 1748 return 0; 1749 1750 if (cfq_slice_used(cfqq)) 1751 return 1; 1752 1753 if (cfq_class_idle(new_cfqq)) 1754 return 0; 1755 1756 if (cfq_class_idle(cfqq)) 1757 return 1; 1758 1759 /* 1760 * if the new request is sync, but the currently running queue is 1761 * not, let the sync request have priority. 1762 */ 1763 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 1764 return 1; 1765 1766 /* 1767 * So both queues are sync. Let the new request get disk time if 1768 * it's a metadata request and the current queue is doing regular IO. 1769 */ 1770 if (rq_is_meta(rq) && !cfqq->meta_pending) 1771 return 1; 1772 1773 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 1774 return 0; 1775 1776 /* 1777 * if this request is as-good as one we would expect from the 1778 * current cfqq, let it preempt 1779 */ 1780 if (cfq_rq_close(cfqd, rq)) 1781 return 1; 1782 1783 return 0; 1784} 1785 1786/* 1787 * cfqq preempts the active queue. if we allowed preempt with no slice left, 1788 * let it have half of its nominal slice. 1789 */ 1790static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1791{ 1792 cfq_slice_expired(cfqd, 1, 1); 1793 1794 /* 1795 * Put the new queue at the front of the of the current list, 1796 * so we know that it will be selected next. 1797 */ 1798 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1799 list_move(&cfqq->cfq_list, &cfqd->cur_rr); 1800 1801 cfqq->slice_end = 0; 1802 cfq_mark_cfqq_slice_new(cfqq); 1803} 1804 1805/* 1806 * Called when a new fs request (rq) is added (to cfqq). Check if there's 1807 * something we should do about it 1808 */ 1809static void 1810cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1811 struct request *rq) 1812{ 1813 struct cfq_io_context *cic = RQ_CIC(rq); 1814 1815 if (rq_is_meta(rq)) 1816 cfqq->meta_pending++; 1817 1818 cfq_update_io_thinktime(cfqd, cic); 1819 cfq_update_io_seektime(cfqd, cic, rq); 1820 cfq_update_idle_window(cfqd, cfqq, cic); 1821 1822 cic->last_request_pos = rq->sector + rq->nr_sectors; 1823 cfqq->last_request_pos = cic->last_request_pos; 1824 1825 if (cfqq == cfqd->active_queue) { 1826 /* 1827 * if we are waiting for a request for this queue, let it rip 1828 * immediately and flag that we must not expire this queue 1829 * just now 1830 */ 1831 if (cfq_cfqq_wait_request(cfqq)) { 1832 cfq_mark_cfqq_must_dispatch(cfqq); 1833 del_timer(&cfqd->idle_slice_timer); 1834 blk_start_queueing(cfqd->queue); 1835 } 1836 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 1837 /* 1838 * not the active queue - expire current slice if it is 1839 * idle and has expired it's mean thinktime or this new queue 1840 * has some old slice time left and is of higher priority 1841 */ 1842 cfq_preempt_queue(cfqd, cfqq); 1843 cfq_mark_cfqq_must_dispatch(cfqq); 1844 blk_start_queueing(cfqd->queue); 1845 } 1846} 1847 1848static void cfq_insert_request(request_queue_t *q, struct request *rq) 1849{ 1850 struct cfq_data *cfqd = q->elevator->elevator_data; 1851 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1852 1853 cfq_init_prio_data(cfqq); 1854 1855 cfq_add_rq_rb(rq); 1856 1857 list_add_tail(&rq->queuelist, &cfqq->fifo); 1858 1859 cfq_rq_enqueued(cfqd, cfqq, rq); 1860} 1861 1862static void cfq_completed_request(request_queue_t *q, struct request *rq) 1863{ 1864 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1865 struct cfq_data *cfqd = cfqq->cfqd; 1866 const int sync = rq_is_sync(rq); 1867 unsigned long now; 1868 1869 now = jiffies; 1870 1871 WARN_ON(!cfqd->rq_in_driver); 1872 WARN_ON(!cfqq->dispatched); 1873 cfqd->rq_in_driver--; 1874 cfqq->dispatched--; 1875 cfqq->service_last = now; 1876 1877 if (!cfq_class_idle(cfqq)) 1878 cfqd->last_end_request = now; 1879 1880 cfq_resort_rr_list(cfqq, 0); 1881 1882 if (sync) 1883 RQ_CIC(rq)->last_end_request = now; 1884 1885 /* 1886 * If this is the active queue, check if it needs to be expired, 1887 * or if we want to idle in case it has no pending requests. 1888 */ 1889 if (cfqd->active_queue == cfqq) { 1890 if (cfq_cfqq_slice_new(cfqq)) { 1891 cfq_set_prio_slice(cfqd, cfqq); 1892 cfq_clear_cfqq_slice_new(cfqq); 1893 } 1894 if (cfq_slice_used(cfqq)) 1895 cfq_slice_expired(cfqd, 0, 1); 1896 else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) 1897 cfq_arm_slice_timer(cfqd); 1898 } 1899 1900 if (!cfqd->rq_in_driver) 1901 cfq_schedule_dispatch(cfqd); 1902} 1903 1904/* 1905 * we temporarily boost lower priority queues if they are holding fs exclusive 1906 * resources. they are boosted to normal prio (CLASS_BE/4) 1907 */ 1908static void cfq_prio_boost(struct cfq_queue *cfqq) 1909{ 1910 const int ioprio_class = cfqq->ioprio_class; 1911 const int ioprio = cfqq->ioprio; 1912 1913 if (has_fs_excl()) { 1914 /* 1915 * boost idle prio on transactions that would lock out other 1916 * users of the filesystem 1917 */ 1918 if (cfq_class_idle(cfqq)) 1919 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1920 if (cfqq->ioprio > IOPRIO_NORM) 1921 cfqq->ioprio = IOPRIO_NORM; 1922 } else { 1923 /* 1924 * check if we need to unboost the queue 1925 */ 1926 if (cfqq->ioprio_class != cfqq->org_ioprio_class) 1927 cfqq->ioprio_class = cfqq->org_ioprio_class; 1928 if (cfqq->ioprio != cfqq->org_ioprio) 1929 cfqq->ioprio = cfqq->org_ioprio; 1930 } 1931 1932 /* 1933 * refile between round-robin lists if we moved the priority class 1934 */ 1935 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio)) 1936 cfq_resort_rr_list(cfqq, 0); 1937} 1938 1939static inline int __cfq_may_queue(struct cfq_queue *cfqq) 1940{ 1941 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && 1942 !cfq_cfqq_must_alloc_slice(cfqq)) { 1943 cfq_mark_cfqq_must_alloc_slice(cfqq); 1944 return ELV_MQUEUE_MUST; 1945 } 1946 1947 return ELV_MQUEUE_MAY; 1948} 1949 1950static int cfq_may_queue(request_queue_t *q, int rw) 1951{ 1952 struct cfq_data *cfqd = q->elevator->elevator_data; 1953 struct task_struct *tsk = current; 1954 struct cfq_queue *cfqq; 1955 unsigned int key; 1956 1957 key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC); 1958 1959 /* 1960 * don't force setup of a queue from here, as a call to may_queue 1961 * does not necessarily imply that a request actually will be queued. 1962 * so just lookup a possibly existing queue, or return 'may queue' 1963 * if that fails 1964 */ 1965 cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); 1966 if (cfqq) { 1967 cfq_init_prio_data(cfqq); 1968 cfq_prio_boost(cfqq); 1969 1970 return __cfq_may_queue(cfqq); 1971 } 1972 1973 return ELV_MQUEUE_MAY; 1974} 1975 1976/* 1977 * queue lock held here 1978 */ 1979static void cfq_put_request(struct request *rq) 1980{ 1981 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1982 1983 if (cfqq) { 1984 const int rw = rq_data_dir(rq); 1985 1986 BUG_ON(!cfqq->allocated[rw]); 1987 cfqq->allocated[rw]--; 1988 1989 put_io_context(RQ_CIC(rq)->ioc); 1990 1991 rq->elevator_private = NULL; 1992 rq->elevator_private2 = NULL; 1993 1994 cfq_put_queue(cfqq); 1995 } 1996} 1997 1998/* 1999 * Allocate cfq data structures associated with this request. 2000 */ 2001static int 2002cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) 2003{ 2004 struct cfq_data *cfqd = q->elevator->elevator_data; 2005 struct task_struct *tsk = current; 2006 struct cfq_io_context *cic; 2007 const int rw = rq_data_dir(rq); 2008 const int is_sync = rq_is_sync(rq); 2009 pid_t key = cfq_queue_pid(tsk, rw, is_sync); 2010 struct cfq_queue *cfqq; 2011 unsigned long flags; 2012 2013 might_sleep_if(gfp_mask & __GFP_WAIT); 2014 2015 cic = cfq_get_io_context(cfqd, gfp_mask); 2016 2017 spin_lock_irqsave(q->queue_lock, flags); 2018 2019 if (!cic) 2020 goto queue_fail; 2021 2022 if (!cic->cfqq[is_sync]) { 2023 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask); 2024 if (!cfqq) 2025 goto queue_fail; 2026 2027 cic->cfqq[is_sync] = cfqq; 2028 } else 2029 cfqq = cic->cfqq[is_sync]; 2030 2031 cfqq->allocated[rw]++; 2032 cfq_clear_cfqq_must_alloc(cfqq); 2033 atomic_inc(&cfqq->ref); 2034 2035 spin_unlock_irqrestore(q->queue_lock, flags); 2036 2037 rq->elevator_private = cic; 2038 rq->elevator_private2 = cfqq; 2039 return 0; 2040 2041queue_fail: 2042 if (cic) 2043 put_io_context(cic->ioc); 2044 2045 cfq_schedule_dispatch(cfqd); 2046 spin_unlock_irqrestore(q->queue_lock, flags); 2047 return 1; 2048} 2049 2050static void cfq_kick_queue(struct work_struct *work) 2051{ 2052 struct cfq_data *cfqd = 2053 container_of(work, struct cfq_data, unplug_work); 2054 request_queue_t *q = cfqd->queue; 2055 unsigned long flags; 2056 2057 spin_lock_irqsave(q->queue_lock, flags); 2058 blk_start_queueing(q); 2059 spin_unlock_irqrestore(q->queue_lock, flags); 2060} 2061 2062/* 2063 * Timer running if the active_queue is currently idling inside its time slice 2064 */ 2065static void cfq_idle_slice_timer(unsigned long data) 2066{ 2067 struct cfq_data *cfqd = (struct cfq_data *) data; 2068 struct cfq_queue *cfqq; 2069 unsigned long flags; 2070 int timed_out = 1; 2071 2072 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2073 2074 if ((cfqq = cfqd->active_queue) != NULL) { 2075 timed_out = 0; 2076 2077 /* 2078 * expired 2079 */ 2080 if (cfq_slice_used(cfqq)) 2081 goto expire; 2082 2083 /* 2084 * only expire and reinvoke request handler, if there are 2085 * other queues with pending requests 2086 */ 2087 if (!cfqd->busy_queues) 2088 goto out_cont; 2089 2090 /* 2091 * not expired and it has a request pending, let it dispatch 2092 */ 2093 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) { 2094 cfq_mark_cfqq_must_dispatch(cfqq); 2095 goto out_kick; 2096 } 2097 } 2098expire: 2099 cfq_slice_expired(cfqd, 0, timed_out); 2100out_kick: 2101 cfq_schedule_dispatch(cfqd); 2102out_cont: 2103 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2104} 2105 2106/* 2107 * Timer running if an idle class queue is waiting for service 2108 */ 2109static void cfq_idle_class_timer(unsigned long data) 2110{ 2111 struct cfq_data *cfqd = (struct cfq_data *) data; 2112 unsigned long flags, end; 2113 2114 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2115 2116 /* 2117 * race with a non-idle queue, reset timer 2118 */ 2119 end = cfqd->last_end_request + CFQ_IDLE_GRACE; 2120 if (!time_after_eq(jiffies, end)) 2121 mod_timer(&cfqd->idle_class_timer, end); 2122 else 2123 cfq_schedule_dispatch(cfqd); 2124 2125 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2126} 2127 2128static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 2129{ 2130 del_timer_sync(&cfqd->idle_slice_timer); 2131 del_timer_sync(&cfqd->idle_class_timer); 2132 blk_sync_queue(cfqd->queue); 2133} 2134 2135static void cfq_exit_queue(elevator_t *e) 2136{ 2137 struct cfq_data *cfqd = e->elevator_data; 2138 request_queue_t *q = cfqd->queue; 2139 2140 cfq_shutdown_timer_wq(cfqd); 2141 2142 spin_lock_irq(q->queue_lock); 2143 2144 if (cfqd->active_queue) 2145 __cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0); 2146 2147 while (!list_empty(&cfqd->cic_list)) { 2148 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 2149 struct cfq_io_context, 2150 queue_list); 2151 2152 __cfq_exit_single_io_context(cfqd, cic); 2153 } 2154 2155 spin_unlock_irq(q->queue_lock); 2156 2157 cfq_shutdown_timer_wq(cfqd); 2158 2159 kfree(cfqd->cfq_hash); 2160 kfree(cfqd); 2161} 2162 2163static void *cfq_init_queue(request_queue_t *q) 2164{ 2165 struct cfq_data *cfqd; 2166 int i; 2167 2168 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); 2169 if (!cfqd) 2170 return NULL; 2171 2172 memset(cfqd, 0, sizeof(*cfqd)); 2173 2174 for (i = 0; i < CFQ_PRIO_LISTS; i++) 2175 INIT_LIST_HEAD(&cfqd->rr_list[i]); 2176 2177 INIT_LIST_HEAD(&cfqd->busy_rr); 2178 INIT_LIST_HEAD(&cfqd->cur_rr); 2179 INIT_LIST_HEAD(&cfqd->idle_rr); 2180 INIT_LIST_HEAD(&cfqd->cic_list); 2181 2182 cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node); 2183 if (!cfqd->cfq_hash) 2184 goto out_free; 2185 2186 for (i = 0; i < CFQ_QHASH_ENTRIES; i++) 2187 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); 2188 2189 cfqd->queue = q; 2190 2191 init_timer(&cfqd->idle_slice_timer); 2192 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 2193 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 2194 2195 init_timer(&cfqd->idle_class_timer); 2196 cfqd->idle_class_timer.function = cfq_idle_class_timer; 2197 cfqd->idle_class_timer.data = (unsigned long) cfqd; 2198 2199 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 2200 2201 cfqd->cfq_quantum = cfq_quantum; 2202 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 2203 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 2204 cfqd->cfq_back_max = cfq_back_max; 2205 cfqd->cfq_back_penalty = cfq_back_penalty; 2206 cfqd->cfq_slice[0] = cfq_slice_async; 2207 cfqd->cfq_slice[1] = cfq_slice_sync; 2208 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 2209 cfqd->cfq_slice_idle = cfq_slice_idle; 2210 2211 return cfqd; 2212out_free: 2213 kfree(cfqd); 2214 return NULL; 2215} 2216 2217static void cfq_slab_kill(void) 2218{ 2219 if (cfq_pool) 2220 kmem_cache_destroy(cfq_pool); 2221 if (cfq_ioc_pool) 2222 kmem_cache_destroy(cfq_ioc_pool); 2223} 2224 2225static int __init cfq_slab_setup(void) 2226{ 2227 cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, 2228 NULL, NULL); 2229 if (!cfq_pool) 2230 goto fail; 2231 2232 cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool", 2233 sizeof(struct cfq_io_context), 0, 0, NULL, NULL); 2234 if (!cfq_ioc_pool) 2235 goto fail; 2236 2237 return 0; 2238fail: 2239 cfq_slab_kill(); 2240 return -ENOMEM; 2241} 2242 2243/* 2244 * sysfs parts below --> 2245 */ 2246static ssize_t 2247cfq_var_show(unsigned int var, char *page) 2248{ 2249 return sprintf(page, "%d\n", var); 2250} 2251 2252static ssize_t 2253cfq_var_store(unsigned int *var, const char *page, size_t count) 2254{ 2255 char *p = (char *) page; 2256 2257 *var = simple_strtoul(p, &p, 10); 2258 return count; 2259} 2260 2261#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 2262static ssize_t __FUNC(elevator_t *e, char *page) \ 2263{ \ 2264 struct cfq_data *cfqd = e->elevator_data; \ 2265 unsigned int __data = __VAR; \ 2266 if (__CONV) \ 2267 __data = jiffies_to_msecs(__data); \ 2268 return cfq_var_show(__data, (page)); \ 2269} 2270SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 2271SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 2272SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 2273SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 2274SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 2275SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 2276SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 2277SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 2278SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 2279#undef SHOW_FUNCTION 2280 2281#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 2282static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ 2283{ \ 2284 struct cfq_data *cfqd = e->elevator_data; \ 2285 unsigned int __data; \ 2286 int ret = cfq_var_store(&__data, (page), count); \ 2287 if (__data < (MIN)) \ 2288 __data = (MIN); \ 2289 else if (__data > (MAX)) \ 2290 __data = (MAX); \ 2291 if (__CONV) \ 2292 *(__PTR) = msecs_to_jiffies(__data); \ 2293 else \ 2294 *(__PTR) = __data; \ 2295 return ret; \ 2296} 2297STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 2298STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); 2299STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); 2300STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 2301STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); 2302STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 2303STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 2304STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 2305STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); 2306#undef STORE_FUNCTION 2307 2308#define CFQ_ATTR(name) \ 2309 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 2310 2311static struct elv_fs_entry cfq_attrs[] = { 2312 CFQ_ATTR(quantum), 2313 CFQ_ATTR(fifo_expire_sync), 2314 CFQ_ATTR(fifo_expire_async), 2315 CFQ_ATTR(back_seek_max), 2316 CFQ_ATTR(back_seek_penalty), 2317 CFQ_ATTR(slice_sync), 2318 CFQ_ATTR(slice_async), 2319 CFQ_ATTR(slice_async_rq), 2320 CFQ_ATTR(slice_idle), 2321 __ATTR_NULL 2322}; 2323 2324static struct elevator_type iosched_cfq = { 2325 .ops = { 2326 .elevator_merge_fn = cfq_merge, 2327 .elevator_merged_fn = cfq_merged_request, 2328 .elevator_merge_req_fn = cfq_merged_requests, 2329 .elevator_allow_merge_fn = cfq_allow_merge, 2330 .elevator_dispatch_fn = cfq_dispatch_requests, 2331 .elevator_add_req_fn = cfq_insert_request, 2332 .elevator_activate_req_fn = cfq_activate_request, 2333 .elevator_deactivate_req_fn = cfq_deactivate_request, 2334 .elevator_queue_empty_fn = cfq_queue_empty, 2335 .elevator_completed_req_fn = cfq_completed_request, 2336 .elevator_former_req_fn = elv_rb_former_request, 2337 .elevator_latter_req_fn = elv_rb_latter_request, 2338 .elevator_set_req_fn = cfq_set_request, 2339 .elevator_put_req_fn = cfq_put_request, 2340 .elevator_may_queue_fn = cfq_may_queue, 2341 .elevator_init_fn = cfq_init_queue, 2342 .elevator_exit_fn = cfq_exit_queue, 2343 .trim = cfq_free_io_context, 2344 }, 2345 .elevator_attrs = cfq_attrs, 2346 .elevator_name = "cfq", 2347 .elevator_owner = THIS_MODULE, 2348}; 2349 2350static int __init cfq_init(void) 2351{ 2352 int ret; 2353 2354 /* 2355 * could be 0 on HZ < 1000 setups 2356 */ 2357 if (!cfq_slice_async) 2358 cfq_slice_async = 1; 2359 if (!cfq_slice_idle) 2360 cfq_slice_idle = 1; 2361 2362 if (cfq_slab_setup()) 2363 return -ENOMEM; 2364 2365 ret = elv_register(&iosched_cfq); 2366 if (ret) 2367 cfq_slab_kill(); 2368 2369 return ret; 2370} 2371 2372static void __exit cfq_exit(void) 2373{ 2374 DECLARE_COMPLETION_ONSTACK(all_gone); 2375 elv_unregister(&iosched_cfq); 2376 ioc_gone = &all_gone; 2377 /* ioc_gone's update must be visible before reading ioc_count */ 2378 smp_wmb(); 2379 if (elv_ioc_count_read(ioc_count)) 2380 wait_for_completion(ioc_gone); 2381 synchronize_rcu(); 2382 cfq_slab_kill(); 2383} 2384 2385module_init(cfq_init); 2386module_exit(cfq_exit); 2387 2388MODULE_AUTHOR("Jens Axboe"); 2389MODULE_LICENSE("GPL"); 2390MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 2391