cfq-iosched.c revision bc1c116974a5c3f498112a6f175d3e4a8cd5bdbc
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@suse.de> 8 */ 9#include <linux/config.h> 10#include <linux/module.h> 11#include <linux/blkdev.h> 12#include <linux/elevator.h> 13#include <linux/hash.h> 14#include <linux/rbtree.h> 15#include <linux/ioprio.h> 16 17/* 18 * tunables 19 */ 20static const int cfq_quantum = 4; /* max queue in one round of service */ 21static const int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ 22static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 23static const int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ 24static const int cfq_back_penalty = 2; /* penalty of a backwards seek */ 25 26static const int cfq_slice_sync = HZ / 10; 27static int cfq_slice_async = HZ / 25; 28static const int cfq_slice_async_rq = 2; 29static int cfq_slice_idle = HZ / 70; 30 31#define CFQ_IDLE_GRACE (HZ / 10) 32#define CFQ_SLICE_SCALE (5) 33 34#define CFQ_KEY_ASYNC (0) 35 36static DEFINE_SPINLOCK(cfq_exit_lock); 37 38/* 39 * for the hash of cfqq inside the cfqd 40 */ 41#define CFQ_QHASH_SHIFT 6 42#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) 43#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) 44 45/* 46 * for the hash of crq inside the cfqq 47 */ 48#define CFQ_MHASH_SHIFT 6 49#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) 50#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) 51#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT) 52#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) 53#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) 54 55#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) 56#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) 57 58#define RQ_DATA(rq) (rq)->elevator_private 59 60/* 61 * rb-tree defines 62 */ 63#define RB_NONE (2) 64#define RB_EMPTY(node) ((node)->rb_node == NULL) 65#define RB_CLEAR_COLOR(node) (node)->rb_color = RB_NONE 66#define RB_CLEAR(node) do { \ 67 (node)->rb_parent = NULL; \ 68 RB_CLEAR_COLOR((node)); \ 69 (node)->rb_right = NULL; \ 70 (node)->rb_left = NULL; \ 71} while (0) 72#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) 73#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) 74#define rq_rb_key(rq) (rq)->sector 75 76static kmem_cache_t *crq_pool; 77static kmem_cache_t *cfq_pool; 78static kmem_cache_t *cfq_ioc_pool; 79 80static atomic_t ioc_count = ATOMIC_INIT(0); 81static struct completion *ioc_gone; 82 83#define CFQ_PRIO_LISTS IOPRIO_BE_NR 84#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 85#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) 86#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 87 88#define ASYNC (0) 89#define SYNC (1) 90 91#define cfq_cfqq_dispatched(cfqq) \ 92 ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC]) 93 94#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) 95 96#define cfq_cfqq_sync(cfqq) \ 97 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) 98 99#define sample_valid(samples) ((samples) > 80) 100 101/* 102 * Per block device queue structure 103 */ 104struct cfq_data { 105 request_queue_t *queue; 106 107 /* 108 * rr list of queues with requests and the count of them 109 */ 110 struct list_head rr_list[CFQ_PRIO_LISTS]; 111 struct list_head busy_rr; 112 struct list_head cur_rr; 113 struct list_head idle_rr; 114 unsigned int busy_queues; 115 116 /* 117 * non-ordered list of empty cfqq's 118 */ 119 struct list_head empty_list; 120 121 /* 122 * cfqq lookup hash 123 */ 124 struct hlist_head *cfq_hash; 125 126 /* 127 * global crq hash for all queues 128 */ 129 struct hlist_head *crq_hash; 130 131 unsigned int max_queued; 132 133 mempool_t *crq_pool; 134 135 int rq_in_driver; 136 int hw_tag; 137 138 /* 139 * schedule slice state info 140 */ 141 /* 142 * idle window management 143 */ 144 struct timer_list idle_slice_timer; 145 struct work_struct unplug_work; 146 147 struct cfq_queue *active_queue; 148 struct cfq_io_context *active_cic; 149 int cur_prio, cur_end_prio; 150 unsigned int dispatch_slice; 151 152 struct timer_list idle_class_timer; 153 154 sector_t last_sector; 155 unsigned long last_end_request; 156 157 unsigned int rq_starved; 158 159 /* 160 * tunables, see top of file 161 */ 162 unsigned int cfq_quantum; 163 unsigned int cfq_queued; 164 unsigned int cfq_fifo_expire[2]; 165 unsigned int cfq_back_penalty; 166 unsigned int cfq_back_max; 167 unsigned int cfq_slice[2]; 168 unsigned int cfq_slice_async_rq; 169 unsigned int cfq_slice_idle; 170 171 struct list_head cic_list; 172}; 173 174/* 175 * Per process-grouping structure 176 */ 177struct cfq_queue { 178 /* reference count */ 179 atomic_t ref; 180 /* parent cfq_data */ 181 struct cfq_data *cfqd; 182 /* cfqq lookup hash */ 183 struct hlist_node cfq_hash; 184 /* hash key */ 185 unsigned int key; 186 /* on either rr or empty list of cfqd */ 187 struct list_head cfq_list; 188 /* sorted list of pending requests */ 189 struct rb_root sort_list; 190 /* if fifo isn't expired, next request to serve */ 191 struct cfq_rq *next_crq; 192 /* requests queued in sort_list */ 193 int queued[2]; 194 /* currently allocated requests */ 195 int allocated[2]; 196 /* fifo list of requests in sort_list */ 197 struct list_head fifo; 198 199 unsigned long slice_start; 200 unsigned long slice_end; 201 unsigned long slice_left; 202 unsigned long service_last; 203 204 /* number of requests that are on the dispatch list */ 205 int on_dispatch[2]; 206 207 /* io prio of this group */ 208 unsigned short ioprio, org_ioprio; 209 unsigned short ioprio_class, org_ioprio_class; 210 211 /* various state flags, see below */ 212 unsigned int flags; 213}; 214 215struct cfq_rq { 216 struct rb_node rb_node; 217 sector_t rb_key; 218 struct request *request; 219 struct hlist_node hash; 220 221 struct cfq_queue *cfq_queue; 222 struct cfq_io_context *io_context; 223 224 unsigned int crq_flags; 225}; 226 227enum cfqq_state_flags { 228 CFQ_CFQQ_FLAG_on_rr = 0, 229 CFQ_CFQQ_FLAG_wait_request, 230 CFQ_CFQQ_FLAG_must_alloc, 231 CFQ_CFQQ_FLAG_must_alloc_slice, 232 CFQ_CFQQ_FLAG_must_dispatch, 233 CFQ_CFQQ_FLAG_fifo_expire, 234 CFQ_CFQQ_FLAG_idle_window, 235 CFQ_CFQQ_FLAG_prio_changed, 236}; 237 238#define CFQ_CFQQ_FNS(name) \ 239static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 240{ \ 241 cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 242} \ 243static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 244{ \ 245 cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 246} \ 247static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 248{ \ 249 return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 250} 251 252CFQ_CFQQ_FNS(on_rr); 253CFQ_CFQQ_FNS(wait_request); 254CFQ_CFQQ_FNS(must_alloc); 255CFQ_CFQQ_FNS(must_alloc_slice); 256CFQ_CFQQ_FNS(must_dispatch); 257CFQ_CFQQ_FNS(fifo_expire); 258CFQ_CFQQ_FNS(idle_window); 259CFQ_CFQQ_FNS(prio_changed); 260#undef CFQ_CFQQ_FNS 261 262enum cfq_rq_state_flags { 263 CFQ_CRQ_FLAG_is_sync = 0, 264}; 265 266#define CFQ_CRQ_FNS(name) \ 267static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \ 268{ \ 269 crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \ 270} \ 271static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \ 272{ \ 273 crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \ 274} \ 275static inline int cfq_crq_##name(const struct cfq_rq *crq) \ 276{ \ 277 return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \ 278} 279 280CFQ_CRQ_FNS(is_sync); 281#undef CFQ_CRQ_FNS 282 283static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); 284static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); 285static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); 286 287#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) 288 289/* 290 * lots of deadline iosched dupes, can be abstracted later... 291 */ 292static inline void cfq_del_crq_hash(struct cfq_rq *crq) 293{ 294 hlist_del_init(&crq->hash); 295} 296 297static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) 298{ 299 const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); 300 301 hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); 302} 303 304static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) 305{ 306 struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; 307 struct hlist_node *entry, *next; 308 309 hlist_for_each_safe(entry, next, hash_list) { 310 struct cfq_rq *crq = list_entry_hash(entry); 311 struct request *__rq = crq->request; 312 313 if (!rq_mergeable(__rq)) { 314 cfq_del_crq_hash(crq); 315 continue; 316 } 317 318 if (rq_hash_key(__rq) == offset) 319 return __rq; 320 } 321 322 return NULL; 323} 324 325/* 326 * scheduler run of queue, if there are requests pending and no one in the 327 * driver that will restart queueing 328 */ 329static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 330{ 331 if (cfqd->busy_queues) 332 kblockd_schedule_work(&cfqd->unplug_work); 333} 334 335static int cfq_queue_empty(request_queue_t *q) 336{ 337 struct cfq_data *cfqd = q->elevator->elevator_data; 338 339 return !cfqd->busy_queues; 340} 341 342static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) 343{ 344 if (rw == READ || process_sync(task)) 345 return task->pid; 346 347 return CFQ_KEY_ASYNC; 348} 349 350/* 351 * Lifted from AS - choose which of crq1 and crq2 that is best served now. 352 * We choose the request that is closest to the head right now. Distance 353 * behind the head is penalized and only allowed to a certain extent. 354 */ 355static struct cfq_rq * 356cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) 357{ 358 sector_t last, s1, s2, d1 = 0, d2 = 0; 359 unsigned long back_max; 360#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 361#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 362 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 363 364 if (crq1 == NULL || crq1 == crq2) 365 return crq2; 366 if (crq2 == NULL) 367 return crq1; 368 369 if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2)) 370 return crq1; 371 else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1)) 372 return crq2; 373 374 s1 = crq1->request->sector; 375 s2 = crq2->request->sector; 376 377 last = cfqd->last_sector; 378 379 /* 380 * by definition, 1KiB is 2 sectors 381 */ 382 back_max = cfqd->cfq_back_max * 2; 383 384 /* 385 * Strict one way elevator _except_ in the case where we allow 386 * short backward seeks which are biased as twice the cost of a 387 * similar forward seek. 388 */ 389 if (s1 >= last) 390 d1 = s1 - last; 391 else if (s1 + back_max >= last) 392 d1 = (last - s1) * cfqd->cfq_back_penalty; 393 else 394 wrap |= CFQ_RQ1_WRAP; 395 396 if (s2 >= last) 397 d2 = s2 - last; 398 else if (s2 + back_max >= last) 399 d2 = (last - s2) * cfqd->cfq_back_penalty; 400 else 401 wrap |= CFQ_RQ2_WRAP; 402 403 /* Found required data */ 404 405 /* 406 * By doing switch() on the bit mask "wrap" we avoid having to 407 * check two variables for all permutations: --> faster! 408 */ 409 switch (wrap) { 410 case 0: /* common case for CFQ: crq1 and crq2 not wrapped */ 411 if (d1 < d2) 412 return crq1; 413 else if (d2 < d1) 414 return crq2; 415 else { 416 if (s1 >= s2) 417 return crq1; 418 else 419 return crq2; 420 } 421 422 case CFQ_RQ2_WRAP: 423 return crq1; 424 case CFQ_RQ1_WRAP: 425 return crq2; 426 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */ 427 default: 428 /* 429 * Since both rqs are wrapped, 430 * start with the one that's further behind head 431 * (--> only *one* back seek required), 432 * since back seek takes more time than forward. 433 */ 434 if (s1 <= s2) 435 return crq1; 436 else 437 return crq2; 438 } 439} 440 441/* 442 * would be nice to take fifo expire time into account as well 443 */ 444static struct cfq_rq * 445cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 446 struct cfq_rq *last) 447{ 448 struct cfq_rq *crq_next = NULL, *crq_prev = NULL; 449 struct rb_node *rbnext, *rbprev; 450 451 if (!(rbnext = rb_next(&last->rb_node))) { 452 rbnext = rb_first(&cfqq->sort_list); 453 if (rbnext == &last->rb_node) 454 rbnext = NULL; 455 } 456 457 rbprev = rb_prev(&last->rb_node); 458 459 if (rbprev) 460 crq_prev = rb_entry_crq(rbprev); 461 if (rbnext) 462 crq_next = rb_entry_crq(rbnext); 463 464 return cfq_choose_req(cfqd, crq_next, crq_prev); 465} 466 467static void cfq_update_next_crq(struct cfq_rq *crq) 468{ 469 struct cfq_queue *cfqq = crq->cfq_queue; 470 471 if (cfqq->next_crq == crq) 472 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); 473} 474 475static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) 476{ 477 struct cfq_data *cfqd = cfqq->cfqd; 478 struct list_head *list, *entry; 479 480 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 481 482 list_del(&cfqq->cfq_list); 483 484 if (cfq_class_rt(cfqq)) 485 list = &cfqd->cur_rr; 486 else if (cfq_class_idle(cfqq)) 487 list = &cfqd->idle_rr; 488 else { 489 /* 490 * if cfqq has requests in flight, don't allow it to be 491 * found in cfq_set_active_queue before it has finished them. 492 * this is done to increase fairness between a process that 493 * has lots of io pending vs one that only generates one 494 * sporadically or synchronously 495 */ 496 if (cfq_cfqq_dispatched(cfqq)) 497 list = &cfqd->busy_rr; 498 else 499 list = &cfqd->rr_list[cfqq->ioprio]; 500 } 501 502 /* 503 * if queue was preempted, just add to front to be fair. busy_rr 504 * isn't sorted, but insert at the back for fairness. 505 */ 506 if (preempted || list == &cfqd->busy_rr) { 507 if (preempted) 508 list = list->prev; 509 510 list_add_tail(&cfqq->cfq_list, list); 511 return; 512 } 513 514 /* 515 * sort by when queue was last serviced 516 */ 517 entry = list; 518 while ((entry = entry->prev) != list) { 519 struct cfq_queue *__cfqq = list_entry_cfqq(entry); 520 521 if (!__cfqq->service_last) 522 break; 523 if (time_before(__cfqq->service_last, cfqq->service_last)) 524 break; 525 } 526 527 list_add(&cfqq->cfq_list, entry); 528} 529 530/* 531 * add to busy list of queues for service, trying to be fair in ordering 532 * the pending list according to last request service 533 */ 534static inline void 535cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 536{ 537 BUG_ON(cfq_cfqq_on_rr(cfqq)); 538 cfq_mark_cfqq_on_rr(cfqq); 539 cfqd->busy_queues++; 540 541 cfq_resort_rr_list(cfqq, 0); 542} 543 544static inline void 545cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 546{ 547 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 548 cfq_clear_cfqq_on_rr(cfqq); 549 list_move(&cfqq->cfq_list, &cfqd->empty_list); 550 551 BUG_ON(!cfqd->busy_queues); 552 cfqd->busy_queues--; 553} 554 555/* 556 * rb tree support functions 557 */ 558static inline void cfq_del_crq_rb(struct cfq_rq *crq) 559{ 560 struct cfq_queue *cfqq = crq->cfq_queue; 561 struct cfq_data *cfqd = cfqq->cfqd; 562 const int sync = cfq_crq_is_sync(crq); 563 564 BUG_ON(!cfqq->queued[sync]); 565 cfqq->queued[sync]--; 566 567 cfq_update_next_crq(crq); 568 569 rb_erase(&crq->rb_node, &cfqq->sort_list); 570 RB_CLEAR_COLOR(&crq->rb_node); 571 572 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list)) 573 cfq_del_cfqq_rr(cfqd, cfqq); 574} 575 576static struct cfq_rq * 577__cfq_add_crq_rb(struct cfq_rq *crq) 578{ 579 struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; 580 struct rb_node *parent = NULL; 581 struct cfq_rq *__crq; 582 583 while (*p) { 584 parent = *p; 585 __crq = rb_entry_crq(parent); 586 587 if (crq->rb_key < __crq->rb_key) 588 p = &(*p)->rb_left; 589 else if (crq->rb_key > __crq->rb_key) 590 p = &(*p)->rb_right; 591 else 592 return __crq; 593 } 594 595 rb_link_node(&crq->rb_node, parent, p); 596 return NULL; 597} 598 599static void cfq_add_crq_rb(struct cfq_rq *crq) 600{ 601 struct cfq_queue *cfqq = crq->cfq_queue; 602 struct cfq_data *cfqd = cfqq->cfqd; 603 struct request *rq = crq->request; 604 struct cfq_rq *__alias; 605 606 crq->rb_key = rq_rb_key(rq); 607 cfqq->queued[cfq_crq_is_sync(crq)]++; 608 609 /* 610 * looks a little odd, but the first insert might return an alias. 611 * if that happens, put the alias on the dispatch list 612 */ 613 while ((__alias = __cfq_add_crq_rb(crq)) != NULL) 614 cfq_dispatch_insert(cfqd->queue, __alias); 615 616 rb_insert_color(&crq->rb_node, &cfqq->sort_list); 617 618 if (!cfq_cfqq_on_rr(cfqq)) 619 cfq_add_cfqq_rr(cfqd, cfqq); 620 621 /* 622 * check if this request is a better next-serve candidate 623 */ 624 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); 625} 626 627static inline void 628cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) 629{ 630 rb_erase(&crq->rb_node, &cfqq->sort_list); 631 cfqq->queued[cfq_crq_is_sync(crq)]--; 632 633 cfq_add_crq_rb(crq); 634} 635 636static struct request * 637cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 638{ 639 struct task_struct *tsk = current; 640 pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio)); 641 struct cfq_queue *cfqq; 642 struct rb_node *n; 643 sector_t sector; 644 645 cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); 646 if (!cfqq) 647 goto out; 648 649 sector = bio->bi_sector + bio_sectors(bio); 650 n = cfqq->sort_list.rb_node; 651 while (n) { 652 struct cfq_rq *crq = rb_entry_crq(n); 653 654 if (sector < crq->rb_key) 655 n = n->rb_left; 656 else if (sector > crq->rb_key) 657 n = n->rb_right; 658 else 659 return crq->request; 660 } 661 662out: 663 return NULL; 664} 665 666static void cfq_activate_request(request_queue_t *q, struct request *rq) 667{ 668 struct cfq_data *cfqd = q->elevator->elevator_data; 669 670 cfqd->rq_in_driver++; 671 672 /* 673 * If the depth is larger 1, it really could be queueing. But lets 674 * make the mark a little higher - idling could still be good for 675 * low queueing, and a low queueing number could also just indicate 676 * a SCSI mid layer like behaviour where limit+1 is often seen. 677 */ 678 if (!cfqd->hw_tag && cfqd->rq_in_driver > 4) 679 cfqd->hw_tag = 1; 680} 681 682static void cfq_deactivate_request(request_queue_t *q, struct request *rq) 683{ 684 struct cfq_data *cfqd = q->elevator->elevator_data; 685 686 WARN_ON(!cfqd->rq_in_driver); 687 cfqd->rq_in_driver--; 688} 689 690static void cfq_remove_request(struct request *rq) 691{ 692 struct cfq_rq *crq = RQ_DATA(rq); 693 694 list_del_init(&rq->queuelist); 695 cfq_del_crq_rb(crq); 696 cfq_del_crq_hash(crq); 697} 698 699static int 700cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) 701{ 702 struct cfq_data *cfqd = q->elevator->elevator_data; 703 struct request *__rq; 704 int ret; 705 706 __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); 707 if (__rq && elv_rq_merge_ok(__rq, bio)) { 708 ret = ELEVATOR_BACK_MERGE; 709 goto out; 710 } 711 712 __rq = cfq_find_rq_fmerge(cfqd, bio); 713 if (__rq && elv_rq_merge_ok(__rq, bio)) { 714 ret = ELEVATOR_FRONT_MERGE; 715 goto out; 716 } 717 718 return ELEVATOR_NO_MERGE; 719out: 720 *req = __rq; 721 return ret; 722} 723 724static void cfq_merged_request(request_queue_t *q, struct request *req) 725{ 726 struct cfq_data *cfqd = q->elevator->elevator_data; 727 struct cfq_rq *crq = RQ_DATA(req); 728 729 cfq_del_crq_hash(crq); 730 cfq_add_crq_hash(cfqd, crq); 731 732 if (rq_rb_key(req) != crq->rb_key) { 733 struct cfq_queue *cfqq = crq->cfq_queue; 734 735 cfq_update_next_crq(crq); 736 cfq_reposition_crq_rb(cfqq, crq); 737 } 738} 739 740static void 741cfq_merged_requests(request_queue_t *q, struct request *rq, 742 struct request *next) 743{ 744 cfq_merged_request(q, rq); 745 746 /* 747 * reposition in fifo if next is older than rq 748 */ 749 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 750 time_before(next->start_time, rq->start_time)) 751 list_move(&rq->queuelist, &next->queuelist); 752 753 cfq_remove_request(next); 754} 755 756static inline void 757__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 758{ 759 if (cfqq) { 760 /* 761 * stop potential idle class queues waiting service 762 */ 763 del_timer(&cfqd->idle_class_timer); 764 765 cfqq->slice_start = jiffies; 766 cfqq->slice_end = 0; 767 cfqq->slice_left = 0; 768 cfq_clear_cfqq_must_alloc_slice(cfqq); 769 cfq_clear_cfqq_fifo_expire(cfqq); 770 } 771 772 cfqd->active_queue = cfqq; 773} 774 775/* 776 * current cfqq expired its slice (or was too idle), select new one 777 */ 778static void 779__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 780 int preempted) 781{ 782 unsigned long now = jiffies; 783 784 if (cfq_cfqq_wait_request(cfqq)) 785 del_timer(&cfqd->idle_slice_timer); 786 787 if (!preempted && !cfq_cfqq_dispatched(cfqq)) { 788 cfqq->service_last = now; 789 cfq_schedule_dispatch(cfqd); 790 } 791 792 cfq_clear_cfqq_must_dispatch(cfqq); 793 cfq_clear_cfqq_wait_request(cfqq); 794 795 /* 796 * store what was left of this slice, if the queue idled out 797 * or was preempted 798 */ 799 if (time_after(cfqq->slice_end, now)) 800 cfqq->slice_left = cfqq->slice_end - now; 801 else 802 cfqq->slice_left = 0; 803 804 if (cfq_cfqq_on_rr(cfqq)) 805 cfq_resort_rr_list(cfqq, preempted); 806 807 if (cfqq == cfqd->active_queue) 808 cfqd->active_queue = NULL; 809 810 if (cfqd->active_cic) { 811 put_io_context(cfqd->active_cic->ioc); 812 cfqd->active_cic = NULL; 813 } 814 815 cfqd->dispatch_slice = 0; 816} 817 818static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted) 819{ 820 struct cfq_queue *cfqq = cfqd->active_queue; 821 822 if (cfqq) 823 __cfq_slice_expired(cfqd, cfqq, preempted); 824} 825 826/* 827 * 0 828 * 0,1 829 * 0,1,2 830 * 0,1,2,3 831 * 0,1,2,3,4 832 * 0,1,2,3,4,5 833 * 0,1,2,3,4,5,6 834 * 0,1,2,3,4,5,6,7 835 */ 836static int cfq_get_next_prio_level(struct cfq_data *cfqd) 837{ 838 int prio, wrap; 839 840 prio = -1; 841 wrap = 0; 842 do { 843 int p; 844 845 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { 846 if (!list_empty(&cfqd->rr_list[p])) { 847 prio = p; 848 break; 849 } 850 } 851 852 if (prio != -1) 853 break; 854 cfqd->cur_prio = 0; 855 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { 856 cfqd->cur_end_prio = 0; 857 if (wrap) 858 break; 859 wrap = 1; 860 } 861 } while (1); 862 863 if (unlikely(prio == -1)) 864 return -1; 865 866 BUG_ON(prio >= CFQ_PRIO_LISTS); 867 868 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); 869 870 cfqd->cur_prio = prio + 1; 871 if (cfqd->cur_prio > cfqd->cur_end_prio) { 872 cfqd->cur_end_prio = cfqd->cur_prio; 873 cfqd->cur_prio = 0; 874 } 875 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { 876 cfqd->cur_prio = 0; 877 cfqd->cur_end_prio = 0; 878 } 879 880 return prio; 881} 882 883static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 884{ 885 struct cfq_queue *cfqq = NULL; 886 887 /* 888 * if current list is non-empty, grab first entry. if it is empty, 889 * get next prio level and grab first entry then if any are spliced 890 */ 891 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) 892 cfqq = list_entry_cfqq(cfqd->cur_rr.next); 893 894 /* 895 * If no new queues are available, check if the busy list has some 896 * before falling back to idle io. 897 */ 898 if (!cfqq && !list_empty(&cfqd->busy_rr)) 899 cfqq = list_entry_cfqq(cfqd->busy_rr.next); 900 901 /* 902 * if we have idle queues and no rt or be queues had pending 903 * requests, either allow immediate service if the grace period 904 * has passed or arm the idle grace timer 905 */ 906 if (!cfqq && !list_empty(&cfqd->idle_rr)) { 907 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; 908 909 if (time_after_eq(jiffies, end)) 910 cfqq = list_entry_cfqq(cfqd->idle_rr.next); 911 else 912 mod_timer(&cfqd->idle_class_timer, end); 913 } 914 915 __cfq_set_active_queue(cfqd, cfqq); 916 return cfqq; 917} 918 919static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 920 921{ 922 struct cfq_io_context *cic; 923 unsigned long sl; 924 925 WARN_ON(!RB_EMPTY(&cfqq->sort_list)); 926 WARN_ON(cfqq != cfqd->active_queue); 927 928 /* 929 * idle is disabled, either manually or by past process history 930 */ 931 if (!cfqd->cfq_slice_idle) 932 return 0; 933 if (!cfq_cfqq_idle_window(cfqq)) 934 return 0; 935 /* 936 * task has exited, don't wait 937 */ 938 cic = cfqd->active_cic; 939 if (!cic || !cic->ioc->task) 940 return 0; 941 942 cfq_mark_cfqq_must_dispatch(cfqq); 943 cfq_mark_cfqq_wait_request(cfqq); 944 945 sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); 946 947 /* 948 * we don't want to idle for seeks, but we do want to allow 949 * fair distribution of slice time for a process doing back-to-back 950 * seeks. so allow a little bit of time for him to submit a new rq 951 */ 952 if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072) 953 sl = 2; 954 955 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 956 return 1; 957} 958 959static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq) 960{ 961 struct cfq_data *cfqd = q->elevator->elevator_data; 962 struct cfq_queue *cfqq = crq->cfq_queue; 963 964 cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); 965 cfq_remove_request(crq->request); 966 cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; 967 elv_dispatch_sort(q, crq->request); 968} 969 970/* 971 * return expired entry, or NULL to just start from scratch in rbtree 972 */ 973static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) 974{ 975 struct cfq_data *cfqd = cfqq->cfqd; 976 struct request *rq; 977 struct cfq_rq *crq; 978 979 if (cfq_cfqq_fifo_expire(cfqq)) 980 return NULL; 981 982 if (!list_empty(&cfqq->fifo)) { 983 int fifo = cfq_cfqq_class_sync(cfqq); 984 985 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); 986 rq = crq->request; 987 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { 988 cfq_mark_cfqq_fifo_expire(cfqq); 989 return crq; 990 } 991 } 992 993 return NULL; 994} 995 996/* 997 * Scale schedule slice based on io priority. Use the sync time slice only 998 * if a queue is marked sync and has sync io queued. A sync queue with async 999 * io only, should not get full sync slice length. 1000 */ 1001static inline int 1002cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1003{ 1004 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; 1005 1006 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1007 1008 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); 1009} 1010 1011static inline void 1012cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1013{ 1014 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 1015} 1016 1017static inline int 1018cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1019{ 1020 const int base_rq = cfqd->cfq_slice_async_rq; 1021 1022 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1023 1024 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1025} 1026 1027/* 1028 * get next queue for service 1029 */ 1030static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1031{ 1032 unsigned long now = jiffies; 1033 struct cfq_queue *cfqq; 1034 1035 cfqq = cfqd->active_queue; 1036 if (!cfqq) 1037 goto new_queue; 1038 1039 /* 1040 * slice has expired 1041 */ 1042 if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end)) 1043 goto expire; 1044 1045 /* 1046 * if queue has requests, dispatch one. if not, check if 1047 * enough slice is left to wait for one 1048 */ 1049 if (!RB_EMPTY(&cfqq->sort_list)) 1050 goto keep_queue; 1051 else if (cfq_cfqq_class_sync(cfqq) && 1052 time_before(now, cfqq->slice_end)) { 1053 if (cfq_arm_slice_timer(cfqd, cfqq)) 1054 return NULL; 1055 } 1056 1057expire: 1058 cfq_slice_expired(cfqd, 0); 1059new_queue: 1060 cfqq = cfq_set_active_queue(cfqd); 1061keep_queue: 1062 return cfqq; 1063} 1064 1065static int 1066__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1067 int max_dispatch) 1068{ 1069 int dispatched = 0; 1070 1071 BUG_ON(RB_EMPTY(&cfqq->sort_list)); 1072 1073 do { 1074 struct cfq_rq *crq; 1075 1076 /* 1077 * follow expired path, else get first next available 1078 */ 1079 if ((crq = cfq_check_fifo(cfqq)) == NULL) 1080 crq = cfqq->next_crq; 1081 1082 /* 1083 * finally, insert request into driver dispatch list 1084 */ 1085 cfq_dispatch_insert(cfqd->queue, crq); 1086 1087 cfqd->dispatch_slice++; 1088 dispatched++; 1089 1090 if (!cfqd->active_cic) { 1091 atomic_inc(&crq->io_context->ioc->refcount); 1092 cfqd->active_cic = crq->io_context; 1093 } 1094 1095 if (RB_EMPTY(&cfqq->sort_list)) 1096 break; 1097 1098 } while (dispatched < max_dispatch); 1099 1100 /* 1101 * if slice end isn't set yet, set it. if at least one request was 1102 * sync, use the sync time slice value 1103 */ 1104 if (!cfqq->slice_end) 1105 cfq_set_prio_slice(cfqd, cfqq); 1106 1107 /* 1108 * expire an async queue immediately if it has used up its slice. idle 1109 * queue always expire after 1 dispatch round. 1110 */ 1111 if ((!cfq_cfqq_sync(cfqq) && 1112 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || 1113 cfq_class_idle(cfqq)) 1114 cfq_slice_expired(cfqd, 0); 1115 1116 return dispatched; 1117} 1118 1119static int 1120cfq_forced_dispatch_cfqqs(struct list_head *list) 1121{ 1122 int dispatched = 0; 1123 struct cfq_queue *cfqq, *next; 1124 struct cfq_rq *crq; 1125 1126 list_for_each_entry_safe(cfqq, next, list, cfq_list) { 1127 while ((crq = cfqq->next_crq)) { 1128 cfq_dispatch_insert(cfqq->cfqd->queue, crq); 1129 dispatched++; 1130 } 1131 BUG_ON(!list_empty(&cfqq->fifo)); 1132 } 1133 return dispatched; 1134} 1135 1136static int 1137cfq_forced_dispatch(struct cfq_data *cfqd) 1138{ 1139 int i, dispatched = 0; 1140 1141 for (i = 0; i < CFQ_PRIO_LISTS; i++) 1142 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]); 1143 1144 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr); 1145 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); 1146 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr); 1147 1148 cfq_slice_expired(cfqd, 0); 1149 1150 BUG_ON(cfqd->busy_queues); 1151 1152 return dispatched; 1153} 1154 1155static int 1156cfq_dispatch_requests(request_queue_t *q, int force) 1157{ 1158 struct cfq_data *cfqd = q->elevator->elevator_data; 1159 struct cfq_queue *cfqq; 1160 1161 if (!cfqd->busy_queues) 1162 return 0; 1163 1164 if (unlikely(force)) 1165 return cfq_forced_dispatch(cfqd); 1166 1167 cfqq = cfq_select_queue(cfqd); 1168 if (cfqq) { 1169 int max_dispatch; 1170 1171 cfq_clear_cfqq_must_dispatch(cfqq); 1172 cfq_clear_cfqq_wait_request(cfqq); 1173 del_timer(&cfqd->idle_slice_timer); 1174 1175 max_dispatch = cfqd->cfq_quantum; 1176 if (cfq_class_idle(cfqq)) 1177 max_dispatch = 1; 1178 1179 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); 1180 } 1181 1182 return 0; 1183} 1184 1185/* 1186 * task holds one reference to the queue, dropped when task exits. each crq 1187 * in-flight on this queue also holds a reference, dropped when crq is freed. 1188 * 1189 * queue lock must be held here. 1190 */ 1191static void cfq_put_queue(struct cfq_queue *cfqq) 1192{ 1193 struct cfq_data *cfqd = cfqq->cfqd; 1194 1195 BUG_ON(atomic_read(&cfqq->ref) <= 0); 1196 1197 if (!atomic_dec_and_test(&cfqq->ref)) 1198 return; 1199 1200 BUG_ON(rb_first(&cfqq->sort_list)); 1201 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 1202 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1203 1204 if (unlikely(cfqd->active_queue == cfqq)) 1205 __cfq_slice_expired(cfqd, cfqq, 0); 1206 1207 /* 1208 * it's on the empty list and still hashed 1209 */ 1210 list_del(&cfqq->cfq_list); 1211 hlist_del(&cfqq->cfq_hash); 1212 kmem_cache_free(cfq_pool, cfqq); 1213} 1214 1215static inline struct cfq_queue * 1216__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, 1217 const int hashval) 1218{ 1219 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; 1220 struct hlist_node *entry; 1221 struct cfq_queue *__cfqq; 1222 1223 hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) { 1224 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio); 1225 1226 if (__cfqq->key == key && (__p == prio || !prio)) 1227 return __cfqq; 1228 } 1229 1230 return NULL; 1231} 1232 1233static struct cfq_queue * 1234cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) 1235{ 1236 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); 1237} 1238 1239static void cfq_free_io_context(struct io_context *ioc) 1240{ 1241 struct cfq_io_context *__cic; 1242 struct rb_node *n; 1243 int freed = 0; 1244 1245 while ((n = rb_first(&ioc->cic_root)) != NULL) { 1246 __cic = rb_entry(n, struct cfq_io_context, rb_node); 1247 rb_erase(&__cic->rb_node, &ioc->cic_root); 1248 kmem_cache_free(cfq_ioc_pool, __cic); 1249 freed++; 1250 } 1251 1252 if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone) 1253 complete(ioc_gone); 1254} 1255 1256static void cfq_trim(struct io_context *ioc) 1257{ 1258 ioc->set_ioprio = NULL; 1259 cfq_free_io_context(ioc); 1260} 1261 1262/* 1263 * Called with interrupts disabled 1264 */ 1265static void cfq_exit_single_io_context(struct cfq_io_context *cic) 1266{ 1267 struct cfq_data *cfqd = cic->key; 1268 request_queue_t *q; 1269 1270 if (!cfqd) 1271 return; 1272 1273 q = cfqd->queue; 1274 1275 WARN_ON(!irqs_disabled()); 1276 1277 spin_lock(q->queue_lock); 1278 1279 if (cic->cfqq[ASYNC]) { 1280 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue)) 1281 __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0); 1282 cfq_put_queue(cic->cfqq[ASYNC]); 1283 cic->cfqq[ASYNC] = NULL; 1284 } 1285 1286 if (cic->cfqq[SYNC]) { 1287 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue)) 1288 __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0); 1289 cfq_put_queue(cic->cfqq[SYNC]); 1290 cic->cfqq[SYNC] = NULL; 1291 } 1292 1293 cic->key = NULL; 1294 list_del_init(&cic->queue_list); 1295 spin_unlock(q->queue_lock); 1296} 1297 1298static void cfq_exit_io_context(struct io_context *ioc) 1299{ 1300 struct cfq_io_context *__cic; 1301 unsigned long flags; 1302 struct rb_node *n; 1303 1304 /* 1305 * put the reference this task is holding to the various queues 1306 */ 1307 spin_lock_irqsave(&cfq_exit_lock, flags); 1308 1309 n = rb_first(&ioc->cic_root); 1310 while (n != NULL) { 1311 __cic = rb_entry(n, struct cfq_io_context, rb_node); 1312 1313 cfq_exit_single_io_context(__cic); 1314 n = rb_next(n); 1315 } 1316 1317 spin_unlock_irqrestore(&cfq_exit_lock, flags); 1318} 1319 1320static struct cfq_io_context * 1321cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1322{ 1323 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); 1324 1325 if (cic) { 1326 RB_CLEAR(&cic->rb_node); 1327 cic->key = NULL; 1328 cic->cfqq[ASYNC] = NULL; 1329 cic->cfqq[SYNC] = NULL; 1330 cic->last_end_request = jiffies; 1331 cic->ttime_total = 0; 1332 cic->ttime_samples = 0; 1333 cic->ttime_mean = 0; 1334 cic->dtor = cfq_free_io_context; 1335 cic->exit = cfq_exit_io_context; 1336 INIT_LIST_HEAD(&cic->queue_list); 1337 atomic_inc(&ioc_count); 1338 } 1339 1340 return cic; 1341} 1342 1343static void cfq_init_prio_data(struct cfq_queue *cfqq) 1344{ 1345 struct task_struct *tsk = current; 1346 int ioprio_class; 1347 1348 if (!cfq_cfqq_prio_changed(cfqq)) 1349 return; 1350 1351 ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); 1352 switch (ioprio_class) { 1353 default: 1354 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 1355 case IOPRIO_CLASS_NONE: 1356 /* 1357 * no prio set, place us in the middle of the BE classes 1358 */ 1359 cfqq->ioprio = task_nice_ioprio(tsk); 1360 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1361 break; 1362 case IOPRIO_CLASS_RT: 1363 cfqq->ioprio = task_ioprio(tsk); 1364 cfqq->ioprio_class = IOPRIO_CLASS_RT; 1365 break; 1366 case IOPRIO_CLASS_BE: 1367 cfqq->ioprio = task_ioprio(tsk); 1368 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1369 break; 1370 case IOPRIO_CLASS_IDLE: 1371 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 1372 cfqq->ioprio = 7; 1373 cfq_clear_cfqq_idle_window(cfqq); 1374 break; 1375 } 1376 1377 /* 1378 * keep track of original prio settings in case we have to temporarily 1379 * elevate the priority of this queue 1380 */ 1381 cfqq->org_ioprio = cfqq->ioprio; 1382 cfqq->org_ioprio_class = cfqq->ioprio_class; 1383 1384 if (cfq_cfqq_on_rr(cfqq)) 1385 cfq_resort_rr_list(cfqq, 0); 1386 1387 cfq_clear_cfqq_prio_changed(cfqq); 1388} 1389 1390static inline void changed_ioprio(struct cfq_io_context *cic) 1391{ 1392 struct cfq_data *cfqd = cic->key; 1393 struct cfq_queue *cfqq; 1394 if (cfqd) { 1395 spin_lock(cfqd->queue->queue_lock); 1396 cfqq = cic->cfqq[ASYNC]; 1397 if (cfqq) { 1398 struct cfq_queue *new_cfqq; 1399 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, 1400 cic->ioc->task, GFP_ATOMIC); 1401 if (new_cfqq) { 1402 cic->cfqq[ASYNC] = new_cfqq; 1403 cfq_put_queue(cfqq); 1404 } 1405 } 1406 cfqq = cic->cfqq[SYNC]; 1407 if (cfqq) { 1408 cfq_mark_cfqq_prio_changed(cfqq); 1409 cfq_init_prio_data(cfqq); 1410 } 1411 spin_unlock(cfqd->queue->queue_lock); 1412 } 1413} 1414 1415/* 1416 * callback from sys_ioprio_set, irqs are disabled 1417 */ 1418static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) 1419{ 1420 struct cfq_io_context *cic; 1421 struct rb_node *n; 1422 1423 spin_lock(&cfq_exit_lock); 1424 1425 n = rb_first(&ioc->cic_root); 1426 while (n != NULL) { 1427 cic = rb_entry(n, struct cfq_io_context, rb_node); 1428 1429 changed_ioprio(cic); 1430 n = rb_next(n); 1431 } 1432 1433 spin_unlock(&cfq_exit_lock); 1434 1435 return 0; 1436} 1437 1438static struct cfq_queue * 1439cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, 1440 gfp_t gfp_mask) 1441{ 1442 const int hashval = hash_long(key, CFQ_QHASH_SHIFT); 1443 struct cfq_queue *cfqq, *new_cfqq = NULL; 1444 unsigned short ioprio; 1445 1446retry: 1447 ioprio = tsk->ioprio; 1448 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); 1449 1450 if (!cfqq) { 1451 if (new_cfqq) { 1452 cfqq = new_cfqq; 1453 new_cfqq = NULL; 1454 } else if (gfp_mask & __GFP_WAIT) { 1455 spin_unlock_irq(cfqd->queue->queue_lock); 1456 new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); 1457 spin_lock_irq(cfqd->queue->queue_lock); 1458 goto retry; 1459 } else { 1460 cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); 1461 if (!cfqq) 1462 goto out; 1463 } 1464 1465 memset(cfqq, 0, sizeof(*cfqq)); 1466 1467 INIT_HLIST_NODE(&cfqq->cfq_hash); 1468 INIT_LIST_HEAD(&cfqq->cfq_list); 1469 RB_CLEAR_ROOT(&cfqq->sort_list); 1470 INIT_LIST_HEAD(&cfqq->fifo); 1471 1472 cfqq->key = key; 1473 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); 1474 atomic_set(&cfqq->ref, 0); 1475 cfqq->cfqd = cfqd; 1476 cfqq->service_last = 0; 1477 /* 1478 * set ->slice_left to allow preemption for a new process 1479 */ 1480 cfqq->slice_left = 2 * cfqd->cfq_slice_idle; 1481 if (!cfqd->hw_tag) 1482 cfq_mark_cfqq_idle_window(cfqq); 1483 cfq_mark_cfqq_prio_changed(cfqq); 1484 cfq_init_prio_data(cfqq); 1485 } 1486 1487 if (new_cfqq) 1488 kmem_cache_free(cfq_pool, new_cfqq); 1489 1490 atomic_inc(&cfqq->ref); 1491out: 1492 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); 1493 return cfqq; 1494} 1495 1496static void 1497cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) 1498{ 1499 spin_lock(&cfq_exit_lock); 1500 rb_erase(&cic->rb_node, &ioc->cic_root); 1501 list_del_init(&cic->queue_list); 1502 spin_unlock(&cfq_exit_lock); 1503 kmem_cache_free(cfq_ioc_pool, cic); 1504 atomic_dec(&ioc_count); 1505} 1506 1507static struct cfq_io_context * 1508cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) 1509{ 1510 struct rb_node *n; 1511 struct cfq_io_context *cic; 1512 void *k, *key = cfqd; 1513 1514restart: 1515 n = ioc->cic_root.rb_node; 1516 while (n) { 1517 cic = rb_entry(n, struct cfq_io_context, rb_node); 1518 /* ->key must be copied to avoid race with cfq_exit_queue() */ 1519 k = cic->key; 1520 if (unlikely(!k)) { 1521 cfq_drop_dead_cic(ioc, cic); 1522 goto restart; 1523 } 1524 1525 if (key < k) 1526 n = n->rb_left; 1527 else if (key > k) 1528 n = n->rb_right; 1529 else 1530 return cic; 1531 } 1532 1533 return NULL; 1534} 1535 1536static inline void 1537cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 1538 struct cfq_io_context *cic) 1539{ 1540 struct rb_node **p; 1541 struct rb_node *parent; 1542 struct cfq_io_context *__cic; 1543 void *k; 1544 1545 cic->ioc = ioc; 1546 cic->key = cfqd; 1547 1548 ioc->set_ioprio = cfq_ioc_set_ioprio; 1549restart: 1550 parent = NULL; 1551 p = &ioc->cic_root.rb_node; 1552 while (*p) { 1553 parent = *p; 1554 __cic = rb_entry(parent, struct cfq_io_context, rb_node); 1555 /* ->key must be copied to avoid race with cfq_exit_queue() */ 1556 k = __cic->key; 1557 if (unlikely(!k)) { 1558 cfq_drop_dead_cic(ioc, cic); 1559 goto restart; 1560 } 1561 1562 if (cic->key < k) 1563 p = &(*p)->rb_left; 1564 else if (cic->key > k) 1565 p = &(*p)->rb_right; 1566 else 1567 BUG(); 1568 } 1569 1570 spin_lock(&cfq_exit_lock); 1571 rb_link_node(&cic->rb_node, parent, p); 1572 rb_insert_color(&cic->rb_node, &ioc->cic_root); 1573 list_add(&cic->queue_list, &cfqd->cic_list); 1574 spin_unlock(&cfq_exit_lock); 1575} 1576 1577/* 1578 * Setup general io context and cfq io context. There can be several cfq 1579 * io contexts per general io context, if this process is doing io to more 1580 * than one device managed by cfq. 1581 */ 1582static struct cfq_io_context * 1583cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 1584{ 1585 struct io_context *ioc = NULL; 1586 struct cfq_io_context *cic; 1587 1588 might_sleep_if(gfp_mask & __GFP_WAIT); 1589 1590 ioc = get_io_context(gfp_mask); 1591 if (!ioc) 1592 return NULL; 1593 1594 cic = cfq_cic_rb_lookup(cfqd, ioc); 1595 if (cic) 1596 goto out; 1597 1598 cic = cfq_alloc_io_context(cfqd, gfp_mask); 1599 if (cic == NULL) 1600 goto err; 1601 1602 cfq_cic_link(cfqd, ioc, cic); 1603out: 1604 return cic; 1605err: 1606 put_io_context(ioc); 1607 return NULL; 1608} 1609 1610static void 1611cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 1612{ 1613 unsigned long elapsed, ttime; 1614 1615 /* 1616 * if this context already has stuff queued, thinktime is from 1617 * last queue not last end 1618 */ 1619#if 0 1620 if (time_after(cic->last_end_request, cic->last_queue)) 1621 elapsed = jiffies - cic->last_end_request; 1622 else 1623 elapsed = jiffies - cic->last_queue; 1624#else 1625 elapsed = jiffies - cic->last_end_request; 1626#endif 1627 1628 ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 1629 1630 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 1631 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 1632 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 1633} 1634 1635static void 1636cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 1637 struct cfq_rq *crq) 1638{ 1639 sector_t sdist; 1640 u64 total; 1641 1642 if (cic->last_request_pos < crq->request->sector) 1643 sdist = crq->request->sector - cic->last_request_pos; 1644 else 1645 sdist = cic->last_request_pos - crq->request->sector; 1646 1647 /* 1648 * Don't allow the seek distance to get too large from the 1649 * odd fragment, pagein, etc 1650 */ 1651 if (cic->seek_samples <= 60) /* second&third seek */ 1652 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 1653 else 1654 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 1655 1656 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 1657 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 1658 total = cic->seek_total + (cic->seek_samples/2); 1659 do_div(total, cic->seek_samples); 1660 cic->seek_mean = (sector_t)total; 1661} 1662 1663/* 1664 * Disable idle window if the process thinks too long or seeks so much that 1665 * it doesn't matter 1666 */ 1667static void 1668cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1669 struct cfq_io_context *cic) 1670{ 1671 int enable_idle = cfq_cfqq_idle_window(cfqq); 1672 1673 if (!cic->ioc->task || !cfqd->cfq_slice_idle || cfqd->hw_tag) 1674 enable_idle = 0; 1675 else if (sample_valid(cic->ttime_samples)) { 1676 if (cic->ttime_mean > cfqd->cfq_slice_idle) 1677 enable_idle = 0; 1678 else 1679 enable_idle = 1; 1680 } 1681 1682 if (enable_idle) 1683 cfq_mark_cfqq_idle_window(cfqq); 1684 else 1685 cfq_clear_cfqq_idle_window(cfqq); 1686} 1687 1688 1689/* 1690 * Check if new_cfqq should preempt the currently active queue. Return 0 for 1691 * no or if we aren't sure, a 1 will cause a preempt. 1692 */ 1693static int 1694cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 1695 struct cfq_rq *crq) 1696{ 1697 struct cfq_queue *cfqq = cfqd->active_queue; 1698 1699 if (cfq_class_idle(new_cfqq)) 1700 return 0; 1701 1702 if (!cfqq) 1703 return 1; 1704 1705 if (cfq_class_idle(cfqq)) 1706 return 1; 1707 if (!cfq_cfqq_wait_request(new_cfqq)) 1708 return 0; 1709 /* 1710 * if it doesn't have slice left, forget it 1711 */ 1712 if (new_cfqq->slice_left < cfqd->cfq_slice_idle) 1713 return 0; 1714 if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq)) 1715 return 1; 1716 1717 return 0; 1718} 1719 1720/* 1721 * cfqq preempts the active queue. if we allowed preempt with no slice left, 1722 * let it have half of its nominal slice. 1723 */ 1724static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1725{ 1726 struct cfq_queue *__cfqq, *next; 1727 1728 list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) 1729 cfq_resort_rr_list(__cfqq, 1); 1730 1731 if (!cfqq->slice_left) 1732 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; 1733 1734 cfqq->slice_end = cfqq->slice_left + jiffies; 1735 __cfq_slice_expired(cfqd, cfqq, 1); 1736 __cfq_set_active_queue(cfqd, cfqq); 1737} 1738 1739/* 1740 * should really be a ll_rw_blk.c helper 1741 */ 1742static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1743{ 1744 request_queue_t *q = cfqd->queue; 1745 1746 if (!blk_queue_plugged(q)) 1747 q->request_fn(q); 1748 else 1749 __generic_unplug_device(q); 1750} 1751 1752/* 1753 * Called when a new fs request (crq) is added (to cfqq). Check if there's 1754 * something we should do about it 1755 */ 1756static void 1757cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1758 struct cfq_rq *crq) 1759{ 1760 struct cfq_io_context *cic; 1761 1762 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); 1763 1764 cic = crq->io_context; 1765 1766 /* 1767 * we never wait for an async request and we don't allow preemption 1768 * of an async request. so just return early 1769 */ 1770 if (!cfq_crq_is_sync(crq)) { 1771 /* 1772 * sync process issued an async request, if it's waiting 1773 * then expire it and kick rq handling. 1774 */ 1775 if (cic == cfqd->active_cic && 1776 del_timer(&cfqd->idle_slice_timer)) { 1777 cfq_slice_expired(cfqd, 0); 1778 cfq_start_queueing(cfqd, cfqq); 1779 } 1780 return; 1781 } 1782 1783 cfq_update_io_thinktime(cfqd, cic); 1784 cfq_update_io_seektime(cfqd, cic, crq); 1785 cfq_update_idle_window(cfqd, cfqq, cic); 1786 1787 cic->last_queue = jiffies; 1788 cic->last_request_pos = crq->request->sector + crq->request->nr_sectors; 1789 1790 if (cfqq == cfqd->active_queue) { 1791 /* 1792 * if we are waiting for a request for this queue, let it rip 1793 * immediately and flag that we must not expire this queue 1794 * just now 1795 */ 1796 if (cfq_cfqq_wait_request(cfqq)) { 1797 cfq_mark_cfqq_must_dispatch(cfqq); 1798 del_timer(&cfqd->idle_slice_timer); 1799 cfq_start_queueing(cfqd, cfqq); 1800 } 1801 } else if (cfq_should_preempt(cfqd, cfqq, crq)) { 1802 /* 1803 * not the active queue - expire current slice if it is 1804 * idle and has expired it's mean thinktime or this new queue 1805 * has some old slice time left and is of higher priority 1806 */ 1807 cfq_preempt_queue(cfqd, cfqq); 1808 cfq_mark_cfqq_must_dispatch(cfqq); 1809 cfq_start_queueing(cfqd, cfqq); 1810 } 1811} 1812 1813static void cfq_insert_request(request_queue_t *q, struct request *rq) 1814{ 1815 struct cfq_data *cfqd = q->elevator->elevator_data; 1816 struct cfq_rq *crq = RQ_DATA(rq); 1817 struct cfq_queue *cfqq = crq->cfq_queue; 1818 1819 cfq_init_prio_data(cfqq); 1820 1821 cfq_add_crq_rb(crq); 1822 1823 list_add_tail(&rq->queuelist, &cfqq->fifo); 1824 1825 if (rq_mergeable(rq)) 1826 cfq_add_crq_hash(cfqd, crq); 1827 1828 cfq_crq_enqueued(cfqd, cfqq, crq); 1829} 1830 1831static void cfq_completed_request(request_queue_t *q, struct request *rq) 1832{ 1833 struct cfq_rq *crq = RQ_DATA(rq); 1834 struct cfq_queue *cfqq = crq->cfq_queue; 1835 struct cfq_data *cfqd = cfqq->cfqd; 1836 const int sync = cfq_crq_is_sync(crq); 1837 unsigned long now; 1838 1839 now = jiffies; 1840 1841 WARN_ON(!cfqd->rq_in_driver); 1842 WARN_ON(!cfqq->on_dispatch[sync]); 1843 cfqd->rq_in_driver--; 1844 cfqq->on_dispatch[sync]--; 1845 1846 if (!cfq_class_idle(cfqq)) 1847 cfqd->last_end_request = now; 1848 1849 if (!cfq_cfqq_dispatched(cfqq)) { 1850 if (cfq_cfqq_on_rr(cfqq)) { 1851 cfqq->service_last = now; 1852 cfq_resort_rr_list(cfqq, 0); 1853 } 1854 cfq_schedule_dispatch(cfqd); 1855 } 1856 1857 if (cfq_crq_is_sync(crq)) 1858 crq->io_context->last_end_request = now; 1859} 1860 1861static struct request * 1862cfq_former_request(request_queue_t *q, struct request *rq) 1863{ 1864 struct cfq_rq *crq = RQ_DATA(rq); 1865 struct rb_node *rbprev = rb_prev(&crq->rb_node); 1866 1867 if (rbprev) 1868 return rb_entry_crq(rbprev)->request; 1869 1870 return NULL; 1871} 1872 1873static struct request * 1874cfq_latter_request(request_queue_t *q, struct request *rq) 1875{ 1876 struct cfq_rq *crq = RQ_DATA(rq); 1877 struct rb_node *rbnext = rb_next(&crq->rb_node); 1878 1879 if (rbnext) 1880 return rb_entry_crq(rbnext)->request; 1881 1882 return NULL; 1883} 1884 1885/* 1886 * we temporarily boost lower priority queues if they are holding fs exclusive 1887 * resources. they are boosted to normal prio (CLASS_BE/4) 1888 */ 1889static void cfq_prio_boost(struct cfq_queue *cfqq) 1890{ 1891 const int ioprio_class = cfqq->ioprio_class; 1892 const int ioprio = cfqq->ioprio; 1893 1894 if (has_fs_excl()) { 1895 /* 1896 * boost idle prio on transactions that would lock out other 1897 * users of the filesystem 1898 */ 1899 if (cfq_class_idle(cfqq)) 1900 cfqq->ioprio_class = IOPRIO_CLASS_BE; 1901 if (cfqq->ioprio > IOPRIO_NORM) 1902 cfqq->ioprio = IOPRIO_NORM; 1903 } else { 1904 /* 1905 * check if we need to unboost the queue 1906 */ 1907 if (cfqq->ioprio_class != cfqq->org_ioprio_class) 1908 cfqq->ioprio_class = cfqq->org_ioprio_class; 1909 if (cfqq->ioprio != cfqq->org_ioprio) 1910 cfqq->ioprio = cfqq->org_ioprio; 1911 } 1912 1913 /* 1914 * refile between round-robin lists if we moved the priority class 1915 */ 1916 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) && 1917 cfq_cfqq_on_rr(cfqq)) 1918 cfq_resort_rr_list(cfqq, 0); 1919} 1920 1921static inline int 1922__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1923 struct task_struct *task, int rw) 1924{ 1925#if 1 1926 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && 1927 !cfq_cfqq_must_alloc_slice(cfqq)) { 1928 cfq_mark_cfqq_must_alloc_slice(cfqq); 1929 return ELV_MQUEUE_MUST; 1930 } 1931 1932 return ELV_MQUEUE_MAY; 1933#else 1934 if (!cfqq || task->flags & PF_MEMALLOC) 1935 return ELV_MQUEUE_MAY; 1936 if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) { 1937 if (cfq_cfqq_wait_request(cfqq)) 1938 return ELV_MQUEUE_MUST; 1939 1940 /* 1941 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we 1942 * can quickly flood the queue with writes from a single task 1943 */ 1944 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) { 1945 cfq_mark_cfqq_must_alloc_slice(cfqq); 1946 return ELV_MQUEUE_MUST; 1947 } 1948 1949 return ELV_MQUEUE_MAY; 1950 } 1951 if (cfq_class_idle(cfqq)) 1952 return ELV_MQUEUE_NO; 1953 if (cfqq->allocated[rw] >= cfqd->max_queued) { 1954 struct io_context *ioc = get_io_context(GFP_ATOMIC); 1955 int ret = ELV_MQUEUE_NO; 1956 1957 if (ioc && ioc->nr_batch_requests) 1958 ret = ELV_MQUEUE_MAY; 1959 1960 put_io_context(ioc); 1961 return ret; 1962 } 1963 1964 return ELV_MQUEUE_MAY; 1965#endif 1966} 1967 1968static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) 1969{ 1970 struct cfq_data *cfqd = q->elevator->elevator_data; 1971 struct task_struct *tsk = current; 1972 struct cfq_queue *cfqq; 1973 1974 /* 1975 * don't force setup of a queue from here, as a call to may_queue 1976 * does not necessarily imply that a request actually will be queued. 1977 * so just lookup a possibly existing queue, or return 'may queue' 1978 * if that fails 1979 */ 1980 cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio); 1981 if (cfqq) { 1982 cfq_init_prio_data(cfqq); 1983 cfq_prio_boost(cfqq); 1984 1985 return __cfq_may_queue(cfqd, cfqq, tsk, rw); 1986 } 1987 1988 return ELV_MQUEUE_MAY; 1989} 1990 1991static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) 1992{ 1993 struct cfq_data *cfqd = q->elevator->elevator_data; 1994 struct request_list *rl = &q->rq; 1995 1996 if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) { 1997 smp_mb(); 1998 if (waitqueue_active(&rl->wait[READ])) 1999 wake_up(&rl->wait[READ]); 2000 } 2001 2002 if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) { 2003 smp_mb(); 2004 if (waitqueue_active(&rl->wait[WRITE])) 2005 wake_up(&rl->wait[WRITE]); 2006 } 2007} 2008 2009/* 2010 * queue lock held here 2011 */ 2012static void cfq_put_request(request_queue_t *q, struct request *rq) 2013{ 2014 struct cfq_data *cfqd = q->elevator->elevator_data; 2015 struct cfq_rq *crq = RQ_DATA(rq); 2016 2017 if (crq) { 2018 struct cfq_queue *cfqq = crq->cfq_queue; 2019 const int rw = rq_data_dir(rq); 2020 2021 BUG_ON(!cfqq->allocated[rw]); 2022 cfqq->allocated[rw]--; 2023 2024 put_io_context(crq->io_context->ioc); 2025 2026 mempool_free(crq, cfqd->crq_pool); 2027 rq->elevator_private = NULL; 2028 2029 cfq_check_waiters(q, cfqq); 2030 cfq_put_queue(cfqq); 2031 } 2032} 2033 2034/* 2035 * Allocate cfq data structures associated with this request. 2036 */ 2037static int 2038cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, 2039 gfp_t gfp_mask) 2040{ 2041 struct cfq_data *cfqd = q->elevator->elevator_data; 2042 struct task_struct *tsk = current; 2043 struct cfq_io_context *cic; 2044 const int rw = rq_data_dir(rq); 2045 pid_t key = cfq_queue_pid(tsk, rw); 2046 struct cfq_queue *cfqq; 2047 struct cfq_rq *crq; 2048 unsigned long flags; 2049 int is_sync = key != CFQ_KEY_ASYNC; 2050 2051 might_sleep_if(gfp_mask & __GFP_WAIT); 2052 2053 cic = cfq_get_io_context(cfqd, gfp_mask); 2054 2055 spin_lock_irqsave(q->queue_lock, flags); 2056 2057 if (!cic) 2058 goto queue_fail; 2059 2060 if (!cic->cfqq[is_sync]) { 2061 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask); 2062 if (!cfqq) 2063 goto queue_fail; 2064 2065 cic->cfqq[is_sync] = cfqq; 2066 } else 2067 cfqq = cic->cfqq[is_sync]; 2068 2069 cfqq->allocated[rw]++; 2070 cfq_clear_cfqq_must_alloc(cfqq); 2071 cfqd->rq_starved = 0; 2072 atomic_inc(&cfqq->ref); 2073 spin_unlock_irqrestore(q->queue_lock, flags); 2074 2075 crq = mempool_alloc(cfqd->crq_pool, gfp_mask); 2076 if (crq) { 2077 RB_CLEAR(&crq->rb_node); 2078 crq->rb_key = 0; 2079 crq->request = rq; 2080 INIT_HLIST_NODE(&crq->hash); 2081 crq->cfq_queue = cfqq; 2082 crq->io_context = cic; 2083 2084 if (is_sync) 2085 cfq_mark_crq_is_sync(crq); 2086 else 2087 cfq_clear_crq_is_sync(crq); 2088 2089 rq->elevator_private = crq; 2090 return 0; 2091 } 2092 2093 spin_lock_irqsave(q->queue_lock, flags); 2094 cfqq->allocated[rw]--; 2095 if (!(cfqq->allocated[0] + cfqq->allocated[1])) 2096 cfq_mark_cfqq_must_alloc(cfqq); 2097 cfq_put_queue(cfqq); 2098queue_fail: 2099 if (cic) 2100 put_io_context(cic->ioc); 2101 /* 2102 * mark us rq allocation starved. we need to kickstart the process 2103 * ourselves if there are no pending requests that can do it for us. 2104 * that would be an extremely rare OOM situation 2105 */ 2106 cfqd->rq_starved = 1; 2107 cfq_schedule_dispatch(cfqd); 2108 spin_unlock_irqrestore(q->queue_lock, flags); 2109 return 1; 2110} 2111 2112static void cfq_kick_queue(void *data) 2113{ 2114 request_queue_t *q = data; 2115 struct cfq_data *cfqd = q->elevator->elevator_data; 2116 unsigned long flags; 2117 2118 spin_lock_irqsave(q->queue_lock, flags); 2119 2120 if (cfqd->rq_starved) { 2121 struct request_list *rl = &q->rq; 2122 2123 /* 2124 * we aren't guaranteed to get a request after this, but we 2125 * have to be opportunistic 2126 */ 2127 smp_mb(); 2128 if (waitqueue_active(&rl->wait[READ])) 2129 wake_up(&rl->wait[READ]); 2130 if (waitqueue_active(&rl->wait[WRITE])) 2131 wake_up(&rl->wait[WRITE]); 2132 } 2133 2134 blk_remove_plug(q); 2135 q->request_fn(q); 2136 spin_unlock_irqrestore(q->queue_lock, flags); 2137} 2138 2139/* 2140 * Timer running if the active_queue is currently idling inside its time slice 2141 */ 2142static void cfq_idle_slice_timer(unsigned long data) 2143{ 2144 struct cfq_data *cfqd = (struct cfq_data *) data; 2145 struct cfq_queue *cfqq; 2146 unsigned long flags; 2147 2148 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2149 2150 if ((cfqq = cfqd->active_queue) != NULL) { 2151 unsigned long now = jiffies; 2152 2153 /* 2154 * expired 2155 */ 2156 if (time_after(now, cfqq->slice_end)) 2157 goto expire; 2158 2159 /* 2160 * only expire and reinvoke request handler, if there are 2161 * other queues with pending requests 2162 */ 2163 if (!cfqd->busy_queues) { 2164 cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end); 2165 add_timer(&cfqd->idle_slice_timer); 2166 goto out_cont; 2167 } 2168 2169 /* 2170 * not expired and it has a request pending, let it dispatch 2171 */ 2172 if (!RB_EMPTY(&cfqq->sort_list)) { 2173 cfq_mark_cfqq_must_dispatch(cfqq); 2174 goto out_kick; 2175 } 2176 } 2177expire: 2178 cfq_slice_expired(cfqd, 0); 2179out_kick: 2180 cfq_schedule_dispatch(cfqd); 2181out_cont: 2182 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2183} 2184 2185/* 2186 * Timer running if an idle class queue is waiting for service 2187 */ 2188static void cfq_idle_class_timer(unsigned long data) 2189{ 2190 struct cfq_data *cfqd = (struct cfq_data *) data; 2191 unsigned long flags, end; 2192 2193 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2194 2195 /* 2196 * race with a non-idle queue, reset timer 2197 */ 2198 end = cfqd->last_end_request + CFQ_IDLE_GRACE; 2199 if (!time_after_eq(jiffies, end)) 2200 mod_timer(&cfqd->idle_class_timer, end); 2201 else 2202 cfq_schedule_dispatch(cfqd); 2203 2204 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2205} 2206 2207static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 2208{ 2209 del_timer_sync(&cfqd->idle_slice_timer); 2210 del_timer_sync(&cfqd->idle_class_timer); 2211 blk_sync_queue(cfqd->queue); 2212} 2213 2214static void cfq_exit_queue(elevator_t *e) 2215{ 2216 struct cfq_data *cfqd = e->elevator_data; 2217 request_queue_t *q = cfqd->queue; 2218 2219 cfq_shutdown_timer_wq(cfqd); 2220 2221 spin_lock(&cfq_exit_lock); 2222 spin_lock_irq(q->queue_lock); 2223 2224 if (cfqd->active_queue) 2225 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 2226 2227 while (!list_empty(&cfqd->cic_list)) { 2228 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 2229 struct cfq_io_context, 2230 queue_list); 2231 if (cic->cfqq[ASYNC]) { 2232 cfq_put_queue(cic->cfqq[ASYNC]); 2233 cic->cfqq[ASYNC] = NULL; 2234 } 2235 if (cic->cfqq[SYNC]) { 2236 cfq_put_queue(cic->cfqq[SYNC]); 2237 cic->cfqq[SYNC] = NULL; 2238 } 2239 cic->key = NULL; 2240 list_del_init(&cic->queue_list); 2241 } 2242 2243 spin_unlock_irq(q->queue_lock); 2244 spin_unlock(&cfq_exit_lock); 2245 2246 cfq_shutdown_timer_wq(cfqd); 2247 2248 mempool_destroy(cfqd->crq_pool); 2249 kfree(cfqd->crq_hash); 2250 kfree(cfqd->cfq_hash); 2251 kfree(cfqd); 2252} 2253 2254static void *cfq_init_queue(request_queue_t *q, elevator_t *e) 2255{ 2256 struct cfq_data *cfqd; 2257 int i; 2258 2259 cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); 2260 if (!cfqd) 2261 return NULL; 2262 2263 memset(cfqd, 0, sizeof(*cfqd)); 2264 2265 for (i = 0; i < CFQ_PRIO_LISTS; i++) 2266 INIT_LIST_HEAD(&cfqd->rr_list[i]); 2267 2268 INIT_LIST_HEAD(&cfqd->busy_rr); 2269 INIT_LIST_HEAD(&cfqd->cur_rr); 2270 INIT_LIST_HEAD(&cfqd->idle_rr); 2271 INIT_LIST_HEAD(&cfqd->empty_list); 2272 INIT_LIST_HEAD(&cfqd->cic_list); 2273 2274 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); 2275 if (!cfqd->crq_hash) 2276 goto out_crqhash; 2277 2278 cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); 2279 if (!cfqd->cfq_hash) 2280 goto out_cfqhash; 2281 2282 cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool); 2283 if (!cfqd->crq_pool) 2284 goto out_crqpool; 2285 2286 for (i = 0; i < CFQ_MHASH_ENTRIES; i++) 2287 INIT_HLIST_HEAD(&cfqd->crq_hash[i]); 2288 for (i = 0; i < CFQ_QHASH_ENTRIES; i++) 2289 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); 2290 2291 cfqd->queue = q; 2292 2293 cfqd->max_queued = q->nr_requests / 4; 2294 q->nr_batching = cfq_queued; 2295 2296 init_timer(&cfqd->idle_slice_timer); 2297 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 2298 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 2299 2300 init_timer(&cfqd->idle_class_timer); 2301 cfqd->idle_class_timer.function = cfq_idle_class_timer; 2302 cfqd->idle_class_timer.data = (unsigned long) cfqd; 2303 2304 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); 2305 2306 cfqd->cfq_queued = cfq_queued; 2307 cfqd->cfq_quantum = cfq_quantum; 2308 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 2309 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 2310 cfqd->cfq_back_max = cfq_back_max; 2311 cfqd->cfq_back_penalty = cfq_back_penalty; 2312 cfqd->cfq_slice[0] = cfq_slice_async; 2313 cfqd->cfq_slice[1] = cfq_slice_sync; 2314 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 2315 cfqd->cfq_slice_idle = cfq_slice_idle; 2316 2317 return cfqd; 2318out_crqpool: 2319 kfree(cfqd->cfq_hash); 2320out_cfqhash: 2321 kfree(cfqd->crq_hash); 2322out_crqhash: 2323 kfree(cfqd); 2324 return NULL; 2325} 2326 2327static void cfq_slab_kill(void) 2328{ 2329 if (crq_pool) 2330 kmem_cache_destroy(crq_pool); 2331 if (cfq_pool) 2332 kmem_cache_destroy(cfq_pool); 2333 if (cfq_ioc_pool) 2334 kmem_cache_destroy(cfq_ioc_pool); 2335} 2336 2337static int __init cfq_slab_setup(void) 2338{ 2339 crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, 2340 NULL, NULL); 2341 if (!crq_pool) 2342 goto fail; 2343 2344 cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, 2345 NULL, NULL); 2346 if (!cfq_pool) 2347 goto fail; 2348 2349 cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool", 2350 sizeof(struct cfq_io_context), 0, 0, NULL, NULL); 2351 if (!cfq_ioc_pool) 2352 goto fail; 2353 2354 return 0; 2355fail: 2356 cfq_slab_kill(); 2357 return -ENOMEM; 2358} 2359 2360/* 2361 * sysfs parts below --> 2362 */ 2363 2364static ssize_t 2365cfq_var_show(unsigned int var, char *page) 2366{ 2367 return sprintf(page, "%d\n", var); 2368} 2369 2370static ssize_t 2371cfq_var_store(unsigned int *var, const char *page, size_t count) 2372{ 2373 char *p = (char *) page; 2374 2375 *var = simple_strtoul(p, &p, 10); 2376 return count; 2377} 2378 2379#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 2380static ssize_t __FUNC(elevator_t *e, char *page) \ 2381{ \ 2382 struct cfq_data *cfqd = e->elevator_data; \ 2383 unsigned int __data = __VAR; \ 2384 if (__CONV) \ 2385 __data = jiffies_to_msecs(__data); \ 2386 return cfq_var_show(__data, (page)); \ 2387} 2388SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 2389SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); 2390SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 2391SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 2392SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 2393SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 2394SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 2395SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 2396SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 2397SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 2398#undef SHOW_FUNCTION 2399 2400#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 2401static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ 2402{ \ 2403 struct cfq_data *cfqd = e->elevator_data; \ 2404 unsigned int __data; \ 2405 int ret = cfq_var_store(&__data, (page), count); \ 2406 if (__data < (MIN)) \ 2407 __data = (MIN); \ 2408 else if (__data > (MAX)) \ 2409 __data = (MAX); \ 2410 if (__CONV) \ 2411 *(__PTR) = msecs_to_jiffies(__data); \ 2412 else \ 2413 *(__PTR) = __data; \ 2414 return ret; \ 2415} 2416STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 2417STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); 2418STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); 2419STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); 2420STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 2421STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); 2422STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 2423STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 2424STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 2425STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); 2426#undef STORE_FUNCTION 2427 2428#define CFQ_ATTR(name) \ 2429 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 2430 2431static struct elv_fs_entry cfq_attrs[] = { 2432 CFQ_ATTR(quantum), 2433 CFQ_ATTR(queued), 2434 CFQ_ATTR(fifo_expire_sync), 2435 CFQ_ATTR(fifo_expire_async), 2436 CFQ_ATTR(back_seek_max), 2437 CFQ_ATTR(back_seek_penalty), 2438 CFQ_ATTR(slice_sync), 2439 CFQ_ATTR(slice_async), 2440 CFQ_ATTR(slice_async_rq), 2441 CFQ_ATTR(slice_idle), 2442 __ATTR_NULL 2443}; 2444 2445static struct elevator_type iosched_cfq = { 2446 .ops = { 2447 .elevator_merge_fn = cfq_merge, 2448 .elevator_merged_fn = cfq_merged_request, 2449 .elevator_merge_req_fn = cfq_merged_requests, 2450 .elevator_dispatch_fn = cfq_dispatch_requests, 2451 .elevator_add_req_fn = cfq_insert_request, 2452 .elevator_activate_req_fn = cfq_activate_request, 2453 .elevator_deactivate_req_fn = cfq_deactivate_request, 2454 .elevator_queue_empty_fn = cfq_queue_empty, 2455 .elevator_completed_req_fn = cfq_completed_request, 2456 .elevator_former_req_fn = cfq_former_request, 2457 .elevator_latter_req_fn = cfq_latter_request, 2458 .elevator_set_req_fn = cfq_set_request, 2459 .elevator_put_req_fn = cfq_put_request, 2460 .elevator_may_queue_fn = cfq_may_queue, 2461 .elevator_init_fn = cfq_init_queue, 2462 .elevator_exit_fn = cfq_exit_queue, 2463 .trim = cfq_trim, 2464 }, 2465 .elevator_attrs = cfq_attrs, 2466 .elevator_name = "cfq", 2467 .elevator_owner = THIS_MODULE, 2468}; 2469 2470static int __init cfq_init(void) 2471{ 2472 int ret; 2473 2474 /* 2475 * could be 0 on HZ < 1000 setups 2476 */ 2477 if (!cfq_slice_async) 2478 cfq_slice_async = 1; 2479 if (!cfq_slice_idle) 2480 cfq_slice_idle = 1; 2481 2482 if (cfq_slab_setup()) 2483 return -ENOMEM; 2484 2485 ret = elv_register(&iosched_cfq); 2486 if (ret) 2487 cfq_slab_kill(); 2488 2489 return ret; 2490} 2491 2492static void __exit cfq_exit(void) 2493{ 2494 DECLARE_COMPLETION(all_gone); 2495 elv_unregister(&iosched_cfq); 2496 ioc_gone = &all_gone; 2497 /* ioc_gone's update must be visible before reading ioc_count */ 2498 smp_wmb(); 2499 if (atomic_read(&ioc_count)) 2500 wait_for_completion(ioc_gone); 2501 synchronize_rcu(); 2502 cfq_slab_kill(); 2503} 2504 2505module_init(cfq_init); 2506module_exit(cfq_exit); 2507 2508MODULE_AUTHOR("Jens Axboe"); 2509MODULE_LICENSE("GPL"); 2510MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 2511