cfq-iosched.c revision 3ed9a2965c47636bc0ebafab31a39f1c105492ca
/*
 * CFQ, or complete fairness queueing, disk scheduler.
 *
 * Based on ideas from a previously unfinished io
 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>

/*
 * tunables
 */
static const int cfq_quantum = 4;		/* max queue in one round of service */
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */

static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;

/*
 * grace period before allowing idle class to get disk access
 */
#define CFQ_IDLE_GRACE		(HZ / 10)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)

#define CFQ_KEY_ASYNC		(0)

/*
 * for the hash of cfqq inside the cfqd
 */
#define CFQ_QHASH_SHIFT		6
#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)

#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
#define RQ_CFQQ(rq)		((rq)->elevator_private2)

static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;

static DEFINE_PER_CPU(unsigned long, ioc_count);
static struct completion *ioc_gone;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define ASYNC			(0)
#define SYNC			(1)

#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)

#define sample_valid(samples)	((samples) > 80)

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }

/*
 * Per block device queue structure
 */
struct cfq_data {
	request_queue_t *queue;

	/*
	 * rr list of queues with requests and the count of them
	 */
	struct cfq_rb_root service_tree;
	unsigned int busy_queues;

	/*
	 * cfqq lookup hash
	 */
	struct hlist_head *cfq_hash;

	int rq_in_driver;
	int sync_flight;
	int hw_tag;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;

	struct timer_list idle_class_timer;

	sector_t last_position;
	unsigned long last_end_request;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;

	struct list_head cic_list;

	sector_t new_seek_mean;
	u64 new_seek_total;
};

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* cfqq lookup hash */
	struct hlist_node cfq_hash;
	/* hash key */
	unsigned int key;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	unsigned long slice_end;
	long slice_resid;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	/* various state flags, see below */
	unsigned int flags;

	sector_t last_request_pos;
};

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must dispatch, even if expired */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_queue_new,	/* queue never been serviced */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(queue_new);
CFQ_CFQQ_FNS(slice_new);
#undef CFQ_CFQQ_FNS

static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
static void cfq_dispatch_insert(request_queue_t *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t);

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues)
		kblockd_schedule_work(&cfqd->unplug_work);
}

static int cfq_queue_empty(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	return !cfqd->busy_queues;
}

static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
{
	/*
	 * Use the per-process queue, for read requests and synchronous writes
	 */
	if (!(rw & REQ_RW) || is_sync)
		return task->pid;

	return CFQ_KEY_ASYNC;
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
				 unsigned short prio)
{
	const int base_slice = cfqd->cfq_slice[sync];

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}

static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
}

/*
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 * isn't valid until the first request from the dispatch is activated
 * and the slice time set.
 */
static inline int cfq_slice_used(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_slice_new(cfqq))
		return 0;
	if (time_before(jiffies, cfqq->slice_end))
		return 0;

	return 1;
}

/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
{
	sector_t last, s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
		return rq1;
	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
		return rq2;

	s1 = rq1->sector;
	s2 = rq2->sector;

	last = cfqd->last_position;

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
		wrap |= CFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
		wrap |= CFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;
		else {
			if (s1 >= s2)
				return rq1;
			else
				return rq2;
		}

	case CFQ_RQ2_WRAP:
		return rq1;
	case CFQ_RQ1_WRAP:
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}

/*
 * The below is leftmost cache rbtree addon
 */
static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
{
	if (!root->left)
		root->left = rb_first(&root->rb);

	return root->left;
}

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;

	rb_erase(n, &root->rb);
	RB_CLEAR_NODE(n);
}

/*
 * would be nice to take fifo expire time into account as well
 */
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		 struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next = NULL, *prev = NULL;

	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return cfq_choose_req(cfqd, next, prev);
}

static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
				      struct cfq_queue *cfqq)
{
	/*
	 * just an approximation, should be ok.
	 */
	return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
			cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}

/*
 * The cfqd->service_tree holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
static void cfq_service_tree_add(struct cfq_data *cfqd,
				 struct cfq_queue *cfqq, int add_front)
{
	struct rb_node **p = &cfqd->service_tree.rb.rb_node;
	struct rb_node *parent = NULL;
	unsigned long rb_key;
	int left;

	if (!add_front) {
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
		rb_key += cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else
		rb_key = 0;

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		/*
		 * same position, nothing more to do
		 */
		if (rb_key == cfqq->rb_key)
			return;

		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
	}

	left = 1;
	while (*p) {
		struct cfq_queue *__cfqq;
		struct rb_node **n;

		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

		/*
		 * sort RT queues first, we always want to give
		 * preference to them. IDLE queues go to the back.
		 * after that, sort on the next service time.
		 */
		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
			n = &(*p)->rb_left;
		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
			n = &(*p)->rb_right;
		else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
			n = &(*p)->rb_left;
		else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
			n = &(*p)->rb_right;
		else if (rb_key < __cfqq->rb_key)
			n = &(*p)->rb_left;
		else
			n = &(*p)->rb_right;

		if (n == &(*p)->rb_right)
			left = 0;

		p = n;
	}

	if (left)
		cfqd->service_tree.left = &cfqq->rb_node;

	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
}

/*
 * Update cfqq's position in the service tree.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
	if (cfq_cfqq_on_rr(cfqq))
		cfq_service_tree_add(cfqd, cfqq, 0);
}

/*
 * add to busy list of queues for service, trying to be fair in ordering
 * the pending list according to last request service
 */
static inline void
cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
	cfqd->busy_queues++;

	cfq_resort_rr_list(cfqd, cfqq);
}

/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
static inline void
cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);

	if (!RB_EMPTY_NODE(&cfqq->rb_node))
		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);

	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
static inline void cfq_del_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	const int sync = rq_is_sync(rq);

	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;

	elv_rb_del(&cfqq->sort_list, rq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
		cfq_del_cfqq_rr(cfqd, cfqq);
}

static void cfq_add_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	struct request *__alias;

	cfqq->queued[rq_is_sync(rq)]++;

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
		cfq_dispatch_insert(cfqd->queue, __alias);

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);

	/*
	 * check if this request is a better next-serve candidate
	 */
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
	BUG_ON(!cfqq->next_rq);
}

static inline void
cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
{
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
	cfq_add_rq_rb(rq);
}

static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
	struct task_struct *tsk = current;
	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio));
	struct cfq_queue *cfqq;

	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

		return elv_rb_find(&cfqq->sort_list, sector);
	}

	return NULL;
}

static void cfq_activate_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	cfqd->rq_in_driver++;

	/*
	 * If the depth is larger than 1, it really could be queueing. But lets
	 * make the mark a little higher - idling could still be good for
	 * low queueing, and a low queueing number could also just indicate
	 * a SCSI mid layer like behaviour where limit+1 is often seen.
	 */
	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
		cfqd->hw_tag = 1;

	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
}

static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
}

static void cfq_remove_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);

	list_del_init(&rq->queuelist);
	cfq_del_rq_rb(rq);

	if (rq_is_meta(rq)) {
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
}

static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = cfq_find_rq_fmerge(cfqd, bio);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}

static void cfq_merged_request(request_queue_t *q, struct request *req,
			       int type)
{
	if (type == ELEVATOR_FRONT_MERGE) {
		struct cfq_queue *cfqq = RQ_CFQQ(req);

		cfq_reposition_rq_rb(cfqq, req);
	}
}

static void
cfq_merged_requests(request_queue_t *q, struct request *rq,
		    struct request *next)
{
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    time_before(next->start_time, rq->start_time))
		list_move(&rq->queuelist, &next->queuelist);

	cfq_remove_request(next);
}

static int cfq_allow_merge(request_queue_t *q, struct request *rq,
			   struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	const int rw = bio_data_dir(bio);
	struct cfq_queue *cfqq;
	pid_t key;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq))
		return 0;

	/*
	 * Lookup the cfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	key = cfq_queue_pid(current, rw, bio_sync(bio));
	cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio);

	if (cfqq == RQ_CFQQ(rq))
		return 1;

	return 0;
}

static inline void
__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (cfqq) {
		/*
		 * stop potential idle class queues waiting service
		 */
		del_timer(&cfqd->idle_class_timer);

		cfqq->slice_end = 0;
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
		cfq_mark_cfqq_slice_new(cfqq);
		cfq_clear_cfqq_queue_new(cfqq);
	}

	cfqd->active_queue = cfqq;
}

/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		    int timed_out)
{
	if (cfq_cfqq_wait_request(cfqq))
		del_timer(&cfqd->idle_slice_timer);

	cfq_clear_cfqq_must_dispatch(cfqq);
	cfq_clear_cfqq_wait_request(cfqq);

	/*
	 * store what was left of this slice, if the queue idled/timed out
	 */
	if (timed_out && !cfq_cfqq_slice_new(cfqq))
		cfqq->slice_resid = cfqq->slice_end - jiffies;

	cfq_resort_rr_list(cfqd, cfqq);

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}
}

static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
		__cfq_slice_expired(cfqd, cfqq, timed_out);
}

/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq;
	struct rb_node *n;

	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
		return NULL;

	n = cfq_rb_first(&cfqd->service_tree);
	cfqq = rb_entry(n, struct cfq_queue, rb_node);

	if (cfq_class_idle(cfqq)) {
		unsigned long end;

		/*
		 * if we have idle queues and no rt or be queues had
		 * pending requests, either allow immediate service if
		 * the grace period has passed or arm the idle grace
		 * timer
		 */
		end = cfqd->last_end_request + CFQ_IDLE_GRACE;
		if (time_before(jiffies, end)) {
			mod_timer(&cfqd->idle_class_timer, end);
			cfqq = NULL;
		}
	}

	return cfqq;
}

/*
 * Get and set a new active queue for service.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq;

	cfqq = cfq_get_next_queue(cfqd);
	__cfq_set_active_queue(cfqd, cfqq);
	return cfqq;
}

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
	if (rq->sector >= cfqd->last_position)
		return rq->sector - cfqd->last_position;
	else
		return cfqd->last_position - rq->sector;
}

static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
{
	struct cfq_io_context *cic = cfqd->active_cic;

	if (!sample_valid(cic->seek_samples))
		return 0;

	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
}

static int cfq_close_cooperator(struct cfq_data *cfq_data,
				struct cfq_queue *cfqq)
{
	/*
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
	 */
	return 0;
}

#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))

static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;
	struct cfq_io_context *cic;
	unsigned long sl;

	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	WARN_ON(cfq_cfqq_slice_new(cfqq));

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
		return;

	/*
	 * task has exited, don't wait
	 */
	cic = cfqd->active_cic;
	if (!cic || !cic->ioc->task)
		return;

	/*
	 * See if this prio level has a good candidate
	 */
	if (cfq_close_cooperator(cfqd, cfqq) &&
	    (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
		return;

	cfq_mark_cfqq_must_dispatch(cfqq);
	cfq_mark_cfqq_wait_request(cfqq);

	/*
	 * we don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. so allow a little bit of time for him to submit a new rq
	 */
	sl = cfqd->cfq_slice_idle;
	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));

	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
}

/*
 * Move request from internal lists to the request queue dispatch list.
 */
static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_remove_request(rq);
	cfqq->dispatched++;
	elv_dispatch_sort(q, rq);

	if (cfq_cfqq_sync(cfqq))
		cfqd->sync_flight++;
}

/*
 * return expired entry, or NULL to just start from scratch in rbtree
 */
static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
{
	struct cfq_data *cfqd = cfqq->cfqd;
	struct request *rq;
	int fifo;

	if (cfq_cfqq_fifo_expire(cfqq))
		return NULL;

	cfq_mark_cfqq_fifo_expire(cfqq);

	if (list_empty(&cfqq->fifo))
		return NULL;

	fifo = cfq_cfqq_sync(cfqq);
	rq = rq_entry_fifo(cfqq->fifo.next);

	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
		return NULL;

	return rq;
}

static inline int
cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int base_rq = cfqd->cfq_slice_async_rq;

	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
}

/*
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		goto new_queue;

	/*
	 * The active queue has run out of time, expire it and select new.
	 */
	if (cfq_slice_used(cfqq))
		goto expire;

	/*
	 * The active queue has requests and isn't expired, allow it to
	 * dispatch.
	 */
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
		goto keep_queue;

	/*
	 * No requests pending. If the active queue still has requests in
	 * flight or is idling for a new request, allow either of these
	 * conditions to happen (or time out) before selecting a new queue.
	 */
	if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) {
		cfqq = NULL;
		goto keep_queue;
	}

expire:
	cfq_slice_expired(cfqd, 0);
new_queue:
	cfqq = cfq_set_active_queue(cfqd);
keep_queue:
	return cfqq;
}

/*
 * Dispatch some requests from cfqq, moving them to the request queue
 * dispatch list.
 */
static int
__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			int max_dispatch)
{
	int dispatched = 0;

	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

	do {
		struct request *rq;

		/*
		 * follow expired path, else get first next available
		 */
		if ((rq = cfq_check_fifo(cfqq)) == NULL)
			rq = cfqq->next_rq;

		/*
		 * finally, insert request into driver dispatch list
		 */
		cfq_dispatch_insert(cfqd->queue, rq);

		dispatched++;

		if (!cfqd->active_cic) {
			atomic_inc(&RQ_CIC(rq)->ioc->refcount);
			cfqd->active_cic = RQ_CIC(rq);
		}

		if (RB_EMPTY_ROOT(&cfqq->sort_list))
			break;

	} while (dispatched < max_dispatch);

	/*
	 * expire an async queue immediately if it has used up its slice. idle
	 * queues always expire after 1 dispatch round.
	 */
	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
	    dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
	    cfq_class_idle(cfqq))) {
		cfqq->slice_end = jiffies + 1;
		cfq_slice_expired(cfqd, 0);
	}

	return dispatched;
}

static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
{
	int dispatched = 0;

	while (cfqq->next_rq) {
		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
		dispatched++;
	}

	BUG_ON(!list_empty(&cfqq->fifo));
	return dispatched;
}

/*
 * Drain our current requests. Used for barriers and when switching
 * io schedulers on-the-fly.
 */
static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
	int dispatched = 0;
	struct rb_node *n;

	while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) {
		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);

		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
	}

	cfq_slice_expired(cfqd, 0);

	BUG_ON(cfqd->busy_queues);

	return dispatched;
}

static int cfq_dispatch_requests(request_queue_t *q, int force)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq;
	int dispatched;

	if (!cfqd->busy_queues)
		return 0;

	if (unlikely(force))
		return cfq_forced_dispatch(cfqd);

	dispatched = 0;
	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
		int max_dispatch;

		max_dispatch = cfqd->cfq_quantum;
		if (cfq_class_idle(cfqq))
			max_dispatch = 1;

		if (cfqq->dispatched >= max_dispatch) {
			if (cfqd->busy_queues > 1)
				break;
			if (cfqq->dispatched >= 4 * max_dispatch)
				break;
		}

		if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
			break;

		cfq_clear_cfqq_must_dispatch(cfqq);
		cfq_clear_cfqq_wait_request(cfqq);
		del_timer(&cfqd->idle_slice_timer);

		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
	}

	return dispatched;
}

/*
 * task holds one reference to the queue, dropped when task exits. each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * queue lock must be held here.
 */
static void cfq_put_queue(struct cfq_queue *cfqq)
{
	struct cfq_data *cfqd = cfqq->cfqd;

	BUG_ON(atomic_read(&cfqq->ref) <= 0);

	if (!atomic_dec_and_test(&cfqq->ref))
		return;

	BUG_ON(rb_first(&cfqq->sort_list));
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
	BUG_ON(cfq_cfqq_on_rr(cfqq));

	if (unlikely(cfqd->active_queue == cfqq)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	/*
	 * it's on the empty list and still hashed
	 */
	hlist_del(&cfqq->cfq_hash);
	kmem_cache_free(cfq_pool, cfqq);
}

static struct cfq_queue *
__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
		    const int hashval)
{
	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
	struct hlist_node *entry;
	struct cfq_queue *__cfqq;

	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);

		if (__cfqq->key == key && (__p == prio || !prio))
			return __cfqq;
	}

	return NULL;
}

static struct cfq_queue *
cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
{
	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
}

static void cfq_free_io_context(struct io_context *ioc)
{
	struct cfq_io_context *__cic;
	struct rb_node *n;
	int freed = 0;

	while ((n = rb_first(&ioc->cic_root)) != NULL) {
		__cic = rb_entry(n, struct cfq_io_context, rb_node);
		rb_erase(&__cic->rb_node, &ioc->cic_root);
		kmem_cache_free(cfq_ioc_pool, __cic);
		freed++;
	}

	elv_ioc_count_mod(ioc_count, -freed);

	if (ioc_gone && !elv_ioc_count_read(ioc_count))
		complete(ioc_gone);
}

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (unlikely(cfqq == cfqd->active_queue)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	cfq_put_queue(cfqq);
}

static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
					 struct cfq_io_context *cic)
{
	list_del_init(&cic->queue_list);
	smp_wmb();
	cic->key = NULL;

	if (cic->cfqq[ASYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
		cic->cfqq[ASYNC] = NULL;
	}

	if (cic->cfqq[SYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
		cic->cfqq[SYNC] = NULL;
	}
}

static void cfq_exit_single_io_context(struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic->key;

	if (cfqd) {
		request_queue_t *q = cfqd->queue;

		spin_lock_irq(q->queue_lock);
		__cfq_exit_single_io_context(cfqd, cic);
		spin_unlock_irq(q->queue_lock);
	}
}

/*
 * The process that ioc belongs to has exited, we need to clean up
 * and put the internal structures we have that belong to that process.
 */
static void cfq_exit_io_context(struct io_context *ioc)
{
	struct cfq_io_context *__cic;
	struct rb_node *n;

	/*
	 * put the reference this task is holding to the various queues
	 */

	n = rb_first(&ioc->cic_root);
	while (n != NULL) {
		__cic = rb_entry(n, struct cfq_io_context, rb_node);

		cfq_exit_single_io_context(__cic);
		n = rb_next(n);
	}
}

static struct cfq_io_context *
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
{
	struct cfq_io_context *cic;

	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node);
	if (cic) {
		memset(cic, 0, sizeof(*cic));
		cic->last_end_request = jiffies;
		INIT_LIST_HEAD(&cic->queue_list);
		cic->dtor = cfq_free_io_context;
		cic->exit = cfq_exit_io_context;
		elv_ioc_count_inc(ioc_count);
	}

	return cic;
}

static void cfq_init_prio_data(struct cfq_queue *cfqq)
{
	struct task_struct *tsk = current;
	int ioprio_class;

	if (!cfq_cfqq_prio_changed(cfqq))
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
	switch (ioprio_class) {
	default:
		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * no prio set, place us in the middle of the BE classes
		 */
		cfqq->ioprio = task_nice_ioprio(tsk);
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_RT:
		cfqq->ioprio = task_ioprio(tsk);
		cfqq->ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		cfqq->ioprio = task_ioprio(tsk);
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
		cfqq->ioprio = 7;
		cfq_clear_cfqq_idle_window(cfqq);
		break;
	}

	/*
	 * keep track of original prio settings in case we have to temporarily
	 * elevate the priority of this queue
	 */
	cfqq->org_ioprio = cfqq->ioprio;
	cfqq->org_ioprio_class = cfqq->ioprio_class;
	cfq_clear_cfqq_prio_changed(cfqq);
}

static inline void changed_ioprio(struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic->key;
	struct cfq_queue *cfqq;
	unsigned long flags;

	if (unlikely(!cfqd))
		return;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	cfqq = cic->cfqq[ASYNC];
	if (cfqq) {
		struct cfq_queue *new_cfqq;
		new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
					 GFP_ATOMIC);
		if (new_cfqq) {
			cic->cfqq[ASYNC] = new_cfqq;
			cfq_put_queue(cfqq);
		}
	}

	cfqq = cic->cfqq[SYNC];
	if (cfqq)
		cfq_mark_cfqq_prio_changed(cfqq);

	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

static void cfq_ioc_set_ioprio(struct io_context *ioc)
{
	struct cfq_io_context *cic;
	struct rb_node *n;

	ioc->ioprio_changed = 0;

	n = rb_first(&ioc->cic_root);
	while (n != NULL) {
		cic = rb_entry(n, struct cfq_io_context, rb_node);

		changed_ioprio(cic);
		n = rb_next(n);
	}
}

static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
	      gfp_t gfp_mask)
{
	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
	struct cfq_queue *cfqq, *new_cfqq = NULL;
	unsigned short ioprio;

retry:
	ioprio = tsk->ioprio;
	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);

	if (!cfqq) {
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
			/*
			 * Inform the allocator of the fact that we will
			 * just repeat this allocation if it fails, to allow
			 * the allocator to do whatever it needs to attempt to
			 * free memory.
			 */
			spin_unlock_irq(cfqd->queue->queue_lock);
			new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node);
			spin_lock_irq(cfqd->queue->queue_lock);
			goto retry;
		} else {
			cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node);
			if (!cfqq)
				goto out;
		}

		memset(cfqq, 0, sizeof(*cfqq));

		INIT_HLIST_NODE(&cfqq->cfq_hash);
		RB_CLEAR_NODE(&cfqq->rb_node);
		INIT_LIST_HEAD(&cfqq->fifo);

		cfqq->key = key;
		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
		atomic_set(&cfqq->ref, 0);
		cfqq->cfqd = cfqd;

		if (key != CFQ_KEY_ASYNC)
			cfq_mark_cfqq_idle_window(cfqq);

		cfq_mark_cfqq_prio_changed(cfqq);
		cfq_mark_cfqq_queue_new(cfqq);
		cfq_init_prio_data(cfqq);
	}

	if (new_cfqq)
		kmem_cache_free(cfq_pool, new_cfqq);

	atomic_inc(&cfqq->ref);
out:
	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
	return cfqq;
}

/*
 * We drop cfq io contexts lazily, so we may find a dead one.
 */
static void
cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
{
	WARN_ON(!list_empty(&cic->queue_list));
	rb_erase(&cic->rb_node, &ioc->cic_root);
	kmem_cache_free(cfq_ioc_pool, cic);
	elv_ioc_count_dec(ioc_count);
}

static struct cfq_io_context *
cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct rb_node *n;
	struct cfq_io_context *cic;
	void *k, *key = cfqd;

restart:
	n = ioc->cic_root.rb_node;
	while (n) {
		cic = rb_entry(n, struct cfq_io_context, rb_node);
		/* ->key must be copied to avoid race with cfq_exit_queue() */
		k = cic->key;
		if (unlikely(!k)) {
			cfq_drop_dead_cic(ioc, cic);
			goto restart;
		}

		if (key < k)
			n = n->rb_left;
		else if (key > k)
			n = n->rb_right;
		else
			return cic;
	}

	return NULL;
}

static inline void
cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
	     struct cfq_io_context *cic)
{
	struct rb_node **p;
	struct rb_node *parent;
	struct cfq_io_context *__cic;
	unsigned long flags;
	void *k;

	cic->ioc = ioc;
	cic->key = cfqd;

restart:
	parent = NULL;
	p = &ioc->cic_root.rb_node;
	while (*p) {
		parent = *p;
		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
		/* ->key must be copied to avoid race with cfq_exit_queue() */
		k = __cic->key;
		if (unlikely(!k)) {
			cfq_drop_dead_cic(ioc, __cic);
			goto restart;
		}

		if (cic->key < k)
			p = &(*p)->rb_left;
		else if (cic->key > k)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&cic->rb_node, parent, p);
	rb_insert_color(&cic->rb_node, &ioc->cic_root);

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
	list_add(&cic->queue_list, &cfqd->cic_list);
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

/*
 * Setup general io context and cfq io context. There can be several cfq
 * io contexts per general io context, if this process is doing io to more
 * than one device managed by cfq.
 */
static struct cfq_io_context *
cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
{
	struct io_context *ioc = NULL;
	struct cfq_io_context *cic;

	might_sleep_if(gfp_mask & __GFP_WAIT);

	ioc = get_io_context(gfp_mask, cfqd->queue->node);
	if (!ioc)
		return NULL;

	cic = cfq_cic_rb_lookup(cfqd, ioc);
	if (cic)
		goto out;

	cic = cfq_alloc_io_context(cfqd, gfp_mask);
	if (cic == NULL)
		goto err;

	cfq_cic_link(cfqd, ioc, cic);
out:
	smp_read_barrier_depends();
	if (unlikely(ioc->ioprio_changed))
		cfq_ioc_set_ioprio(ioc);

	return cic;
err:
	put_io_context(ioc);
	return NULL;
}

static void
cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
{
	unsigned long elapsed = jiffies - cic->last_end_request;
	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);

	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
}

static void
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
		       struct request *rq)
{
	sector_t sdist;
	u64 total;

	if (cic->last_request_pos < rq->sector)
		sdist = rq->sector - cic->last_request_pos;
	else
		sdist = cic->last_request_pos - rq->sector;

	if (!cic->seek_samples) {
		cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
		cfqd->new_seek_mean = cfqd->new_seek_total / 256;
	}

	/*
	 * Don't allow the seek distance to get too large from the
	 * odd fragment, pagein, etc
	 */
	if (cic->seek_samples <= 60) /* second&third seek */
		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
	else
		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);

	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
	total = cic->seek_total + (cic->seek_samples/2);
	do_div(total, cic->seek_samples);
	cic->seek_mean = (sector_t)total;
}

/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter
 */
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct cfq_io_context *cic)
{
	int enable_idle;

	if (!cfq_cfqq_sync(cfqq))
		return;

	enable_idle = cfq_cfqq_idle_window(cfqq);

	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
	    (cfqd->hw_tag && CIC_SEEKY(cic)))
		enable_idle = 0;
	else if (sample_valid(cic->ttime_samples)) {
		if (cic->ttime_mean > cfqd->cfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}

	if (enable_idle)
		cfq_mark_cfqq_idle_window(cfqq);
	else
		cfq_clear_cfqq_idle_window(cfqq);
}

/*
 * Check if new_cfqq should preempt the currently active queue. Return 0 for
 * no or if we aren't sure, a 1 will cause a preempt.
 */
static int
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
		   struct request *rq)
{
	struct cfq_queue *cfqq;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		return 0;

	if (cfq_slice_used(cfqq))
		return 1;

	if (cfq_class_idle(new_cfqq))
		return 0;

	if (cfq_class_idle(cfqq))
		return 1;

	/*
	 * if the new request is sync, but the currently running queue is
	 * not, let the sync request have priority.
	 */
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
		return 1;

	/*
	 * So both queues are sync. Let the new request get disk time if
	 * it's a metadata request and the current queue is doing regular IO.
	 */
	if (rq_is_meta(rq) && !cfqq->meta_pending)
		return 1;

	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
		return 0;

	/*
	 * if this request is as-good as one we would expect from the
	 * current cfqq, let it preempt
	 */
	if (cfq_rq_close(cfqd, rq))
		return 1;

	return 0;
}

/*
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_slice_expired(cfqd, 1);

	/*
	 * Put the new queue at the front of the current list,
	 * so we know that it will be selected next.
	 */
	BUG_ON(!cfq_cfqq_on_rr(cfqq));

	cfq_service_tree_add(cfqd, cfqq, 1);

	cfqq->slice_end = 0;
	cfq_mark_cfqq_slice_new(cfqq);
}

/*
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
 * something we should do about it
 */
static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
{
	struct cfq_io_context *cic = RQ_CIC(rq);

	if (rq_is_meta(rq))
		cfqq->meta_pending++;

	cfq_update_io_thinktime(cfqd, cic);
	cfq_update_io_seektime(cfqd, cic, rq);
	cfq_update_idle_window(cfqd, cfqq, cic);

	cic->last_request_pos = rq->sector + rq->nr_sectors;
	cfqq->last_request_pos = cic->last_request_pos;

	if (cfqq == cfqd->active_queue) {
		/*
		 * if we are waiting for a request for this queue, let it rip
		 * immediately and flag that we must not expire this queue
		 * just now
		 */
		if (cfq_cfqq_wait_request(cfqq)) {
			cfq_mark_cfqq_must_dispatch(cfqq);
			del_timer(&cfqd->idle_slice_timer);
			blk_start_queueing(cfqd->queue);
		}
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired its mean thinktime or this new queue
		 * has some old slice time left and is of higher priority
		 */
		cfq_preempt_queue(cfqd, cfqq);
		cfq_mark_cfqq_must_dispatch(cfqq);
		blk_start_queueing(cfqd->queue);
	}
}

static void cfq_insert_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_init_prio_data(cfqq);

	cfq_add_rq_rb(rq);

	list_add_tail(&rq->queuelist, &cfqq->fifo);

	cfq_rq_enqueued(cfqd, cfqq, rq);
}

static void cfq_completed_request(request_queue_t *q, struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	const int sync = rq_is_sync(rq);
	unsigned long now;

	now = jiffies;

	WARN_ON(!cfqd->rq_in_driver);
	WARN_ON(!cfqq->dispatched);
	cfqd->rq_in_driver--;
	cfqq->dispatched--;

	if (cfq_cfqq_sync(cfqq))
		cfqd->sync_flight--;

	if (!cfq_class_idle(cfqq))
		cfqd->last_end_request = now;

	if (sync)
		RQ_CIC(rq)->last_end_request = now;

	/*
	 * If this is the active queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (cfqd->active_queue == cfqq) {
		if (cfq_cfqq_slice_new(cfqq)) {
			cfq_set_prio_slice(cfqd, cfqq);
			cfq_clear_cfqq_slice_new(cfqq);
		}
		if (cfq_slice_used(cfqq))
			cfq_slice_expired(cfqd, 1);
		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
			cfq_arm_slice_timer(cfqd);
	}

	if (!cfqd->rq_in_driver)
		cfq_schedule_dispatch(cfqd);
}

/*
 * we temporarily boost lower priority queues if they are holding fs exclusive
 * resources. they are boosted to normal prio (CLASS_BE/4)
 */
static void cfq_prio_boost(struct cfq_queue *cfqq)
{
	if (has_fs_excl()) {
		/*
		 * boost idle prio on transactions that would lock out other
		 * users of the filesystem
		 */
		if (cfq_class_idle(cfqq))
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
		if (cfqq->ioprio > IOPRIO_NORM)
			cfqq->ioprio = IOPRIO_NORM;
	} else {
		/*
		 * check if we need to unboost the queue
		 */
		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
			cfqq->ioprio_class = cfqq->org_ioprio_class;
		if (cfqq->ioprio != cfqq->org_ioprio)
			cfqq->ioprio = cfqq->org_ioprio;
	}
}

static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
	    !cfq_cfqq_must_alloc_slice(cfqq)) {
		cfq_mark_cfqq_must_alloc_slice(cfqq);
		return ELV_MQUEUE_MUST;
	}

	return ELV_MQUEUE_MAY;
}

static int cfq_may_queue(request_queue_t *q, int rw)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct cfq_queue *cfqq;
	unsigned int key;

	key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC);

	/*
	 * don't force setup of a queue from here, as a call to may_queue
	 * does not necessarily imply that a request actually will be queued.
	 * so just lookup a possibly existing queue, or return 'may queue'
	 * if that fails
	 */
	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
	if (cfqq) {
		cfq_init_prio_data(cfqq);
		cfq_prio_boost(cfqq);

		return __cfq_may_queue(cfqq);
	}

	return ELV_MQUEUE_MAY;
}

/*
 * queue lock held here
 */
static void cfq_put_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq) {
		const int rw = rq_data_dir(rq);

		BUG_ON(!cfqq->allocated[rw]);
		cfqq->allocated[rw]--;

		put_io_context(RQ_CIC(rq)->ioc);

		rq->elevator_private = NULL;
		rq->elevator_private2 = NULL;

		cfq_put_queue(cfqq);
	}
}

/*
 * Allocate cfq data structures associated with this request.
 */
static int
cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct cfq_io_context *cic;
	const int rw = rq_data_dir(rq);
	const int is_sync = rq_is_sync(rq);
	pid_t key = cfq_queue_pid(tsk, rw, is_sync);
	struct cfq_queue *cfqq;
	unsigned long flags;

	might_sleep_if(gfp_mask & __GFP_WAIT);

	cic = cfq_get_io_context(cfqd, gfp_mask);

	spin_lock_irqsave(q->queue_lock, flags);

	if (!cic)
		goto queue_fail;

	if (!cic->cfqq[is_sync]) {
		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
		if (!cfqq)
			goto queue_fail;

		cic->cfqq[is_sync] = cfqq;
	} else
		cfqq = cic->cfqq[is_sync];

	cfqq->allocated[rw]++;
	cfq_clear_cfqq_must_alloc(cfqq);
	atomic_inc(&cfqq->ref);

	spin_unlock_irqrestore(q->queue_lock, flags);

	rq->elevator_private = cic;
	rq->elevator_private2 = cfqq;
	return 0;

queue_fail:
	if (cic)
		put_io_context(cic->ioc);

	cfq_schedule_dispatch(cfqd);
	spin_unlock_irqrestore(q->queue_lock, flags);
	return 1;
}

static void cfq_kick_queue(struct work_struct *work)
{
	struct cfq_data *cfqd =
		container_of(work, struct cfq_data, unplug_work);
	request_queue_t *q = cfqd->queue;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_start_queueing(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

/*
 * Timer running if the active_queue is currently idling inside its time slice
 */
static void cfq_idle_slice_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;
	struct cfq_queue *cfqq;
	unsigned long flags;
	int timed_out = 1;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	if ((cfqq = cfqd->active_queue) != NULL) {
		timed_out = 0;

		/*
		 * expired
		 */
		if (cfq_slice_used(cfqq))
			goto expire;

		/*
		 * only expire and reinvoke request handler, if there are
		 * other queues with pending requests
		 */
		if (!cfqd->busy_queues)
			goto out_cont;

		/*
		 * not expired and it has a request pending, let it dispatch
		 */
		if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
			cfq_mark_cfqq_must_dispatch(cfqq);
			goto out_kick;
		}
	}
expire:
	cfq_slice_expired(cfqd, timed_out);
out_kick:
	cfq_schedule_dispatch(cfqd);
out_cont:
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

/*
 * Timer running if an idle class queue is waiting for service
 */
static void cfq_idle_class_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;
	unsigned long flags, end;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	/*
	 * race with a non-idle queue, reset timer
	 */
	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
	if (!time_after_eq(jiffies, end))
		mod_timer(&cfqd->idle_class_timer, end);
	else
		cfq_schedule_dispatch(cfqd);

	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
{
	del_timer_sync(&cfqd->idle_slice_timer);
	del_timer_sync(&cfqd->idle_class_timer);
	blk_sync_queue(cfqd->queue);
}

static void cfq_exit_queue(elevator_t *e)
{
	struct cfq_data *cfqd = e->elevator_data;
	request_queue_t *q = cfqd->queue;

	cfq_shutdown_timer_wq(cfqd);

	spin_lock_irq(q->queue_lock);

	if (cfqd->active_queue)
		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

	while (!list_empty(&cfqd->cic_list)) {
		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
							struct cfq_io_context,
							queue_list);

		__cfq_exit_single_io_context(cfqd, cic);
	}

	spin_unlock_irq(q->queue_lock);

	cfq_shutdown_timer_wq(cfqd);

	kfree(cfqd->cfq_hash);
	kfree(cfqd);
}

static void *cfq_init_queue(request_queue_t *q)
{
	struct cfq_data *cfqd;
	int i;

	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
	if (!cfqd)
		return NULL;

	memset(cfqd, 0, sizeof(*cfqd));

	cfqd->service_tree = CFQ_RB_ROOT;
	INIT_LIST_HEAD(&cfqd->cic_list);

	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
	if (!cfqd->cfq_hash)
		goto out_free;

	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);

	cfqd->queue = q;

	init_timer(&cfqd->idle_slice_timer);
	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
	cfqd->idle_slice_timer.data = (unsigned long) cfqd;

	init_timer(&cfqd->idle_class_timer);
	cfqd->idle_class_timer.function = cfq_idle_class_timer;
	cfqd->idle_class_timer.data = (unsigned long) cfqd;

	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
	cfqd->cfq_back_max = cfq_back_max;
	cfqd->cfq_back_penalty = cfq_back_penalty;
	cfqd->cfq_slice[0] = cfq_slice_async;
	cfqd->cfq_slice[1] = cfq_slice_sync;
	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
	cfqd->cfq_slice_idle = cfq_slice_idle;

	return cfqd;
out_free:
	kfree(cfqd);
	return NULL;
}

static void cfq_slab_kill(void)
{
	if (cfq_pool)
		kmem_cache_destroy(cfq_pool);
	if (cfq_ioc_pool)
		kmem_cache_destroy(cfq_ioc_pool);
}

static int __init cfq_slab_setup(void)
{
	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
					NULL, NULL);
	if (!cfq_pool)
		goto fail;

	cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
			sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
	if (!cfq_ioc_pool)
		goto fail;

	return 0;
fail:
	cfq_slab_kill();
	return -ENOMEM;
}

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(elevator_t *e, char *page)			\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data = __VAR;					\
	if (__CONV)							\
		__data = jiffies_to_msecs(__data);			\
	return cfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data;						\
	int ret = cfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV)							\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	__ATTR_NULL
};

static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn = cfq_merge,
		.elevator_merged_fn = cfq_merged_request,
		.elevator_merge_req_fn = cfq_merged_requests,
		.elevator_allow_merge_fn = cfq_allow_merge,
		.elevator_dispatch_fn = cfq_dispatch_requests,
		.elevator_add_req_fn = cfq_insert_request,
		.elevator_activate_req_fn = cfq_activate_request,
		.elevator_deactivate_req_fn = cfq_deactivate_request,
		.elevator_queue_empty_fn = cfq_queue_empty,
		.elevator_completed_req_fn = cfq_completed_request,
		.elevator_former_req_fn = elv_rb_former_request,
		.elevator_latter_req_fn = elv_rb_latter_request,
		.elevator_set_req_fn = cfq_set_request,
		.elevator_put_req_fn = cfq_put_request,
		.elevator_may_queue_fn = cfq_may_queue,
		.elevator_init_fn = cfq_init_queue,
		.elevator_exit_fn = cfq_exit_queue,
		.trim = cfq_free_io_context,
	},
	.elevator_attrs = cfq_attrs,
	.elevator_name = "cfq",
	.elevator_owner = THIS_MODULE,
};

static int __init cfq_init(void)
{
	int ret;

	/*
	 * could be 0 on HZ < 1000 setups
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

	if (cfq_slab_setup())
		return -ENOMEM;

	ret = elv_register(&iosched_cfq);
	if (ret)
		cfq_slab_kill();

	return ret;
}

static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();
	if (elv_ioc_count_read(ioc_count))
		wait_for_completion(ioc_gone);
	synchronize_rcu();
	cfq_slab_kill();
}

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");