cfq-iosched.c revision 573412b29586e58477adb70e022193a337763319
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/blkdev.h> 11#include <linux/elevator.h> 12#include <linux/jiffies.h> 13#include <linux/rbtree.h> 14#include <linux/ioprio.h> 15#include <linux/blktrace_api.h> 16#include "blk-cgroup.h" 17 18/* 19 * tunables 20 */ 21/* max queue in one round of service */ 22static const int cfq_quantum = 4; 23static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 24/* maximum backwards seek, in KiB */ 25static const int cfq_back_max = 16 * 1024; 26/* penalty of a backwards seek */ 27static const int cfq_back_penalty = 2; 28static const int cfq_slice_sync = HZ / 10; 29static int cfq_slice_async = HZ / 25; 30static const int cfq_slice_async_rq = 2; 31static int cfq_slice_idle = HZ / 125; 32static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 33static const int cfq_hist_divisor = 4; 34 35/* 36 * offset from end of service tree 37 */ 38#define CFQ_IDLE_DELAY (HZ / 5) 39 40/* 41 * below this threshold, we consider thinktime immediate 42 */ 43#define CFQ_MIN_TT (2) 44 45/* 46 * Allow merged cfqqs to perform this amount of seeky I/O before 47 * deciding to break the queues up again. 48 */ 49#define CFQQ_COOP_TOUT (HZ) 50 51#define CFQ_SLICE_SCALE (5) 52#define CFQ_HW_QUEUE_MIN (5) 53#define CFQ_SERVICE_SHIFT 12 54 55#define RQ_CIC(rq) \ 56 ((struct cfq_io_context *) (rq)->elevator_private) 57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 58 59static struct kmem_cache *cfq_pool; 60static struct kmem_cache *cfq_ioc_pool; 61 62static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 63static struct completion *ioc_gone; 64static DEFINE_SPINLOCK(ioc_gone_lock); 65 66#define CFQ_PRIO_LISTS IOPRIO_BE_NR 67#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 68#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 69 70#define sample_valid(samples) ((samples) > 80) 71#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 72 73/* 74 * Most of our rbtree usage is for sorting with min extraction, so 75 * if we cache the leftmost node we don't have to walk down the tree 76 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 77 * move this into the elevator for the rq sorting as well. 
78 */ 79struct cfq_rb_root { 80 struct rb_root rb; 81 struct rb_node *left; 82 unsigned count; 83 u64 min_vdisktime; 84 struct rb_node *active; 85 unsigned total_weight; 86}; 87#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } 88 89/* 90 * Per process-grouping structure 91 */ 92struct cfq_queue { 93 /* reference count */ 94 atomic_t ref; 95 /* various state flags, see below */ 96 unsigned int flags; 97 /* parent cfq_data */ 98 struct cfq_data *cfqd; 99 /* service_tree member */ 100 struct rb_node rb_node; 101 /* service_tree key */ 102 unsigned long rb_key; 103 /* prio tree member */ 104 struct rb_node p_node; 105 /* prio tree root we belong to, if any */ 106 struct rb_root *p_root; 107 /* sorted list of pending requests */ 108 struct rb_root sort_list; 109 /* if fifo isn't expired, next request to serve */ 110 struct request *next_rq; 111 /* requests queued in sort_list */ 112 int queued[2]; 113 /* currently allocated requests */ 114 int allocated[2]; 115 /* fifo list of requests in sort_list */ 116 struct list_head fifo; 117 118 /* time when queue got scheduled in to dispatch first request. */ 119 unsigned long dispatch_start; 120 unsigned int allocated_slice; 121 /* time when first request from queue completed and slice started. */ 122 unsigned long slice_start; 123 unsigned long slice_end; 124 long slice_resid; 125 unsigned int slice_dispatch; 126 127 /* pending metadata requests */ 128 int meta_pending; 129 /* number of requests that are on the dispatch list or inside driver */ 130 int dispatched; 131 132 /* io prio of this group */ 133 unsigned short ioprio, org_ioprio; 134 unsigned short ioprio_class, org_ioprio_class; 135 136 unsigned int seek_samples; 137 u64 seek_total; 138 sector_t seek_mean; 139 sector_t last_request_pos; 140 unsigned long seeky_start; 141 142 pid_t pid; 143 144 struct cfq_rb_root *service_tree; 145 struct cfq_queue *new_cfqq; 146 struct cfq_group *cfqg; 147 struct cfq_group *orig_cfqg; 148 /* Sectors dispatched in current dispatch round */ 149 unsigned long nr_sectors; 150}; 151 152/* 153 * First index in the service_trees. 154 * IDLE is handled separately, so it has negative index 155 */ 156enum wl_prio_t { 157 BE_WORKLOAD = 0, 158 RT_WORKLOAD = 1, 159 IDLE_WORKLOAD = 2, 160}; 161 162/* 163 * Second index in the service_trees. 164 */ 165enum wl_type_t { 166 ASYNC_WORKLOAD = 0, 167 SYNC_NOIDLE_WORKLOAD = 1, 168 SYNC_WORKLOAD = 2 169}; 170 171/* This is per cgroup per device grouping structure */ 172struct cfq_group { 173 /* group service_tree member */ 174 struct rb_node rb_node; 175 176 /* group service_tree key */ 177 u64 vdisktime; 178 unsigned int weight; 179 bool on_st; 180 181 /* number of cfqq currently on this group */ 182 int nr_cfqq; 183 184 /* Per group busy queues average. Useful for workload slice calc. */ 185 unsigned int busy_queues_avg[2]; 186 /* 187 * rr lists of queues with requests, one rr for each priority class.
188 * Counts are embedded in the cfq_rb_root 189 */ 190 struct cfq_rb_root service_trees[2][3]; 191 struct cfq_rb_root service_tree_idle; 192 193 unsigned long saved_workload_slice; 194 enum wl_type_t saved_workload; 195 enum wl_prio_t saved_serving_prio; 196 struct blkio_group blkg; 197#ifdef CONFIG_CFQ_GROUP_IOSCHED 198 struct hlist_node cfqd_node; 199 atomic_t ref; 200#endif 201}; 202 203/* 204 * Per block device queue structure 205 */ 206struct cfq_data { 207 struct request_queue *queue; 208 /* Root service tree for cfq_groups */ 209 struct cfq_rb_root grp_service_tree; 210 struct cfq_group root_group; 211 /* Number of active cfq groups on group service tree */ 212 int nr_groups; 213 214 /* 215 * The priority currently being served 216 */ 217 enum wl_prio_t serving_prio; 218 enum wl_type_t serving_type; 219 unsigned long workload_expires; 220 struct cfq_group *serving_group; 221 bool noidle_tree_requires_idle; 222 223 /* 224 * Each priority tree is sorted by next_request position. These 225 * trees are used when determining if two or more queues are 226 * interleaving requests (see cfq_close_cooperator). 227 */ 228 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 229 230 unsigned int busy_queues; 231 232 int rq_in_driver[2]; 233 int sync_flight; 234 235 /* 236 * queue-depth detection 237 */ 238 int rq_queued; 239 int hw_tag; 240 /* 241 * hw_tag can be 242 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 243 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 244 * 0 => no NCQ 245 */ 246 int hw_tag_est_depth; 247 unsigned int hw_tag_samples; 248 249 /* 250 * idle window management 251 */ 252 struct timer_list idle_slice_timer; 253 struct work_struct unplug_work; 254 255 struct cfq_queue *active_queue; 256 struct cfq_io_context *active_cic; 257 258 /* 259 * async queue for each priority case 260 */ 261 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 262 struct cfq_queue *async_idle_cfqq; 263 264 sector_t last_position; 265 266 /* 267 * tunables, see top of file 268 */ 269 unsigned int cfq_quantum; 270 unsigned int cfq_fifo_expire[2]; 271 unsigned int cfq_back_penalty; 272 unsigned int cfq_back_max; 273 unsigned int cfq_slice[2]; 274 unsigned int cfq_slice_async_rq; 275 unsigned int cfq_slice_idle; 276 unsigned int cfq_latency; 277 unsigned int cfq_group_isolation; 278 279 struct list_head cic_list; 280 281 /* 282 * Fallback dummy cfqq for extreme OOM conditions 283 */ 284 struct cfq_queue oom_cfqq; 285 286 unsigned long last_delayed_sync; 287 288 /* List of cfq groups being managed on this device*/ 289 struct hlist_head cfqg_list; 290 struct rcu_head rcu; 291}; 292 293static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 294 295static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 296 enum wl_prio_t prio, 297 enum wl_type_t type, 298 struct cfq_data *cfqd) 299{ 300 if (!cfqg) 301 return NULL; 302 303 if (prio == IDLE_WORKLOAD) 304 return &cfqg->service_tree_idle; 305 306 return &cfqg->service_trees[prio][type]; 307} 308 309enum cfqq_state_flags { 310 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 311 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 312 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 313 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 314 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 315 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 316 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 317 CFQ_CFQQ_FLAG_slice_new, /* no 
requests dispatched in slice */ 318 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 319 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 320 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 321 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 322 CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */ 323}; 324 325#define CFQ_CFQQ_FNS(name) \ 326static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 327{ \ 328 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 329} \ 330static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 331{ \ 332 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 333} \ 334static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 335{ \ 336 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 337} 338 339CFQ_CFQQ_FNS(on_rr); 340CFQ_CFQQ_FNS(wait_request); 341CFQ_CFQQ_FNS(must_dispatch); 342CFQ_CFQQ_FNS(must_alloc_slice); 343CFQ_CFQQ_FNS(fifo_expire); 344CFQ_CFQQ_FNS(idle_window); 345CFQ_CFQQ_FNS(prio_changed); 346CFQ_CFQQ_FNS(slice_new); 347CFQ_CFQQ_FNS(sync); 348CFQ_CFQQ_FNS(coop); 349CFQ_CFQQ_FNS(deep); 350CFQ_CFQQ_FNS(wait_busy); 351CFQ_CFQQ_FNS(wait_busy_done); 352#undef CFQ_CFQQ_FNS 353 354#ifdef CONFIG_DEBUG_CFQ_IOSCHED 355#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 356 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 357 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 358 blkg_path(&(cfqq)->cfqg->blkg), ##args); 359 360#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 361 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 362 blkg_path(&(cfqg)->blkg), ##args); \ 363 364#else 365#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 366 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 367#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 368#endif 369#define cfq_log(cfqd, fmt, args...) \ 370 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 371 372/* Traverses through cfq group service trees */ 373#define for_each_cfqg_st(cfqg, i, j, st) \ 374 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 375 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 376 : &cfqg->service_tree_idle; \ 377 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 378 (i == IDLE_WORKLOAD && j == 0); \ 379 j++, st = i < IDLE_WORKLOAD ? 
\ 380 &cfqg->service_trees[i][j]: NULL) \ 381 382 383static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 384{ 385 if (cfq_class_idle(cfqq)) 386 return IDLE_WORKLOAD; 387 if (cfq_class_rt(cfqq)) 388 return RT_WORKLOAD; 389 return BE_WORKLOAD; 390} 391 392 393static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 394{ 395 if (!cfq_cfqq_sync(cfqq)) 396 return ASYNC_WORKLOAD; 397 if (!cfq_cfqq_idle_window(cfqq)) 398 return SYNC_NOIDLE_WORKLOAD; 399 return SYNC_WORKLOAD; 400} 401 402static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 403 struct cfq_data *cfqd, 404 struct cfq_group *cfqg) 405{ 406 if (wl == IDLE_WORKLOAD) 407 return cfqg->service_tree_idle.count; 408 409 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 410 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 411 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 412} 413 414static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 415 struct cfq_group *cfqg) 416{ 417 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 418 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 419} 420 421static void cfq_dispatch_insert(struct request_queue *, struct request *); 422static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 423 struct io_context *, gfp_t); 424static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 425 struct io_context *); 426 427static inline int rq_in_driver(struct cfq_data *cfqd) 428{ 429 return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; 430} 431 432static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 433 bool is_sync) 434{ 435 return cic->cfqq[is_sync]; 436} 437 438static inline void cic_set_cfqq(struct cfq_io_context *cic, 439 struct cfq_queue *cfqq, bool is_sync) 440{ 441 cic->cfqq[is_sync] = cfqq; 442} 443 444/* 445 * We regard a request as SYNC, if it's either a read or has the SYNC bit 446 * set (in which case it could also be direct WRITE). 447 */ 448static inline bool cfq_bio_sync(struct bio *bio) 449{ 450 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); 451} 452 453/* 454 * scheduler run of queue, if there are requests pending and no one in the 455 * driver that will restart queueing 456 */ 457static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 458{ 459 if (cfqd->busy_queues) { 460 cfq_log(cfqd, "schedule dispatch"); 461 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 462 } 463} 464 465static int cfq_queue_empty(struct request_queue *q) 466{ 467 struct cfq_data *cfqd = q->elevator->elevator_data; 468 469 return !cfqd->rq_queued; 470} 471 472/* 473 * Scale schedule slice based on io priority. Use the sync time slice only 474 * if a queue is marked sync and has sync io queued. A sync queue with async 475 * io only, should not get full sync slice length. 
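 * For example (illustrative only, assuming HZ=1000 so that cfq_slice_sync is 100 jiffies): with CFQ_SLICE_SCALE = 5, a prio 0 sync queue gets 100 + (100/5) * 4 = 180 jiffies, a prio 4 queue gets the base 100 jiffies, and a prio 7 queue gets 100 + (100/5) * -3 = 40 jiffies, i.e. each priority step is worth one fifth of the base slice.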
476 */ 477static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 478 unsigned short prio) 479{ 480 const int base_slice = cfqd->cfq_slice[sync]; 481 482 WARN_ON(prio >= IOPRIO_BE_NR); 483 484 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 485} 486 487static inline int 488cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 489{ 490 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 491} 492 493static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 494{ 495 u64 d = delta << CFQ_SERVICE_SHIFT; 496 497 d = d * BLKIO_WEIGHT_DEFAULT; 498 do_div(d, cfqg->weight); 499 return d; 500} 501 502static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 503{ 504 s64 delta = (s64)(vdisktime - min_vdisktime); 505 if (delta > 0) 506 min_vdisktime = vdisktime; 507 508 return min_vdisktime; 509} 510 511static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 512{ 513 s64 delta = (s64)(vdisktime - min_vdisktime); 514 if (delta < 0) 515 min_vdisktime = vdisktime; 516 517 return min_vdisktime; 518} 519 520static void update_min_vdisktime(struct cfq_rb_root *st) 521{ 522 u64 vdisktime = st->min_vdisktime; 523 struct cfq_group *cfqg; 524 525 if (st->active) { 526 cfqg = rb_entry_cfqg(st->active); 527 vdisktime = cfqg->vdisktime; 528 } 529 530 if (st->left) { 531 cfqg = rb_entry_cfqg(st->left); 532 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 533 } 534 535 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 536} 537 538/* 539 * get averaged number of queues of RT/BE priority. 540 * average is updated, with a formula that gives more weight to higher numbers, 541 * to quickly follows sudden increases and decrease slowly 542 */ 543 544static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 545 struct cfq_group *cfqg, bool rt) 546{ 547 unsigned min_q, max_q; 548 unsigned mult = cfq_hist_divisor - 1; 549 unsigned round = cfq_hist_divisor / 2; 550 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 551 552 min_q = min(cfqg->busy_queues_avg[rt], busy); 553 max_q = max(cfqg->busy_queues_avg[rt], busy); 554 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 555 cfq_hist_divisor; 556 return cfqg->busy_queues_avg[rt]; 557} 558 559static inline unsigned 560cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 561{ 562 struct cfq_rb_root *st = &cfqd->grp_service_tree; 563 564 return cfq_target_latency * cfqg->weight / st->total_weight; 565} 566 567static inline void 568cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 569{ 570 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 571 if (cfqd->cfq_latency) { 572 /* 573 * interested queues (we consider only the ones with the same 574 * priority class in the cfq group) 575 */ 576 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 577 cfq_class_rt(cfqq)); 578 unsigned sync_slice = cfqd->cfq_slice[1]; 579 unsigned expect_latency = sync_slice * iq; 580 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 581 582 if (expect_latency > group_slice) { 583 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 584 /* scale low_slice according to IO priority 585 * and sync vs async */ 586 unsigned low_slice = 587 min(slice, base_low_slice * slice / sync_slice); 588 /* the adapted slice value is scaled to fit all iqs 589 * into the target latency */ 590 slice = max(slice * group_slice / expect_latency, 591 low_slice); 592 } 593 } 594 cfqq->slice_start = jiffies; 595 cfqq->slice_end = jiffies + slice; 596 
cfqq->allocated_slice = slice; 597 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 598} 599 600/* 601 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 602 * isn't valid until the first request from the dispatch is activated 603 * and the slice time set. 604 */ 605static inline bool cfq_slice_used(struct cfq_queue *cfqq) 606{ 607 if (cfq_cfqq_slice_new(cfqq)) 608 return 0; 609 if (time_before(jiffies, cfqq->slice_end)) 610 return 0; 611 612 return 1; 613} 614 615/* 616 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 617 * We choose the request that is closest to the head right now. Distance 618 * behind the head is penalized and only allowed to a certain extent. 619 */ 620static struct request * 621cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 622{ 623 sector_t s1, s2, d1 = 0, d2 = 0; 624 unsigned long back_max; 625#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 626#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 627 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 628 629 if (rq1 == NULL || rq1 == rq2) 630 return rq2; 631 if (rq2 == NULL) 632 return rq1; 633 634 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 635 return rq1; 636 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 637 return rq2; 638 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 639 return rq1; 640 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 641 return rq2; 642 643 s1 = blk_rq_pos(rq1); 644 s2 = blk_rq_pos(rq2); 645 646 /* 647 * by definition, 1KiB is 2 sectors 648 */ 649 back_max = cfqd->cfq_back_max * 2; 650 651 /* 652 * Strict one way elevator _except_ in the case where we allow 653 * short backward seeks which are biased as twice the cost of a 654 * similar forward seek. 655 */ 656 if (s1 >= last) 657 d1 = s1 - last; 658 else if (s1 + back_max >= last) 659 d1 = (last - s1) * cfqd->cfq_back_penalty; 660 else 661 wrap |= CFQ_RQ1_WRAP; 662 663 if (s2 >= last) 664 d2 = s2 - last; 665 else if (s2 + back_max >= last) 666 d2 = (last - s2) * cfqd->cfq_back_penalty; 667 else 668 wrap |= CFQ_RQ2_WRAP; 669 670 /* Found required data */ 671 672 /* 673 * By doing switch() on the bit mask "wrap" we avoid having to 674 * check two variables for all permutations: --> faster! 675 */ 676 switch (wrap) { 677 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 678 if (d1 < d2) 679 return rq1; 680 else if (d2 < d1) 681 return rq2; 682 else { 683 if (s1 >= s2) 684 return rq1; 685 else 686 return rq2; 687 } 688 689 case CFQ_RQ2_WRAP: 690 return rq1; 691 case CFQ_RQ1_WRAP: 692 return rq2; 693 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 694 default: 695 /* 696 * Since both rqs are wrapped, 697 * start with the one that's further behind head 698 * (--> only *one* back seek required), 699 * since back seek takes more time than forward. 
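 * (Illustrative numbers, assuming the default back_max of 16384 KiB, i.e. 32768 sectors: with the head at sector 100000 and requests at s1 = 20000 and s2 = 60000, both count as wrapped; rq1 at 20000 is served first, so the remaining rq2 at 60000 is then reached with a forward seek.)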
700 */ 701 if (s1 <= s2) 702 return rq1; 703 else 704 return rq2; 705 } 706} 707 708/* 709 * The below is leftmost cache rbtree addon 710 */ 711static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 712{ 713 /* Service tree is empty */ 714 if (!root->count) 715 return NULL; 716 717 if (!root->left) 718 root->left = rb_first(&root->rb); 719 720 if (root->left) 721 return rb_entry(root->left, struct cfq_queue, rb_node); 722 723 return NULL; 724} 725 726static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 727{ 728 if (!root->left) 729 root->left = rb_first(&root->rb); 730 731 if (root->left) 732 return rb_entry_cfqg(root->left); 733 734 return NULL; 735} 736 737static void rb_erase_init(struct rb_node *n, struct rb_root *root) 738{ 739 rb_erase(n, root); 740 RB_CLEAR_NODE(n); 741} 742 743static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 744{ 745 if (root->left == n) 746 root->left = NULL; 747 rb_erase_init(n, &root->rb); 748 --root->count; 749} 750 751/* 752 * would be nice to take fifo expire time into account as well 753 */ 754static struct request * 755cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 756 struct request *last) 757{ 758 struct rb_node *rbnext = rb_next(&last->rb_node); 759 struct rb_node *rbprev = rb_prev(&last->rb_node); 760 struct request *next = NULL, *prev = NULL; 761 762 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 763 764 if (rbprev) 765 prev = rb_entry_rq(rbprev); 766 767 if (rbnext) 768 next = rb_entry_rq(rbnext); 769 else { 770 rbnext = rb_first(&cfqq->sort_list); 771 if (rbnext && rbnext != &last->rb_node) 772 next = rb_entry_rq(rbnext); 773 } 774 775 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 776} 777 778static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 779 struct cfq_queue *cfqq) 780{ 781 /* 782 * just an approximation, should be ok. 783 */ 784 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 785 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 786} 787 788static inline s64 789cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 790{ 791 return cfqg->vdisktime - st->min_vdisktime; 792} 793 794static void 795__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 796{ 797 struct rb_node **node = &st->rb.rb_node; 798 struct rb_node *parent = NULL; 799 struct cfq_group *__cfqg; 800 s64 key = cfqg_key(st, cfqg); 801 int left = 1; 802 803 while (*node != NULL) { 804 parent = *node; 805 __cfqg = rb_entry_cfqg(parent); 806 807 if (key < cfqg_key(st, __cfqg)) 808 node = &parent->rb_left; 809 else { 810 node = &parent->rb_right; 811 left = 0; 812 } 813 } 814 815 if (left) 816 st->left = &cfqg->rb_node; 817 818 rb_link_node(&cfqg->rb_node, parent, node); 819 rb_insert_color(&cfqg->rb_node, &st->rb); 820} 821 822static void 823cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 824{ 825 struct cfq_rb_root *st = &cfqd->grp_service_tree; 826 struct cfq_group *__cfqg; 827 struct rb_node *n; 828 829 cfqg->nr_cfqq++; 830 if (cfqg->on_st) 831 return; 832 833 /* 834 * Currently put the group at the end. Later implement something 835 * so that groups get lesser vtime based on their weights, so that 836 * if group does not loose all if it was not continously backlogged. 
837 */ 838 n = rb_last(&st->rb); 839 if (n) { 840 __cfqg = rb_entry_cfqg(n); 841 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 842 } else 843 cfqg->vdisktime = st->min_vdisktime; 844 845 __cfq_group_service_tree_add(st, cfqg); 846 cfqg->on_st = true; 847 cfqd->nr_groups++; 848 st->total_weight += cfqg->weight; 849} 850 851static void 852cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 853{ 854 struct cfq_rb_root *st = &cfqd->grp_service_tree; 855 856 if (st->active == &cfqg->rb_node) 857 st->active = NULL; 858 859 BUG_ON(cfqg->nr_cfqq < 1); 860 cfqg->nr_cfqq--; 861 862 /* If there are other cfq queues under this group, don't delete it */ 863 if (cfqg->nr_cfqq) 864 return; 865 866 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 867 cfqg->on_st = false; 868 cfqd->nr_groups--; 869 st->total_weight -= cfqg->weight; 870 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 871 cfq_rb_erase(&cfqg->rb_node, st); 872 cfqg->saved_workload_slice = 0; 873 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 874} 875 876static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 877{ 878 unsigned int slice_used; 879 880 /* 881 * Queue got expired before even a single request completed or 882 * got expired immediately after first request completion. 883 */ 884 if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 885 /* 886 * Also charge the seek time incurred to the group, otherwise 887 * if there are mutiple queues in the group, each can dispatch 888 * a single request on seeky media and cause lots of seek time 889 * and group will never know it. 890 */ 891 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 892 1); 893 } else { 894 slice_used = jiffies - cfqq->slice_start; 895 if (slice_used > cfqq->allocated_slice) 896 slice_used = cfqq->allocated_slice; 897 } 898 899 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 900 cfqq->nr_sectors); 901 return slice_used; 902} 903 904static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 905 struct cfq_queue *cfqq) 906{ 907 struct cfq_rb_root *st = &cfqd->grp_service_tree; 908 unsigned int used_sl, charge_sl; 909 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 910 - cfqg->service_tree_idle.count; 911 912 BUG_ON(nr_sync < 0); 913 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); 914 915 if (!cfq_cfqq_sync(cfqq) && !nr_sync) 916 charge_sl = cfqq->allocated_slice; 917 918 /* Can't update vdisktime while group is on service tree */ 919 cfq_rb_erase(&cfqg->rb_node, st); 920 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); 921 __cfq_group_service_tree_add(st, cfqg); 922 923 /* This group is being expired. 
Save the context */ 924 if (time_after(cfqd->workload_expires, jiffies)) { 925 cfqg->saved_workload_slice = cfqd->workload_expires 926 - jiffies; 927 cfqg->saved_workload = cfqd->serving_type; 928 cfqg->saved_serving_prio = cfqd->serving_prio; 929 } else 930 cfqg->saved_workload_slice = 0; 931 932 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 933 st->min_vdisktime); 934 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 935 cfqq->nr_sectors); 936} 937 938#ifdef CONFIG_CFQ_GROUP_IOSCHED 939static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 940{ 941 if (blkg) 942 return container_of(blkg, struct cfq_group, blkg); 943 return NULL; 944} 945 946void 947cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 948{ 949 cfqg_of_blkg(blkg)->weight = weight; 950} 951 952static struct cfq_group * 953cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 954{ 955 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 956 struct cfq_group *cfqg = NULL; 957 void *key = cfqd; 958 int i, j; 959 struct cfq_rb_root *st; 960 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 961 unsigned int major, minor; 962 963 /* Do we need to take this reference? */ 964 if (!blkiocg_css_tryget(blkcg)) 965 return NULL; 966 967 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 968 if (cfqg || !create) 969 goto done; 970 971 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 972 if (!cfqg) 973 goto done; 974 975 cfqg->weight = blkcg->weight; 976 for_each_cfqg_st(cfqg, i, j, st) 977 *st = CFQ_RB_ROOT; 978 RB_CLEAR_NODE(&cfqg->rb_node); 979 980 /* 981 * Take the initial reference that will be released on destroy. 982 * This can be thought of as a joint reference by cgroup and 983 * elevator which will be dropped by either elevator exit 984 * or cgroup deletion path depending on who is exiting first. 985 */ 986 atomic_set(&cfqg->ref, 1); 987 988 /* Add group onto cgroup list */ 989 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 990 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 991 MKDEV(major, minor)); 992 993 /* Add group on cfqd list */ 994 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 995 996done: 997 blkiocg_css_put(blkcg); 998 return cfqg; 999} 1000 1001/* 1002 * Search for the cfq group current task belongs to. If create = 1, then also 1003 * create the cfq group if it does not exist. request_queue lock must be held.
1004 */ 1005static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1006{ 1007 struct cgroup *cgroup; 1008 struct cfq_group *cfqg = NULL; 1009 1010 rcu_read_lock(); 1011 cgroup = task_cgroup(current, blkio_subsys_id); 1012 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 1013 if (!cfqg && create) 1014 cfqg = &cfqd->root_group; 1015 rcu_read_unlock(); 1016 return cfqg; 1017} 1018 1019static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1020{ 1021 /* Currently, all async queues are mapped to root group */ 1022 if (!cfq_cfqq_sync(cfqq)) 1023 cfqg = &cfqq->cfqd->root_group; 1024 1025 cfqq->cfqg = cfqg; 1026 /* cfqq reference on cfqg */ 1027 atomic_inc(&cfqq->cfqg->ref); 1028} 1029 1030static void cfq_put_cfqg(struct cfq_group *cfqg) 1031{ 1032 struct cfq_rb_root *st; 1033 int i, j; 1034 1035 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1036 if (!atomic_dec_and_test(&cfqg->ref)) 1037 return; 1038 for_each_cfqg_st(cfqg, i, j, st) 1039 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1040 kfree(cfqg); 1041} 1042 1043static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 1044{ 1045 /* Something wrong if we are trying to remove same group twice */ 1046 BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 1047 1048 hlist_del_init(&cfqg->cfqd_node); 1049 1050 /* 1051 * Put the reference taken at the time of creation so that when all 1052 * queues are gone, group can be destroyed. 1053 */ 1054 cfq_put_cfqg(cfqg); 1055} 1056 1057static void cfq_release_cfq_groups(struct cfq_data *cfqd) 1058{ 1059 struct hlist_node *pos, *n; 1060 struct cfq_group *cfqg; 1061 1062 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 1063 /* 1064 * If cgroup removal path got to blk_group first and removed 1065 * it from cgroup list, then it will take care of destroying 1066 * cfqg also. 1067 */ 1068 if (!blkiocg_del_blkio_group(&cfqg->blkg)) 1069 cfq_destroy_cfqg(cfqd, cfqg); 1070 } 1071} 1072 1073/* 1074 * Blk cgroup controller notification saying that blkio_group object is being 1075 * unlinked as the associated cgroup object is going away. That also means that 1076 * no new IO will come in this group. So get rid of this group as soon as 1077 * any pending IO in the group is finished. 1078 * 1079 * This function is called under rcu_read_lock(). key is the rcu protected 1080 * pointer. That means "key" is a valid cfq_data pointer as long as we hold the 1081 * rcu read lock. 1082 * 1083 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means 1084 * it should not be NULL as even if elevator was exiting, cgroup deletion 1085 * path got to it first. 1086 */ 1087void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1088{ 1089 unsigned long flags; 1090 struct cfq_data *cfqd = key; 1091 1092 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1093 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1094 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1095} 1096 1097#else /* GROUP_IOSCHED */ 1098static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1099{ 1100 return &cfqd->root_group; 1101} 1102static inline void 1103cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1104 cfqq->cfqg = cfqg; 1105} 1106 1107static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} 1108static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} 1109 1110#endif /* GROUP_IOSCHED */ 1111 1112/* 1113 * The cfqg->service_trees hold all pending cfq_queues that have 1114 * requests waiting to be processed.
It is sorted in the order that 1115 * we will service the queues. 1116 */ 1117static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1118 bool add_front) 1119{ 1120 struct rb_node **p, *parent; 1121 struct cfq_queue *__cfqq; 1122 unsigned long rb_key; 1123 struct cfq_rb_root *service_tree; 1124 int left; 1125 int new_cfqq = 1; 1126 int group_changed = 0; 1127 1128#ifdef CONFIG_CFQ_GROUP_IOSCHED 1129 if (!cfqd->cfq_group_isolation 1130 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD 1131 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { 1132 /* Move this cfq to root group */ 1133 cfq_log_cfqq(cfqd, cfqq, "moving to root group"); 1134 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1135 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1136 cfqq->orig_cfqg = cfqq->cfqg; 1137 cfqq->cfqg = &cfqd->root_group; 1138 atomic_inc(&cfqd->root_group.ref); 1139 group_changed = 1; 1140 } else if (!cfqd->cfq_group_isolation 1141 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 1142 /* cfqq is sequential now needs to go to its original group */ 1143 BUG_ON(cfqq->cfqg != &cfqd->root_group); 1144 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1145 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1146 cfq_put_cfqg(cfqq->cfqg); 1147 cfqq->cfqg = cfqq->orig_cfqg; 1148 cfqq->orig_cfqg = NULL; 1149 group_changed = 1; 1150 cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); 1151 } 1152#endif 1153 1154 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1155 cfqq_type(cfqq), cfqd); 1156 if (cfq_class_idle(cfqq)) { 1157 rb_key = CFQ_IDLE_DELAY; 1158 parent = rb_last(&service_tree->rb); 1159 if (parent && parent != &cfqq->rb_node) { 1160 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1161 rb_key += __cfqq->rb_key; 1162 } else 1163 rb_key += jiffies; 1164 } else if (!add_front) { 1165 /* 1166 * Get our rb key offset. Subtract any residual slice 1167 * value carried from last service. A negative resid 1168 * count indicates slice overrun, and this should position 1169 * the next service time further away in the tree. 1170 */ 1171 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 1172 rb_key -= cfqq->slice_resid; 1173 cfqq->slice_resid = 0; 1174 } else { 1175 rb_key = -HZ; 1176 __cfqq = cfq_rb_first(service_tree); 1177 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1178 } 1179 1180 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1181 new_cfqq = 0; 1182 /* 1183 * same position, nothing more to do 1184 */ 1185 if (rb_key == cfqq->rb_key && 1186 cfqq->service_tree == service_tree) 1187 return; 1188 1189 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1190 cfqq->service_tree = NULL; 1191 } 1192 1193 left = 1; 1194 parent = NULL; 1195 cfqq->service_tree = service_tree; 1196 p = &service_tree->rb.rb_node; 1197 while (*p) { 1198 struct rb_node **n; 1199 1200 parent = *p; 1201 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1202 1203 /* 1204 * sort by key, that represents service time. 
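 * (rb_key is derived from jiffies, so the comparison below uses time_before() rather than a plain '<' to stay correct across jiffies wraparound.)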
1205 */ 1206 if (time_before(rb_key, __cfqq->rb_key)) 1207 n = &(*p)->rb_left; 1208 else { 1209 n = &(*p)->rb_right; 1210 left = 0; 1211 } 1212 1213 p = n; 1214 } 1215 1216 if (left) 1217 service_tree->left = &cfqq->rb_node; 1218 1219 cfqq->rb_key = rb_key; 1220 rb_link_node(&cfqq->rb_node, parent, p); 1221 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1222 service_tree->count++; 1223 if ((add_front || !new_cfqq) && !group_changed) 1224 return; 1225 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1226} 1227 1228static struct cfq_queue * 1229cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 1230 sector_t sector, struct rb_node **ret_parent, 1231 struct rb_node ***rb_link) 1232{ 1233 struct rb_node **p, *parent; 1234 struct cfq_queue *cfqq = NULL; 1235 1236 parent = NULL; 1237 p = &root->rb_node; 1238 while (*p) { 1239 struct rb_node **n; 1240 1241 parent = *p; 1242 cfqq = rb_entry(parent, struct cfq_queue, p_node); 1243 1244 /* 1245 * Sort strictly based on sector. Smallest to the left, 1246 * largest to the right. 1247 */ 1248 if (sector > blk_rq_pos(cfqq->next_rq)) 1249 n = &(*p)->rb_right; 1250 else if (sector < blk_rq_pos(cfqq->next_rq)) 1251 n = &(*p)->rb_left; 1252 else 1253 break; 1254 p = n; 1255 cfqq = NULL; 1256 } 1257 1258 *ret_parent = parent; 1259 if (rb_link) 1260 *rb_link = p; 1261 return cfqq; 1262} 1263 1264static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1265{ 1266 struct rb_node **p, *parent; 1267 struct cfq_queue *__cfqq; 1268 1269 if (cfqq->p_root) { 1270 rb_erase(&cfqq->p_node, cfqq->p_root); 1271 cfqq->p_root = NULL; 1272 } 1273 1274 if (cfq_class_idle(cfqq)) 1275 return; 1276 if (!cfqq->next_rq) 1277 return; 1278 1279 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 1280 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 1281 blk_rq_pos(cfqq->next_rq), &parent, &p); 1282 if (!__cfqq) { 1283 rb_link_node(&cfqq->p_node, parent, p); 1284 rb_insert_color(&cfqq->p_node, cfqq->p_root); 1285 } else 1286 cfqq->p_root = NULL; 1287} 1288 1289/* 1290 * Update cfqq's position in the service tree. 1291 */ 1292static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1293{ 1294 /* 1295 * Resorting requires the cfqq to be on the RR list already. 1296 */ 1297 if (cfq_cfqq_on_rr(cfqq)) { 1298 cfq_service_tree_add(cfqd, cfqq, 0); 1299 cfq_prio_tree_add(cfqd, cfqq); 1300 } 1301} 1302 1303/* 1304 * add to busy list of queues for service, trying to be fair in ordering 1305 * the pending list according to last request service 1306 */ 1307static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1308{ 1309 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 1310 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1311 cfq_mark_cfqq_on_rr(cfqq); 1312 cfqd->busy_queues++; 1313 1314 cfq_resort_rr_list(cfqd, cfqq); 1315} 1316 1317/* 1318 * Called when the cfqq no longer has requests pending, remove it from 1319 * the service tree. 
1320 */ 1321static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1322{ 1323 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 1324 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1325 cfq_clear_cfqq_on_rr(cfqq); 1326 1327 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1328 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1329 cfqq->service_tree = NULL; 1330 } 1331 if (cfqq->p_root) { 1332 rb_erase(&cfqq->p_node, cfqq->p_root); 1333 cfqq->p_root = NULL; 1334 } 1335 1336 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1337 BUG_ON(!cfqd->busy_queues); 1338 cfqd->busy_queues--; 1339} 1340 1341/* 1342 * rb tree support functions 1343 */ 1344static void cfq_del_rq_rb(struct request *rq) 1345{ 1346 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1347 const int sync = rq_is_sync(rq); 1348 1349 BUG_ON(!cfqq->queued[sync]); 1350 cfqq->queued[sync]--; 1351 1352 elv_rb_del(&cfqq->sort_list, rq); 1353 1354 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 1355 /* 1356 * Queue will be deleted from service tree when we actually 1357 * expire it later. Right now just remove it from prio tree 1358 * as it is empty. 1359 */ 1360 if (cfqq->p_root) { 1361 rb_erase(&cfqq->p_node, cfqq->p_root); 1362 cfqq->p_root = NULL; 1363 } 1364 } 1365} 1366 1367static void cfq_add_rq_rb(struct request *rq) 1368{ 1369 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1370 struct cfq_data *cfqd = cfqq->cfqd; 1371 struct request *__alias, *prev; 1372 1373 cfqq->queued[rq_is_sync(rq)]++; 1374 1375 /* 1376 * looks a little odd, but the first insert might return an alias. 1377 * if that happens, put the alias on the dispatch list 1378 */ 1379 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 1380 cfq_dispatch_insert(cfqd->queue, __alias); 1381 1382 if (!cfq_cfqq_on_rr(cfqq)) 1383 cfq_add_cfqq_rr(cfqd, cfqq); 1384 1385 /* 1386 * check if this request is a better next-serve candidate 1387 */ 1388 prev = cfqq->next_rq; 1389 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 1390 1391 /* 1392 * adjust priority tree position, if ->next_rq changes 1393 */ 1394 if (prev != cfqq->next_rq) 1395 cfq_prio_tree_add(cfqd, cfqq); 1396 1397 BUG_ON(!cfqq->next_rq); 1398} 1399 1400static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 1401{ 1402 elv_rb_del(&cfqq->sort_list, rq); 1403 cfqq->queued[rq_is_sync(rq)]--; 1404 cfq_add_rq_rb(rq); 1405} 1406 1407static struct request * 1408cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1409{ 1410 struct task_struct *tsk = current; 1411 struct cfq_io_context *cic; 1412 struct cfq_queue *cfqq; 1413 1414 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1415 if (!cic) 1416 return NULL; 1417 1418 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1419 if (cfqq) { 1420 sector_t sector = bio->bi_sector + bio_sectors(bio); 1421 1422 return elv_rb_find(&cfqq->sort_list, sector); 1423 } 1424 1425 return NULL; 1426} 1427 1428static void cfq_activate_request(struct request_queue *q, struct request *rq) 1429{ 1430 struct cfq_data *cfqd = q->elevator->elevator_data; 1431 1432 cfqd->rq_in_driver[rq_is_sync(rq)]++; 1433 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 1434 rq_in_driver(cfqd)); 1435 1436 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 1437} 1438 1439static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 1440{ 1441 struct cfq_data *cfqd = q->elevator->elevator_data; 1442 const int sync = rq_is_sync(rq); 1443 1444 WARN_ON(!cfqd->rq_in_driver[sync]); 1445 cfqd->rq_in_driver[sync]--; 1446 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), 
"deactivate rq, drv=%d", 1447 rq_in_driver(cfqd)); 1448} 1449 1450static void cfq_remove_request(struct request *rq) 1451{ 1452 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1453 1454 if (cfqq->next_rq == rq) 1455 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 1456 1457 list_del_init(&rq->queuelist); 1458 cfq_del_rq_rb(rq); 1459 1460 cfqq->cfqd->rq_queued--; 1461 if (rq_is_meta(rq)) { 1462 WARN_ON(!cfqq->meta_pending); 1463 cfqq->meta_pending--; 1464 } 1465} 1466 1467static int cfq_merge(struct request_queue *q, struct request **req, 1468 struct bio *bio) 1469{ 1470 struct cfq_data *cfqd = q->elevator->elevator_data; 1471 struct request *__rq; 1472 1473 __rq = cfq_find_rq_fmerge(cfqd, bio); 1474 if (__rq && elv_rq_merge_ok(__rq, bio)) { 1475 *req = __rq; 1476 return ELEVATOR_FRONT_MERGE; 1477 } 1478 1479 return ELEVATOR_NO_MERGE; 1480} 1481 1482static void cfq_merged_request(struct request_queue *q, struct request *req, 1483 int type) 1484{ 1485 if (type == ELEVATOR_FRONT_MERGE) { 1486 struct cfq_queue *cfqq = RQ_CFQQ(req); 1487 1488 cfq_reposition_rq_rb(cfqq, req); 1489 } 1490} 1491 1492static void 1493cfq_merged_requests(struct request_queue *q, struct request *rq, 1494 struct request *next) 1495{ 1496 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1497 /* 1498 * reposition in fifo if next is older than rq 1499 */ 1500 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1501 time_before(rq_fifo_time(next), rq_fifo_time(rq))) { 1502 list_move(&rq->queuelist, &next->queuelist); 1503 rq_set_fifo_time(rq, rq_fifo_time(next)); 1504 } 1505 1506 if (cfqq->next_rq == next) 1507 cfqq->next_rq = rq; 1508 cfq_remove_request(next); 1509} 1510 1511static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1512 struct bio *bio) 1513{ 1514 struct cfq_data *cfqd = q->elevator->elevator_data; 1515 struct cfq_io_context *cic; 1516 struct cfq_queue *cfqq; 1517 1518 /* Deny merge if bio and rq don't belong to same cfq group */ 1519 if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) 1520 return false; 1521 /* 1522 * Disallow merge of a sync bio into an async request. 1523 */ 1524 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 1525 return false; 1526 1527 /* 1528 * Lookup the cfqq that this bio will be queued with. Allow 1529 * merge only if rq is queued there. 
1530 */ 1531 cic = cfq_cic_lookup(cfqd, current->io_context); 1532 if (!cic) 1533 return false; 1534 1535 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1536 return cfqq == RQ_CFQQ(rq); 1537} 1538 1539static void __cfq_set_active_queue(struct cfq_data *cfqd, 1540 struct cfq_queue *cfqq) 1541{ 1542 if (cfqq) { 1543 cfq_log_cfqq(cfqd, cfqq, "set_active"); 1544 cfqq->slice_start = 0; 1545 cfqq->dispatch_start = jiffies; 1546 cfqq->allocated_slice = 0; 1547 cfqq->slice_end = 0; 1548 cfqq->slice_dispatch = 0; 1549 cfqq->nr_sectors = 0; 1550 1551 cfq_clear_cfqq_wait_request(cfqq); 1552 cfq_clear_cfqq_must_dispatch(cfqq); 1553 cfq_clear_cfqq_must_alloc_slice(cfqq); 1554 cfq_clear_cfqq_fifo_expire(cfqq); 1555 cfq_mark_cfqq_slice_new(cfqq); 1556 1557 del_timer(&cfqd->idle_slice_timer); 1558 } 1559 1560 cfqd->active_queue = cfqq; 1561} 1562 1563/* 1564 * current cfqq expired its slice (or was too idle), select new one 1565 */ 1566static void 1567__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1568 bool timed_out) 1569{ 1570 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1571 1572 if (cfq_cfqq_wait_request(cfqq)) 1573 del_timer(&cfqd->idle_slice_timer); 1574 1575 cfq_clear_cfqq_wait_request(cfqq); 1576 cfq_clear_cfqq_wait_busy(cfqq); 1577 cfq_clear_cfqq_wait_busy_done(cfqq); 1578 1579 /* 1580 * store what was left of this slice, if the queue idled/timed out 1581 */ 1582 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1583 cfqq->slice_resid = cfqq->slice_end - jiffies; 1584 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1585 } 1586 1587 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 1588 1589 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1590 cfq_del_cfqq_rr(cfqd, cfqq); 1591 1592 cfq_resort_rr_list(cfqd, cfqq); 1593 1594 if (cfqq == cfqd->active_queue) 1595 cfqd->active_queue = NULL; 1596 1597 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) 1598 cfqd->grp_service_tree.active = NULL; 1599 1600 if (cfqd->active_cic) { 1601 put_io_context(cfqd->active_cic->ioc); 1602 cfqd->active_cic = NULL; 1603 } 1604} 1605 1606static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 1607{ 1608 struct cfq_queue *cfqq = cfqd->active_queue; 1609 1610 if (cfqq) 1611 __cfq_slice_expired(cfqd, cfqq, timed_out); 1612} 1613 1614/* 1615 * Get next queue for service. Unless we have a queue preemption, 1616 * we'll simply select the first cfqq in the service tree. 1617 */ 1618static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1619{ 1620 struct cfq_rb_root *service_tree = 1621 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 1622 cfqd->serving_type, cfqd); 1623 1624 if (!cfqd->rq_queued) 1625 return NULL; 1626 1627 /* There is nothing to dispatch */ 1628 if (!service_tree) 1629 return NULL; 1630 if (RB_EMPTY_ROOT(&service_tree->rb)) 1631 return NULL; 1632 return cfq_rb_first(service_tree); 1633} 1634 1635static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 1636{ 1637 struct cfq_group *cfqg; 1638 struct cfq_queue *cfqq; 1639 int i, j; 1640 struct cfq_rb_root *st; 1641 1642 if (!cfqd->rq_queued) 1643 return NULL; 1644 1645 cfqg = cfq_get_next_cfqg(cfqd); 1646 if (!cfqg) 1647 return NULL; 1648 1649 for_each_cfqg_st(cfqg, i, j, st) 1650 if ((cfqq = cfq_rb_first(st)) != NULL) 1651 return cfqq; 1652 return NULL; 1653} 1654 1655/* 1656 * Get and set a new active queue for service. 
1657 */ 1658static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1659 struct cfq_queue *cfqq) 1660{ 1661 if (!cfqq) 1662 cfqq = cfq_get_next_queue(cfqd); 1663 1664 __cfq_set_active_queue(cfqd, cfqq); 1665 return cfqq; 1666} 1667 1668static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 1669 struct request *rq) 1670{ 1671 if (blk_rq_pos(rq) >= cfqd->last_position) 1672 return blk_rq_pos(rq) - cfqd->last_position; 1673 else 1674 return cfqd->last_position - blk_rq_pos(rq); 1675} 1676 1677#define CFQQ_SEEK_THR 8 * 1024 1678#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) 1679 1680static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1681 struct request *rq) 1682{ 1683 sector_t sdist = cfqq->seek_mean; 1684 1685 if (!sample_valid(cfqq->seek_samples)) 1686 sdist = CFQQ_SEEK_THR; 1687 1688 return cfq_dist_from_last(cfqd, rq) <= sdist; 1689} 1690 1691static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 1692 struct cfq_queue *cur_cfqq) 1693{ 1694 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 1695 struct rb_node *parent, *node; 1696 struct cfq_queue *__cfqq; 1697 sector_t sector = cfqd->last_position; 1698 1699 if (RB_EMPTY_ROOT(root)) 1700 return NULL; 1701 1702 /* 1703 * First, if we find a request starting at the end of the last 1704 * request, choose it. 1705 */ 1706 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 1707 if (__cfqq) 1708 return __cfqq; 1709 1710 /* 1711 * If the exact sector wasn't found, the parent of the NULL leaf 1712 * will contain the closest sector. 1713 */ 1714 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1715 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1716 return __cfqq; 1717 1718 if (blk_rq_pos(__cfqq->next_rq) < sector) 1719 node = rb_next(&__cfqq->p_node); 1720 else 1721 node = rb_prev(&__cfqq->p_node); 1722 if (!node) 1723 return NULL; 1724 1725 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1726 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1727 return __cfqq; 1728 1729 return NULL; 1730} 1731 1732/* 1733 * cfqd - obvious 1734 * cur_cfqq - passed in so that we don't decide that the current queue is 1735 * closely cooperating with itself. 1736 * 1737 * So, basically we're assuming that that cur_cfqq has dispatched at least 1738 * one request, and that cfqd->last_position reflects a position on the disk 1739 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 1740 * assumption. 1741 */ 1742static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1743 struct cfq_queue *cur_cfqq) 1744{ 1745 struct cfq_queue *cfqq; 1746 1747 if (!cfq_cfqq_sync(cur_cfqq)) 1748 return NULL; 1749 if (CFQQ_SEEKY(cur_cfqq)) 1750 return NULL; 1751 1752 /* 1753 * We should notice if some of the queues are cooperating, eg 1754 * working closely on the same area of the disk. In that case, 1755 * we can group them together and don't waste time idling. 1756 */ 1757 cfqq = cfqq_close(cfqd, cur_cfqq); 1758 if (!cfqq) 1759 return NULL; 1760 1761 /* If new queue belongs to different cfq_group, don't choose it */ 1762 if (cur_cfqq->cfqg != cfqq->cfqg) 1763 return NULL; 1764 1765 /* 1766 * It only makes sense to merge sync queues. 
1767 */ 1768 if (!cfq_cfqq_sync(cfqq)) 1769 return NULL; 1770 if (CFQQ_SEEKY(cfqq)) 1771 return NULL; 1772 1773 /* 1774 * Do not merge queues of different priority classes 1775 */ 1776 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 1777 return NULL; 1778 1779 return cfqq; 1780} 1781 1782/* 1783 * Determine whether we should enforce idle window for this queue. 1784 */ 1785 1786static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1787{ 1788 enum wl_prio_t prio = cfqq_prio(cfqq); 1789 struct cfq_rb_root *service_tree = cfqq->service_tree; 1790 1791 BUG_ON(!service_tree); 1792 BUG_ON(!service_tree->count); 1793 1794 /* We never do for idle class queues. */ 1795 if (prio == IDLE_WORKLOAD) 1796 return false; 1797 1798 /* We do for queues that were marked with idle window flag. */ 1799 if (cfq_cfqq_idle_window(cfqq) && 1800 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 1801 return true; 1802 1803 /* 1804 * Otherwise, we do only if they are the last ones 1805 * in their service tree. 1806 */ 1807 return service_tree->count == 1; 1808} 1809 1810static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1811{ 1812 struct cfq_queue *cfqq = cfqd->active_queue; 1813 struct cfq_io_context *cic; 1814 unsigned long sl; 1815 1816 /* 1817 * SSD device without seek penalty, disable idling. But only do so 1818 * for devices that support queuing, otherwise we still have a problem 1819 * with sync vs async workloads. 1820 */ 1821 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1822 return; 1823 1824 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1825 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1826 1827 /* 1828 * idle is disabled, either manually or by past process history 1829 */ 1830 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) 1831 return; 1832 1833 /* 1834 * still active requests from this queue, don't idle 1835 */ 1836 if (cfqq->dispatched) 1837 return; 1838 1839 /* 1840 * task has exited, don't wait 1841 */ 1842 cic = cfqd->active_cic; 1843 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1844 return; 1845 1846 /* 1847 * If our average think time is larger than the remaining time 1848 * slice, then don't idle. This avoids overrunning the allotted 1849 * time slice. 1850 */ 1851 if (sample_valid(cic->ttime_samples) && 1852 (cfqq->slice_end - jiffies < cic->ttime_mean)) 1853 return; 1854 1855 cfq_mark_cfqq_wait_request(cfqq); 1856 1857 sl = cfqd->cfq_slice_idle; 1858 1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1861} 1862 1863/* 1864 * Move request from internal lists to the request queue dispatch list. 
1865 */ 1866static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1867{ 1868 struct cfq_data *cfqd = q->elevator->elevator_data; 1869 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1870 1871 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1872 1873 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1874 cfq_remove_request(rq); 1875 cfqq->dispatched++; 1876 elv_dispatch_sort(q, rq); 1877 1878 if (cfq_cfqq_sync(cfqq)) 1879 cfqd->sync_flight++; 1880 cfqq->nr_sectors += blk_rq_sectors(rq); 1881} 1882 1883/* 1884 * return expired entry, or NULL to just start from scratch in rbtree 1885 */ 1886static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1887{ 1888 struct request *rq = NULL; 1889 1890 if (cfq_cfqq_fifo_expire(cfqq)) 1891 return NULL; 1892 1893 cfq_mark_cfqq_fifo_expire(cfqq); 1894 1895 if (list_empty(&cfqq->fifo)) 1896 return NULL; 1897 1898 rq = rq_entry_fifo(cfqq->fifo.next); 1899 if (time_before(jiffies, rq_fifo_time(rq))) 1900 rq = NULL; 1901 1902 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 1903 return rq; 1904} 1905 1906static inline int 1907cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1908{ 1909 const int base_rq = cfqd->cfq_slice_async_rq; 1910 1911 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1912 1913 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1914} 1915 1916/* 1917 * Must be called with the queue_lock held. 1918 */ 1919static int cfqq_process_refs(struct cfq_queue *cfqq) 1920{ 1921 int process_refs, io_refs; 1922 1923 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 1924 process_refs = atomic_read(&cfqq->ref) - io_refs; 1925 BUG_ON(process_refs < 0); 1926 return process_refs; 1927} 1928 1929static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 1930{ 1931 int process_refs, new_process_refs; 1932 struct cfq_queue *__cfqq; 1933 1934 /* Avoid a circular list and skip interim queue merges */ 1935 while ((__cfqq = new_cfqq->new_cfqq)) { 1936 if (__cfqq == cfqq) 1937 return; 1938 new_cfqq = __cfqq; 1939 } 1940 1941 process_refs = cfqq_process_refs(cfqq); 1942 /* 1943 * If the process for the cfqq has gone away, there is no 1944 * sense in merging the queues. 1945 */ 1946 if (process_refs == 0) 1947 return; 1948 1949 /* 1950 * Merge in the direction of the lesser amount of work. 
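 * For example, if cfqq is backed by a single process and new_cfqq by three, cfqq->new_cfqq is pointed at new_cfqq below, so only the one process has to be redirected when the merge actually happens.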
1951 */ 1952 new_process_refs = cfqq_process_refs(new_cfqq); 1953 if (new_process_refs >= process_refs) { 1954 cfqq->new_cfqq = new_cfqq; 1955 atomic_add(process_refs, &new_cfqq->ref); 1956 } else { 1957 new_cfqq->new_cfqq = cfqq; 1958 atomic_add(new_process_refs, &cfqq->ref); 1959 } 1960} 1961 1962static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 1963 struct cfq_group *cfqg, enum wl_prio_t prio, 1964 bool prio_changed) 1965{ 1966 struct cfq_queue *queue; 1967 int i; 1968 bool key_valid = false; 1969 unsigned long lowest_key = 0; 1970 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 1971 1972 if (prio_changed) { 1973 /* 1974 * When priorities switched, we prefer starting 1975 * from SYNC_NOIDLE (first choice), or just SYNC 1976 * over ASYNC 1977 */ 1978 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) 1979 return cur_best; 1980 cur_best = SYNC_WORKLOAD; 1981 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) 1982 return cur_best; 1983 1984 return ASYNC_WORKLOAD; 1985 } 1986 1987 for (i = 0; i < 3; ++i) { 1988 /* otherwise, select the one with lowest rb_key */ 1989 queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); 1990 if (queue && 1991 (!key_valid || time_before(queue->rb_key, lowest_key))) { 1992 lowest_key = queue->rb_key; 1993 cur_best = i; 1994 key_valid = true; 1995 } 1996 } 1997 1998 return cur_best; 1999} 2000 2001static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2002{ 2003 enum wl_prio_t previous_prio = cfqd->serving_prio; 2004 bool prio_changed; 2005 unsigned slice; 2006 unsigned count; 2007 struct cfq_rb_root *st; 2008 unsigned group_slice; 2009 2010 if (!cfqg) { 2011 cfqd->serving_prio = IDLE_WORKLOAD; 2012 cfqd->workload_expires = jiffies + 1; 2013 return; 2014 } 2015 2016 /* Choose next priority. RT > BE > IDLE */ 2017 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2018 cfqd->serving_prio = RT_WORKLOAD; 2019 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2020 cfqd->serving_prio = BE_WORKLOAD; 2021 else { 2022 cfqd->serving_prio = IDLE_WORKLOAD; 2023 cfqd->workload_expires = jiffies + 1; 2024 return; 2025 } 2026 2027 /* 2028 * For RT and BE, we have to choose also the type 2029 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2030 * expiration time 2031 */ 2032 prio_changed = (cfqd->serving_prio != previous_prio); 2033 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, 2034 cfqd); 2035 count = st->count; 2036 2037 /* 2038 * If priority didn't change, check workload expiration, 2039 * and that we still have other queues ready 2040 */ 2041 if (!prio_changed && count && 2042 !time_after(jiffies, cfqd->workload_expires)) 2043 return; 2044 2045 /* otherwise select new workload type */ 2046 cfqd->serving_type = 2047 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); 2048 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, 2049 cfqd); 2050 count = st->count; 2051 2052 /* 2053 * the workload slice is computed as a fraction of target latency 2054 * proportional to the number of queues in that workload, over 2055 * all the queues in the same priority class 2056 */ 2057 group_slice = cfq_group_slice(cfqd, cfqg); 2058 2059 slice = group_slice * count / 2060 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2061 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2062 2063 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2064 unsigned int tmp; 2065 2066 /* 2067 * Async queues are currently system wide. 
Just taking 2068 * proportion of queues with-in same group will lead to higher 2069 * async ratio system wide as generally root group is going 2070 * to have higher weight. A more accurate thing would be to 2071 * calculate system wide asnc/sync ratio. 2072 */ 2073 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 2074 tmp = tmp/cfqd->busy_queues; 2075 slice = min_t(unsigned, slice, tmp); 2076 2077 /* async workload slice is scaled down according to 2078 * the sync/async slice ratio. */ 2079 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 2080 } else 2081 /* sync workload slice is at least 2 * cfq_slice_idle */ 2082 slice = max(slice, 2 * cfqd->cfq_slice_idle); 2083 2084 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2085 cfqd->workload_expires = jiffies + slice; 2086 cfqd->noidle_tree_requires_idle = false; 2087} 2088 2089static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2090{ 2091 struct cfq_rb_root *st = &cfqd->grp_service_tree; 2092 struct cfq_group *cfqg; 2093 2094 if (RB_EMPTY_ROOT(&st->rb)) 2095 return NULL; 2096 cfqg = cfq_rb_first_group(st); 2097 st->active = &cfqg->rb_node; 2098 update_min_vdisktime(st); 2099 return cfqg; 2100} 2101 2102static void cfq_choose_cfqg(struct cfq_data *cfqd) 2103{ 2104 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 2105 2106 cfqd->serving_group = cfqg; 2107 2108 /* Restore the workload type data */ 2109 if (cfqg->saved_workload_slice) { 2110 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 2111 cfqd->serving_type = cfqg->saved_workload; 2112 cfqd->serving_prio = cfqg->saved_serving_prio; 2113 } 2114 choose_service_tree(cfqd, cfqg); 2115} 2116 2117/* 2118 * Select a queue for service. If we have a current active queue, 2119 * check whether to continue servicing it, or retrieve and set a new one. 2120 */ 2121static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 2122{ 2123 struct cfq_queue *cfqq, *new_cfqq = NULL; 2124 2125 cfqq = cfqd->active_queue; 2126 if (!cfqq) 2127 goto new_queue; 2128 2129 if (!cfqd->rq_queued) 2130 return NULL; 2131 /* 2132 * The active queue has run out of time, expire it and select new. 2133 */ 2134 if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) 2135 && !cfq_cfqq_must_dispatch(cfqq)) 2136 goto expire; 2137 2138 /* 2139 * The active queue has requests and isn't expired, allow it to 2140 * dispatch. 2141 */ 2142 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 2143 goto keep_queue; 2144 2145 /* 2146 * If another queue has a request waiting within our mean seek 2147 * distance, let it run. The expire code will check for close 2148 * cooperators and put the close queue at the front of the service 2149 * tree. If possible, merge the expiring queue with the new cfqq. 2150 */ 2151 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 2152 if (new_cfqq) { 2153 if (!cfqq->new_cfqq) 2154 cfq_setup_merge(cfqq, new_cfqq); 2155 goto expire; 2156 } 2157 2158 /* 2159 * No requests pending. If the active queue still has requests in 2160 * flight or is idling for a new request, allow either of these 2161 * conditions to happen (or time out) before selecting a new queue. 2162 */ 2163 if (timer_pending(&cfqd->idle_slice_timer) || 2164 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { 2165 cfqq = NULL; 2166 goto keep_queue; 2167 } 2168 2169expire: 2170 cfq_slice_expired(cfqd, 0); 2171new_queue: 2172 /* 2173 * Current queue expired. 
Check if we have to switch to a new 2174 * service tree 2175 */ 2176 if (!new_cfqq) 2177 cfq_choose_cfqg(cfqd); 2178 2179 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2180keep_queue: 2181 return cfqq; 2182} 2183 2184static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 2185{ 2186 int dispatched = 0; 2187 2188 while (cfqq->next_rq) { 2189 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 2190 dispatched++; 2191 } 2192 2193 BUG_ON(!list_empty(&cfqq->fifo)); 2194 2195 /* By default cfqq is not expired if it is empty. Do it explicitly */ 2196 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2197 return dispatched; 2198} 2199 2200/* 2201 * Drain our current requests. Used for barriers and when switching 2202 * io schedulers on-the-fly. 2203 */ 2204static int cfq_forced_dispatch(struct cfq_data *cfqd) 2205{ 2206 struct cfq_queue *cfqq; 2207 int dispatched = 0; 2208 2209 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) 2210 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2211 2212 cfq_slice_expired(cfqd, 0); 2213 BUG_ON(cfqd->busy_queues); 2214 2215 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2216 return dispatched; 2217} 2218 2219static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2220{ 2221 unsigned int max_dispatch; 2222 2223 /* 2224 * Drain async requests before we start sync IO 2225 */ 2226 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 2227 return false; 2228 2229 /* 2230 * If this is an async queue and we have sync IO in flight, let it wait 2231 */ 2232 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 2233 return false; 2234 2235 max_dispatch = cfqd->cfq_quantum; 2236 if (cfq_class_idle(cfqq)) 2237 max_dispatch = 1; 2238 2239 /* 2240 * Does this cfqq already have too much IO in flight? 2241 */ 2242 if (cfqq->dispatched >= max_dispatch) { 2243 /* 2244 * idle queue must always only have a single IO in flight 2245 */ 2246 if (cfq_class_idle(cfqq)) 2247 return false; 2248 2249 /* 2250 * We have other queues, don't allow more IO from this one 2251 */ 2252 if (cfqd->busy_queues > 1) 2253 return false; 2254 2255 /* 2256 * Sole queue user, no limit 2257 */ 2258 max_dispatch = -1; 2259 } 2260 2261 /* 2262 * Async queues must wait a bit before being allowed dispatch. 2263 * We also ramp up the dispatch depth gradually for async IO, 2264 * based on the last sync IO we serviced 2265 */ 2266 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2267 unsigned long last_sync = jiffies - cfqd->last_delayed_sync; 2268 unsigned int depth; 2269 2270 depth = last_sync / cfqd->cfq_slice[1]; 2271 if (!depth && !cfqq->dispatched) 2272 depth = 1; 2273 if (depth < max_dispatch) 2274 max_dispatch = depth; 2275 } 2276 2277 /* 2278 * If we're below the current max, allow a dispatch 2279 */ 2280 return cfqq->dispatched < max_dispatch; 2281} 2282 2283/* 2284 * Dispatch a request from cfqq, moving them to the request queue 2285 * dispatch list. 
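 *
 * The expired-FIFO check comes first: cfq_check_fifo() only returns a
 * request whose deadline (set in cfq_insert_request() to jiffies +
 * cfq_fifo_expire[sync]) has already passed; if nothing has expired we
 * fall back to the sector-sorted cfqq->next_rq.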
2286 */ 2287static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2288{ 2289 struct request *rq; 2290 2291 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 2292 2293 if (!cfq_may_dispatch(cfqd, cfqq)) 2294 return false; 2295 2296 /* 2297 * follow expired path, else get first next available 2298 */ 2299 rq = cfq_check_fifo(cfqq); 2300 if (!rq) 2301 rq = cfqq->next_rq; 2302 2303 /* 2304 * insert request into driver dispatch list 2305 */ 2306 cfq_dispatch_insert(cfqd->queue, rq); 2307 2308 if (!cfqd->active_cic) { 2309 struct cfq_io_context *cic = RQ_CIC(rq); 2310 2311 atomic_long_inc(&cic->ioc->refcount); 2312 cfqd->active_cic = cic; 2313 } 2314 2315 return true; 2316} 2317 2318/* 2319 * Find the cfqq that we need to service and move a request from that to the 2320 * dispatch list 2321 */ 2322static int cfq_dispatch_requests(struct request_queue *q, int force) 2323{ 2324 struct cfq_data *cfqd = q->elevator->elevator_data; 2325 struct cfq_queue *cfqq; 2326 2327 if (!cfqd->busy_queues) 2328 return 0; 2329 2330 if (unlikely(force)) 2331 return cfq_forced_dispatch(cfqd); 2332 2333 cfqq = cfq_select_queue(cfqd); 2334 if (!cfqq) 2335 return 0; 2336 2337 /* 2338 * Dispatch a request from this cfqq, if it is allowed 2339 */ 2340 if (!cfq_dispatch_request(cfqd, cfqq)) 2341 return 0; 2342 2343 cfqq->slice_dispatch++; 2344 cfq_clear_cfqq_must_dispatch(cfqq); 2345 2346 /* 2347 * expire an async queue immediately if it has used up its slice. idle 2348 * queue always expire after 1 dispatch round. 2349 */ 2350 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 2351 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 2352 cfq_class_idle(cfqq))) { 2353 cfqq->slice_end = jiffies + 1; 2354 cfq_slice_expired(cfqd, 0); 2355 } 2356 2357 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 2358 return 1; 2359} 2360 2361/* 2362 * task holds one reference to the queue, dropped when task exits. each rq 2363 * in-flight on this queue also holds a reference, dropped when rq is freed. 2364 * 2365 * Each cfq queue took a reference on the parent group. Drop it now. 2366 * queue lock must be held here. 2367 */ 2368static void cfq_put_queue(struct cfq_queue *cfqq) 2369{ 2370 struct cfq_data *cfqd = cfqq->cfqd; 2371 struct cfq_group *cfqg, *orig_cfqg; 2372 2373 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2374 2375 if (!atomic_dec_and_test(&cfqq->ref)) 2376 return; 2377 2378 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2379 BUG_ON(rb_first(&cfqq->sort_list)); 2380 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2381 cfqg = cfqq->cfqg; 2382 orig_cfqg = cfqq->orig_cfqg; 2383 2384 if (unlikely(cfqd->active_queue == cfqq)) { 2385 __cfq_slice_expired(cfqd, cfqq, 0); 2386 cfq_schedule_dispatch(cfqd); 2387 } 2388 2389 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2390 kmem_cache_free(cfq_pool, cfqq); 2391 cfq_put_cfqg(cfqg); 2392 if (orig_cfqg) 2393 cfq_put_cfqg(orig_cfqg); 2394} 2395 2396/* 2397 * Must always be called with the rcu_read_lock() held 2398 */ 2399static void 2400__call_for_each_cic(struct io_context *ioc, 2401 void (*func)(struct io_context *, struct cfq_io_context *)) 2402{ 2403 struct cfq_io_context *cic; 2404 struct hlist_node *n; 2405 2406 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2407 func(ioc, cic); 2408} 2409 2410/* 2411 * Call func for each cic attached to this ioc. 
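 *
 * Both users in this file iterate the ioc's cic_list this way:
 * cfq_exit_io_context() passes cfq_exit_single_io_context() here, while
 * cfq_free_io_context() calls the __ variant directly with cic_free_func(),
 * since its callers already run under rcu_read_lock() or task-lock
 * protection.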
2412 */ 2413static void 2414call_for_each_cic(struct io_context *ioc, 2415 void (*func)(struct io_context *, struct cfq_io_context *)) 2416{ 2417 rcu_read_lock(); 2418 __call_for_each_cic(ioc, func); 2419 rcu_read_unlock(); 2420} 2421 2422static void cfq_cic_free_rcu(struct rcu_head *head) 2423{ 2424 struct cfq_io_context *cic; 2425 2426 cic = container_of(head, struct cfq_io_context, rcu_head); 2427 2428 kmem_cache_free(cfq_ioc_pool, cic); 2429 elv_ioc_count_dec(cfq_ioc_count); 2430 2431 if (ioc_gone) { 2432 /* 2433 * CFQ scheduler is exiting, grab exit lock and check 2434 * the pending io context count. If it hits zero, 2435 * complete ioc_gone and set it back to NULL 2436 */ 2437 spin_lock(&ioc_gone_lock); 2438 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2439 complete(ioc_gone); 2440 ioc_gone = NULL; 2441 } 2442 spin_unlock(&ioc_gone_lock); 2443 } 2444} 2445 2446static void cfq_cic_free(struct cfq_io_context *cic) 2447{ 2448 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2449} 2450 2451static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2452{ 2453 unsigned long flags; 2454 2455 BUG_ON(!cic->dead_key); 2456 2457 spin_lock_irqsave(&ioc->lock, flags); 2458 radix_tree_delete(&ioc->radix_root, cic->dead_key); 2459 hlist_del_rcu(&cic->cic_list); 2460 spin_unlock_irqrestore(&ioc->lock, flags); 2461 2462 cfq_cic_free(cic); 2463} 2464 2465/* 2466 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 2467 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2468 * and ->trim() which is called with the task lock held 2469 */ 2470static void cfq_free_io_context(struct io_context *ioc) 2471{ 2472 /* 2473 * ioc->refcount is zero here, or we are called from elv_unregister(), 2474 * so no more cic's are allowed to be linked into this ioc. So it 2475 * should be ok to iterate over the known list, we will see all cic's 2476 * since no new ones are added. 2477 */ 2478 __call_for_each_cic(ioc, cic_free_func); 2479} 2480 2481static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2482{ 2483 struct cfq_queue *__cfqq, *next; 2484 2485 if (unlikely(cfqq == cfqd->active_queue)) { 2486 __cfq_slice_expired(cfqd, cfqq, 0); 2487 cfq_schedule_dispatch(cfqd); 2488 } 2489 2490 /* 2491 * If this queue was scheduled to merge with another queue, be 2492 * sure to drop the reference taken on that queue (and others in 2493 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 
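 *
 * The loop below simply follows the chain (cfqq->new_cfqq, then that
 * queue's new_cfqq, and so on), dropping one reference per queue, and
 * bails out with a warning if the chain ever loops back to cfqq itself.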
2494 */ 2495 __cfqq = cfqq->new_cfqq; 2496 while (__cfqq) { 2497 if (__cfqq == cfqq) { 2498 WARN(1, "cfqq->new_cfqq loop detected\n"); 2499 break; 2500 } 2501 next = __cfqq->new_cfqq; 2502 cfq_put_queue(__cfqq); 2503 __cfqq = next; 2504 } 2505 2506 cfq_put_queue(cfqq); 2507} 2508 2509static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2510 struct cfq_io_context *cic) 2511{ 2512 struct io_context *ioc = cic->ioc; 2513 2514 list_del_init(&cic->queue_list); 2515 2516 /* 2517 * Make sure key == NULL is seen for dead queues 2518 */ 2519 smp_wmb(); 2520 cic->dead_key = (unsigned long) cic->key; 2521 cic->key = NULL; 2522 2523 if (ioc->ioc_data == cic) 2524 rcu_assign_pointer(ioc->ioc_data, NULL); 2525 2526 if (cic->cfqq[BLK_RW_ASYNC]) { 2527 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2528 cic->cfqq[BLK_RW_ASYNC] = NULL; 2529 } 2530 2531 if (cic->cfqq[BLK_RW_SYNC]) { 2532 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2533 cic->cfqq[BLK_RW_SYNC] = NULL; 2534 } 2535} 2536 2537static void cfq_exit_single_io_context(struct io_context *ioc, 2538 struct cfq_io_context *cic) 2539{ 2540 struct cfq_data *cfqd = cic->key; 2541 2542 if (cfqd) { 2543 struct request_queue *q = cfqd->queue; 2544 unsigned long flags; 2545 2546 spin_lock_irqsave(q->queue_lock, flags); 2547 2548 /* 2549 * Ensure we get a fresh copy of the ->key to prevent 2550 * race between exiting task and queue 2551 */ 2552 smp_read_barrier_depends(); 2553 if (cic->key) 2554 __cfq_exit_single_io_context(cfqd, cic); 2555 2556 spin_unlock_irqrestore(q->queue_lock, flags); 2557 } 2558} 2559 2560/* 2561 * The process that ioc belongs to has exited, we need to clean up 2562 * and put the internal structures we have that belongs to that process. 2563 */ 2564static void cfq_exit_io_context(struct io_context *ioc) 2565{ 2566 call_for_each_cic(ioc, cfq_exit_single_io_context); 2567} 2568 2569static struct cfq_io_context * 2570cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2571{ 2572 struct cfq_io_context *cic; 2573 2574 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2575 cfqd->queue->node); 2576 if (cic) { 2577 cic->last_end_request = jiffies; 2578 INIT_LIST_HEAD(&cic->queue_list); 2579 INIT_HLIST_NODE(&cic->cic_list); 2580 cic->dtor = cfq_free_io_context; 2581 cic->exit = cfq_exit_io_context; 2582 elv_ioc_count_inc(cfq_ioc_count); 2583 } 2584 2585 return cic; 2586} 2587 2588static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2589{ 2590 struct task_struct *tsk = current; 2591 int ioprio_class; 2592 2593 if (!cfq_cfqq_prio_changed(cfqq)) 2594 return; 2595 2596 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 2597 switch (ioprio_class) { 2598 default: 2599 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2600 case IOPRIO_CLASS_NONE: 2601 /* 2602 * no prio set, inherit CPU scheduling settings 2603 */ 2604 cfqq->ioprio = task_nice_ioprio(tsk); 2605 cfqq->ioprio_class = task_nice_ioclass(tsk); 2606 break; 2607 case IOPRIO_CLASS_RT: 2608 cfqq->ioprio = task_ioprio(ioc); 2609 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2610 break; 2611 case IOPRIO_CLASS_BE: 2612 cfqq->ioprio = task_ioprio(ioc); 2613 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2614 break; 2615 case IOPRIO_CLASS_IDLE: 2616 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 2617 cfqq->ioprio = 7; 2618 cfq_clear_cfqq_idle_window(cfqq); 2619 break; 2620 } 2621 2622 /* 2623 * keep track of original prio settings in case we have to temporarily 2624 * elevate the priority of this queue 2625 */ 2626 cfqq->org_ioprio = cfqq->ioprio; 2627 
cfqq->org_ioprio_class = cfqq->ioprio_class; 2628 cfq_clear_cfqq_prio_changed(cfqq); 2629} 2630 2631static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2632{ 2633 struct cfq_data *cfqd = cic->key; 2634 struct cfq_queue *cfqq; 2635 unsigned long flags; 2636 2637 if (unlikely(!cfqd)) 2638 return; 2639 2640 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2641 2642 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2643 if (cfqq) { 2644 struct cfq_queue *new_cfqq; 2645 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2646 GFP_ATOMIC); 2647 if (new_cfqq) { 2648 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2649 cfq_put_queue(cfqq); 2650 } 2651 } 2652 2653 cfqq = cic->cfqq[BLK_RW_SYNC]; 2654 if (cfqq) 2655 cfq_mark_cfqq_prio_changed(cfqq); 2656 2657 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2658} 2659 2660static void cfq_ioc_set_ioprio(struct io_context *ioc) 2661{ 2662 call_for_each_cic(ioc, changed_ioprio); 2663 ioc->ioprio_changed = 0; 2664} 2665 2666static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2667 pid_t pid, bool is_sync) 2668{ 2669 RB_CLEAR_NODE(&cfqq->rb_node); 2670 RB_CLEAR_NODE(&cfqq->p_node); 2671 INIT_LIST_HEAD(&cfqq->fifo); 2672 2673 atomic_set(&cfqq->ref, 0); 2674 cfqq->cfqd = cfqd; 2675 2676 cfq_mark_cfqq_prio_changed(cfqq); 2677 2678 if (is_sync) { 2679 if (!cfq_class_idle(cfqq)) 2680 cfq_mark_cfqq_idle_window(cfqq); 2681 cfq_mark_cfqq_sync(cfqq); 2682 } 2683 cfqq->pid = pid; 2684} 2685 2686#ifdef CONFIG_CFQ_GROUP_IOSCHED 2687static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2688{ 2689 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2690 struct cfq_data *cfqd = cic->key; 2691 unsigned long flags; 2692 struct request_queue *q; 2693 2694 if (unlikely(!cfqd)) 2695 return; 2696 2697 q = cfqd->queue; 2698 2699 spin_lock_irqsave(q->queue_lock, flags); 2700 2701 if (sync_cfqq) { 2702 /* 2703 * Drop reference to sync queue. A new sync queue will be 2704 * assigned in new group upon arrival of a fresh request. 2705 */ 2706 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 2707 cic_set_cfqq(cic, NULL, 1); 2708 cfq_put_queue(sync_cfqq); 2709 } 2710 2711 spin_unlock_irqrestore(q->queue_lock, flags); 2712} 2713 2714static void cfq_ioc_set_cgroup(struct io_context *ioc) 2715{ 2716 call_for_each_cic(ioc, changed_cgroup); 2717 ioc->cgroup_changed = 0; 2718} 2719#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2720 2721static struct cfq_queue * 2722cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2723 struct io_context *ioc, gfp_t gfp_mask) 2724{ 2725 struct cfq_queue *cfqq, *new_cfqq = NULL; 2726 struct cfq_io_context *cic; 2727 struct cfq_group *cfqg; 2728 2729retry: 2730 cfqg = cfq_get_cfqg(cfqd, 1); 2731 cic = cfq_cic_lookup(cfqd, ioc); 2732 /* cic always exists here */ 2733 cfqq = cic_to_cfqq(cic, is_sync); 2734 2735 /* 2736 * Always try a new alloc if we fell back to the OOM cfqq 2737 * originally, since it should just be a temporary situation. 
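 *
 * The retry dance below: with a blocking gfp_mask the queue_lock is
 * dropped, a new cfq_queue is allocated, the lock is re-taken and we jump
 * back to "retry" to redo the lookup; if the lookup now succeeds, the
 * spare allocation is freed at the end.  If allocation still fails we fall
 * back to the pre-allocated cfqd->oom_cfqq.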
2738 */ 2739 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2740 cfqq = NULL; 2741 if (new_cfqq) { 2742 cfqq = new_cfqq; 2743 new_cfqq = NULL; 2744 } else if (gfp_mask & __GFP_WAIT) { 2745 spin_unlock_irq(cfqd->queue->queue_lock); 2746 new_cfqq = kmem_cache_alloc_node(cfq_pool, 2747 gfp_mask | __GFP_ZERO, 2748 cfqd->queue->node); 2749 spin_lock_irq(cfqd->queue->queue_lock); 2750 if (new_cfqq) 2751 goto retry; 2752 } else { 2753 cfqq = kmem_cache_alloc_node(cfq_pool, 2754 gfp_mask | __GFP_ZERO, 2755 cfqd->queue->node); 2756 } 2757 2758 if (cfqq) { 2759 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2760 cfq_init_prio_data(cfqq, ioc); 2761 cfq_link_cfqq_cfqg(cfqq, cfqg); 2762 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2763 } else 2764 cfqq = &cfqd->oom_cfqq; 2765 } 2766 2767 if (new_cfqq) 2768 kmem_cache_free(cfq_pool, new_cfqq); 2769 2770 return cfqq; 2771} 2772 2773static struct cfq_queue ** 2774cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 2775{ 2776 switch (ioprio_class) { 2777 case IOPRIO_CLASS_RT: 2778 return &cfqd->async_cfqq[0][ioprio]; 2779 case IOPRIO_CLASS_BE: 2780 return &cfqd->async_cfqq[1][ioprio]; 2781 case IOPRIO_CLASS_IDLE: 2782 return &cfqd->async_idle_cfqq; 2783 default: 2784 BUG(); 2785 } 2786} 2787 2788static struct cfq_queue * 2789cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 2790 gfp_t gfp_mask) 2791{ 2792 const int ioprio = task_ioprio(ioc); 2793 const int ioprio_class = task_ioprio_class(ioc); 2794 struct cfq_queue **async_cfqq = NULL; 2795 struct cfq_queue *cfqq = NULL; 2796 2797 if (!is_sync) { 2798 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 2799 cfqq = *async_cfqq; 2800 } 2801 2802 if (!cfqq) 2803 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 2804 2805 /* 2806 * pin the queue now that it's allocated, scheduler exit will prune it 2807 */ 2808 if (!is_sync && !(*async_cfqq)) { 2809 atomic_inc(&cfqq->ref); 2810 *async_cfqq = cfqq; 2811 } 2812 2813 atomic_inc(&cfqq->ref); 2814 return cfqq; 2815} 2816 2817/* 2818 * We drop cfq io contexts lazily, so we may find a dead one. 
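 *
 * "Dead" here means cic->key was cleared by __cfq_exit_single_io_context()
 * when the queue went away; cfq_cic_lookup() below spots such entries and
 * uses this helper to unlink them from the ioc's radix tree and free them.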
2819 */ 2820static void 2821cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2822 struct cfq_io_context *cic) 2823{ 2824 unsigned long flags; 2825 2826 WARN_ON(!list_empty(&cic->queue_list)); 2827 2828 spin_lock_irqsave(&ioc->lock, flags); 2829 2830 BUG_ON(ioc->ioc_data == cic); 2831 2832 radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); 2833 hlist_del_rcu(&cic->cic_list); 2834 spin_unlock_irqrestore(&ioc->lock, flags); 2835 2836 cfq_cic_free(cic); 2837} 2838 2839static struct cfq_io_context * 2840cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 2841{ 2842 struct cfq_io_context *cic; 2843 unsigned long flags; 2844 void *k; 2845 2846 if (unlikely(!ioc)) 2847 return NULL; 2848 2849 rcu_read_lock(); 2850 2851 /* 2852 * we maintain a last-hit cache, to avoid browsing over the tree 2853 */ 2854 cic = rcu_dereference(ioc->ioc_data); 2855 if (cic && cic->key == cfqd) { 2856 rcu_read_unlock(); 2857 return cic; 2858 } 2859 2860 do { 2861 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); 2862 rcu_read_unlock(); 2863 if (!cic) 2864 break; 2865 /* ->key must be copied to avoid race with cfq_exit_queue() */ 2866 k = cic->key; 2867 if (unlikely(!k)) { 2868 cfq_drop_dead_cic(cfqd, ioc, cic); 2869 rcu_read_lock(); 2870 continue; 2871 } 2872 2873 spin_lock_irqsave(&ioc->lock, flags); 2874 rcu_assign_pointer(ioc->ioc_data, cic); 2875 spin_unlock_irqrestore(&ioc->lock, flags); 2876 break; 2877 } while (1); 2878 2879 return cic; 2880} 2881 2882/* 2883 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 2884 * the process specific cfq io context when entered from the block layer. 2885 * Also adds the cic to a per-cfqd list, used when this queue is removed. 2886 */ 2887static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 2888 struct cfq_io_context *cic, gfp_t gfp_mask) 2889{ 2890 unsigned long flags; 2891 int ret; 2892 2893 ret = radix_tree_preload(gfp_mask); 2894 if (!ret) { 2895 cic->ioc = ioc; 2896 cic->key = cfqd; 2897 2898 spin_lock_irqsave(&ioc->lock, flags); 2899 ret = radix_tree_insert(&ioc->radix_root, 2900 (unsigned long) cfqd, cic); 2901 if (!ret) 2902 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 2903 spin_unlock_irqrestore(&ioc->lock, flags); 2904 2905 radix_tree_preload_end(); 2906 2907 if (!ret) { 2908 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2909 list_add(&cic->queue_list, &cfqd->cic_list); 2910 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2911 } 2912 } 2913 2914 if (ret) 2915 printk(KERN_ERR "cfq: cic link failed!\n"); 2916 2917 return ret; 2918} 2919 2920/* 2921 * Setup general io context and cfq io context. There can be several cfq 2922 * io contexts per general io context, if this process is doing io to more 2923 * than one device managed by cfq. 
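 *
 * In other words there is one io_context per task, and within it one
 * cfq_io_context per cfq-managed device, looked up in ioc->radix_root with
 * the cfq_data pointer as the key (see cfq_cic_link() above).  A task doing
 * io to two cfq devices therefore ends up with two cics hanging off the
 * same ioc.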
2924 */ 2925static struct cfq_io_context * 2926cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2927{ 2928 struct io_context *ioc = NULL; 2929 struct cfq_io_context *cic; 2930 2931 might_sleep_if(gfp_mask & __GFP_WAIT); 2932 2933 ioc = get_io_context(gfp_mask, cfqd->queue->node); 2934 if (!ioc) 2935 return NULL; 2936 2937 cic = cfq_cic_lookup(cfqd, ioc); 2938 if (cic) 2939 goto out; 2940 2941 cic = cfq_alloc_io_context(cfqd, gfp_mask); 2942 if (cic == NULL) 2943 goto err; 2944 2945 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 2946 goto err_free; 2947 2948out: 2949 smp_read_barrier_depends(); 2950 if (unlikely(ioc->ioprio_changed)) 2951 cfq_ioc_set_ioprio(ioc); 2952 2953#ifdef CONFIG_CFQ_GROUP_IOSCHED 2954 if (unlikely(ioc->cgroup_changed)) 2955 cfq_ioc_set_cgroup(ioc); 2956#endif 2957 return cic; 2958err_free: 2959 cfq_cic_free(cic); 2960err: 2961 put_io_context(ioc); 2962 return NULL; 2963} 2964 2965static void 2966cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 2967{ 2968 unsigned long elapsed = jiffies - cic->last_end_request; 2969 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 2970 2971 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 2972 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 2973 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 2974} 2975 2976static void 2977cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2978 struct request *rq) 2979{ 2980 sector_t sdist; 2981 u64 total; 2982 2983 if (!cfqq->last_request_pos) 2984 sdist = 0; 2985 else if (cfqq->last_request_pos < blk_rq_pos(rq)) 2986 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 2987 else 2988 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 2989 2990 /* 2991 * Don't allow the seek distance to get too large from the 2992 * odd fragment, pagein, etc 2993 */ 2994 if (cfqq->seek_samples <= 60) /* second&third seek */ 2995 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); 2996 else 2997 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); 2998 2999 cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; 3000 cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; 3001 total = cfqq->seek_total + (cfqq->seek_samples/2); 3002 do_div(total, cfqq->seek_samples); 3003 cfqq->seek_mean = (sector_t)total; 3004 3005 /* 3006 * If this cfqq is shared between multiple processes, check to 3007 * make sure that those processes are still issuing I/Os within 3008 * the mean seek distance. If not, it may be time to break the 3009 * queues apart again. 
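 *
 * seeky_start below records when the shared queue first turned seeky
 * again; should_split_cfqq() later compares it against CFQQ_COOP_TOUT and,
 * if the queue has stayed seeky that long, cfq_set_request() breaks the
 * shared queue back apart via split_cfqq().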
3010 */ 3011 if (cfq_cfqq_coop(cfqq)) { 3012 if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) 3013 cfqq->seeky_start = jiffies; 3014 else if (!CFQQ_SEEKY(cfqq)) 3015 cfqq->seeky_start = 0; 3016 } 3017} 3018 3019/* 3020 * Disable idle window if the process thinks too long or seeks so much that 3021 * it doesn't matter 3022 */ 3023static void 3024cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3025 struct cfq_io_context *cic) 3026{ 3027 int old_idle, enable_idle; 3028 3029 /* 3030 * Don't idle for async or idle io prio class 3031 */ 3032 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3033 return; 3034 3035 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3036 3037 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3038 cfq_mark_cfqq_deep(cfqq); 3039 3040 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3041 (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) 3042 && CFQQ_SEEKY(cfqq))) 3043 enable_idle = 0; 3044 else if (sample_valid(cic->ttime_samples)) { 3045 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3046 enable_idle = 0; 3047 else 3048 enable_idle = 1; 3049 } 3050 3051 if (old_idle != enable_idle) { 3052 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3053 if (enable_idle) 3054 cfq_mark_cfqq_idle_window(cfqq); 3055 else 3056 cfq_clear_cfqq_idle_window(cfqq); 3057 } 3058} 3059 3060/* 3061 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3062 * no or if we aren't sure, a 1 will cause a preempt. 3063 */ 3064static bool 3065cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3066 struct request *rq) 3067{ 3068 struct cfq_queue *cfqq; 3069 3070 cfqq = cfqd->active_queue; 3071 if (!cfqq) 3072 return false; 3073 3074 if (cfq_class_idle(new_cfqq)) 3075 return false; 3076 3077 if (cfq_class_idle(cfqq)) 3078 return true; 3079 3080 /* 3081 * if the new request is sync, but the currently running queue is 3082 * not, let the sync request have priority. 3083 */ 3084 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3085 return true; 3086 3087 if (new_cfqq->cfqg != cfqq->cfqg) 3088 return false; 3089 3090 if (cfq_slice_used(cfqq)) 3091 return true; 3092 3093 /* Allow preemption only if we are idling on sync-noidle tree */ 3094 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3095 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3096 new_cfqq->service_tree->count == 2 && 3097 RB_EMPTY_ROOT(&cfqq->sort_list)) 3098 return true; 3099 3100 /* 3101 * So both queues are sync. Let the new request get disk time if 3102 * it's a metadata request and the current queue is doing regular IO. 3103 */ 3104 if (rq_is_meta(rq) && !cfqq->meta_pending) 3105 return true; 3106 3107 /* 3108 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3109 */ 3110 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3111 return true; 3112 3113 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3114 return false; 3115 3116 /* 3117 * if this request is as-good as one we would expect from the 3118 * current cfqq, let it preempt 3119 */ 3120 if (cfq_rq_close(cfqd, cfqq, rq)) 3121 return true; 3122 3123 return false; 3124} 3125 3126/* 3127 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3128 * let it have half of its nominal slice. 
3129 */ 3130static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3131{ 3132 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3133 cfq_slice_expired(cfqd, 1); 3134 3135 /* 3136 * Put the new queue at the front of the of the current list, 3137 * so we know that it will be selected next. 3138 */ 3139 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3140 3141 cfq_service_tree_add(cfqd, cfqq, 1); 3142 3143 cfqq->slice_end = 0; 3144 cfq_mark_cfqq_slice_new(cfqq); 3145} 3146 3147/* 3148 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3149 * something we should do about it 3150 */ 3151static void 3152cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3153 struct request *rq) 3154{ 3155 struct cfq_io_context *cic = RQ_CIC(rq); 3156 3157 cfqd->rq_queued++; 3158 if (rq_is_meta(rq)) 3159 cfqq->meta_pending++; 3160 3161 cfq_update_io_thinktime(cfqd, cic); 3162 cfq_update_io_seektime(cfqd, cfqq, rq); 3163 cfq_update_idle_window(cfqd, cfqq, cic); 3164 3165 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3166 3167 if (cfqq == cfqd->active_queue) { 3168 if (cfq_cfqq_wait_busy(cfqq)) { 3169 cfq_clear_cfqq_wait_busy(cfqq); 3170 cfq_mark_cfqq_wait_busy_done(cfqq); 3171 } 3172 /* 3173 * Remember that we saw a request from this process, but 3174 * don't start queuing just yet. Otherwise we risk seeing lots 3175 * of tiny requests, because we disrupt the normal plugging 3176 * and merging. If the request is already larger than a single 3177 * page, let it rip immediately. For that case we assume that 3178 * merging is already done. Ditto for a busy system that 3179 * has other work pending, don't risk delaying until the 3180 * idle timer unplug to continue working. 3181 */ 3182 if (cfq_cfqq_wait_request(cfqq)) { 3183 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3184 cfqd->busy_queues > 1) { 3185 del_timer(&cfqd->idle_slice_timer); 3186 __blk_run_queue(cfqd->queue); 3187 } else 3188 cfq_mark_cfqq_must_dispatch(cfqq); 3189 } 3190 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3191 /* 3192 * not the active queue - expire current slice if it is 3193 * idle and has expired it's mean thinktime or this new queue 3194 * has some old slice time left and is of higher priority or 3195 * this new queue is RT and the current one is BE 3196 */ 3197 cfq_preempt_queue(cfqd, cfqq); 3198 __blk_run_queue(cfqd->queue); 3199 } 3200} 3201 3202static void cfq_insert_request(struct request_queue *q, struct request *rq) 3203{ 3204 struct cfq_data *cfqd = q->elevator->elevator_data; 3205 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3206 3207 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3208 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3209 3210 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3211 list_add_tail(&rq->queuelist, &cfqq->fifo); 3212 cfq_add_rq_rb(rq); 3213 3214 cfq_rq_enqueued(cfqd, cfqq, rq); 3215} 3216 3217/* 3218 * Update hw_tag based on peak queue depth over 50 samples under 3219 * sufficient load. 3220 */ 3221static void cfq_update_hw_tag(struct cfq_data *cfqd) 3222{ 3223 struct cfq_queue *cfqq = cfqd->active_queue; 3224 3225 if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) 3226 cfqd->hw_tag_est_depth = rq_in_driver(cfqd); 3227 3228 if (cfqd->hw_tag == 1) 3229 return; 3230 3231 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3232 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 3233 return; 3234 3235 /* 3236 * If active queue hasn't enough requests and can idle, cfq might not 3237 * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this 3238 * case 3239 */ 3240 if (cfqq && cfq_cfqq_idle_window(cfqq) && 3241 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 3242 CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) 3243 return; 3244 3245 if (cfqd->hw_tag_samples++ < 50) 3246 return; 3247 3248 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3249 cfqd->hw_tag = 1; 3250 else 3251 cfqd->hw_tag = 0; 3252} 3253 3254static void cfq_completed_request(struct request_queue *q, struct request *rq) 3255{ 3256 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3257 struct cfq_data *cfqd = cfqq->cfqd; 3258 const int sync = rq_is_sync(rq); 3259 unsigned long now; 3260 3261 now = jiffies; 3262 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); 3263 3264 cfq_update_hw_tag(cfqd); 3265 3266 WARN_ON(!cfqd->rq_in_driver[sync]); 3267 WARN_ON(!cfqq->dispatched); 3268 cfqd->rq_in_driver[sync]--; 3269 cfqq->dispatched--; 3270 3271 if (cfq_cfqq_sync(cfqq)) 3272 cfqd->sync_flight--; 3273 3274 if (sync) { 3275 RQ_CIC(rq)->last_end_request = now; 3276 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3277 cfqd->last_delayed_sync = now; 3278 } 3279 3280 /* 3281 * If this is the active queue, check if it needs to be expired, 3282 * or if we want to idle in case it has no pending requests. 3283 */ 3284 if (cfqd->active_queue == cfqq) { 3285 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3286 3287 if (cfq_cfqq_slice_new(cfqq)) { 3288 cfq_set_prio_slice(cfqd, cfqq); 3289 cfq_clear_cfqq_slice_new(cfqq); 3290 } 3291 3292 /* 3293 * If this queue consumed its slice and this is last queue 3294 * in the group, wait for next request before we expire 3295 * the queue 3296 */ 3297 if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { 3298 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3299 cfq_mark_cfqq_wait_busy(cfqq); 3300 } 3301 3302 /* 3303 * Idling is not enabled on: 3304 * - expired queues 3305 * - idle-priority queues 3306 * - async queues 3307 * - queues with still some requests queued 3308 * - when there is a close cooperator 3309 */ 3310 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3311 cfq_slice_expired(cfqd, 1); 3312 else if (sync && cfqq_empty && 3313 !cfq_close_cooperator(cfqd, cfqq)) { 3314 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); 3315 /* 3316 * Idling is enabled for SYNC_WORKLOAD. 3317 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3318 * only if we processed at least one !rq_noidle request 3319 */ 3320 if (cfqd->serving_type == SYNC_WORKLOAD 3321 || cfqd->noidle_tree_requires_idle 3322 || cfqq->cfqg->nr_cfqq == 1) 3323 cfq_arm_slice_timer(cfqd); 3324 } 3325 } 3326 3327 if (!rq_in_driver(cfqd)) 3328 cfq_schedule_dispatch(cfqd); 3329} 3330 3331/* 3332 * we temporarily boost lower priority queues if they are holding fs exclusive 3333 * resources. 
they are boosted to normal prio (CLASS_BE/4) 3334 */ 3335static void cfq_prio_boost(struct cfq_queue *cfqq) 3336{ 3337 if (has_fs_excl()) { 3338 /* 3339 * boost idle prio on transactions that would lock out other 3340 * users of the filesystem 3341 */ 3342 if (cfq_class_idle(cfqq)) 3343 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3344 if (cfqq->ioprio > IOPRIO_NORM) 3345 cfqq->ioprio = IOPRIO_NORM; 3346 } else { 3347 /* 3348 * unboost the queue (if needed) 3349 */ 3350 cfqq->ioprio_class = cfqq->org_ioprio_class; 3351 cfqq->ioprio = cfqq->org_ioprio; 3352 } 3353} 3354 3355static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3356{ 3357 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3358 cfq_mark_cfqq_must_alloc_slice(cfqq); 3359 return ELV_MQUEUE_MUST; 3360 } 3361 3362 return ELV_MQUEUE_MAY; 3363} 3364 3365static int cfq_may_queue(struct request_queue *q, int rw) 3366{ 3367 struct cfq_data *cfqd = q->elevator->elevator_data; 3368 struct task_struct *tsk = current; 3369 struct cfq_io_context *cic; 3370 struct cfq_queue *cfqq; 3371 3372 /* 3373 * don't force setup of a queue from here, as a call to may_queue 3374 * does not necessarily imply that a request actually will be queued. 3375 * so just lookup a possibly existing queue, or return 'may queue' 3376 * if that fails 3377 */ 3378 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3379 if (!cic) 3380 return ELV_MQUEUE_MAY; 3381 3382 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3383 if (cfqq) { 3384 cfq_init_prio_data(cfqq, cic->ioc); 3385 cfq_prio_boost(cfqq); 3386 3387 return __cfq_may_queue(cfqq); 3388 } 3389 3390 return ELV_MQUEUE_MAY; 3391} 3392 3393/* 3394 * queue lock held here 3395 */ 3396static void cfq_put_request(struct request *rq) 3397{ 3398 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3399 3400 if (cfqq) { 3401 const int rw = rq_data_dir(rq); 3402 3403 BUG_ON(!cfqq->allocated[rw]); 3404 cfqq->allocated[rw]--; 3405 3406 put_io_context(RQ_CIC(rq)->ioc); 3407 3408 rq->elevator_private = NULL; 3409 rq->elevator_private2 = NULL; 3410 3411 cfq_put_queue(cfqq); 3412 } 3413} 3414 3415static struct cfq_queue * 3416cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3417 struct cfq_queue *cfqq) 3418{ 3419 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3420 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3421 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3422 cfq_put_queue(cfqq); 3423 return cic_to_cfqq(cic, 1); 3424} 3425 3426static int should_split_cfqq(struct cfq_queue *cfqq) 3427{ 3428 if (cfqq->seeky_start && 3429 time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) 3430 return 1; 3431 return 0; 3432} 3433 3434/* 3435 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3436 * was the last process referring to said cfqq. 3437 */ 3438static struct cfq_queue * 3439split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3440{ 3441 if (cfqq_process_refs(cfqq) == 1) { 3442 cfqq->seeky_start = 0; 3443 cfqq->pid = current->pid; 3444 cfq_clear_cfqq_coop(cfqq); 3445 return cfqq; 3446 } 3447 3448 cic_set_cfqq(cic, NULL, 1); 3449 cfq_put_queue(cfqq); 3450 return NULL; 3451} 3452/* 3453 * Allocate cfq data structures associated with this request. 
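 *
 * On success the request leaves here with rq->elevator_private pointing at
 * the cic and rq->elevator_private2 at the cfqq, with cfqq->allocated[rw]
 * and cfqq->ref bumped accordingly; cfq_put_request() undoes all of this
 * when the request is freed.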
3454 */ 3455static int 3456cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3457{ 3458 struct cfq_data *cfqd = q->elevator->elevator_data; 3459 struct cfq_io_context *cic; 3460 const int rw = rq_data_dir(rq); 3461 const bool is_sync = rq_is_sync(rq); 3462 struct cfq_queue *cfqq; 3463 unsigned long flags; 3464 3465 might_sleep_if(gfp_mask & __GFP_WAIT); 3466 3467 cic = cfq_get_io_context(cfqd, gfp_mask); 3468 3469 spin_lock_irqsave(q->queue_lock, flags); 3470 3471 if (!cic) 3472 goto queue_fail; 3473 3474new_queue: 3475 cfqq = cic_to_cfqq(cic, is_sync); 3476 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3477 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3478 cic_set_cfqq(cic, cfqq, is_sync); 3479 } else { 3480 /* 3481 * If the queue was seeky for too long, break it apart. 3482 */ 3483 if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { 3484 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3485 cfqq = split_cfqq(cic, cfqq); 3486 if (!cfqq) 3487 goto new_queue; 3488 } 3489 3490 /* 3491 * Check to see if this queue is scheduled to merge with 3492 * another, closely cooperating queue. The merging of 3493 * queues happens here as it must be done in process context. 3494 * The reference on new_cfqq was taken in merge_cfqqs. 3495 */ 3496 if (cfqq->new_cfqq) 3497 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3498 } 3499 3500 cfqq->allocated[rw]++; 3501 atomic_inc(&cfqq->ref); 3502 3503 spin_unlock_irqrestore(q->queue_lock, flags); 3504 3505 rq->elevator_private = cic; 3506 rq->elevator_private2 = cfqq; 3507 return 0; 3508 3509queue_fail: 3510 if (cic) 3511 put_io_context(cic->ioc); 3512 3513 cfq_schedule_dispatch(cfqd); 3514 spin_unlock_irqrestore(q->queue_lock, flags); 3515 cfq_log(cfqd, "set_request fail"); 3516 return 1; 3517} 3518 3519static void cfq_kick_queue(struct work_struct *work) 3520{ 3521 struct cfq_data *cfqd = 3522 container_of(work, struct cfq_data, unplug_work); 3523 struct request_queue *q = cfqd->queue; 3524 3525 spin_lock_irq(q->queue_lock); 3526 __blk_run_queue(cfqd->queue); 3527 spin_unlock_irq(q->queue_lock); 3528} 3529 3530/* 3531 * Timer running if the active_queue is currently idling inside its time slice 3532 */ 3533static void cfq_idle_slice_timer(unsigned long data) 3534{ 3535 struct cfq_data *cfqd = (struct cfq_data *) data; 3536 struct cfq_queue *cfqq; 3537 unsigned long flags; 3538 int timed_out = 1; 3539 3540 cfq_log(cfqd, "idle timer fired"); 3541 3542 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3543 3544 cfqq = cfqd->active_queue; 3545 if (cfqq) { 3546 timed_out = 0; 3547 3548 /* 3549 * We saw a request before the queue expired, let it through 3550 */ 3551 if (cfq_cfqq_must_dispatch(cfqq)) 3552 goto out_kick; 3553 3554 /* 3555 * expired 3556 */ 3557 if (cfq_slice_used(cfqq)) 3558 goto expire; 3559 3560 /* 3561 * only expire and reinvoke request handler, if there are 3562 * other queues with pending requests 3563 */ 3564 if (!cfqd->busy_queues) 3565 goto out_cont; 3566 3567 /* 3568 * not expired and it has a request pending, let it dispatch 3569 */ 3570 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3571 goto out_kick; 3572 3573 /* 3574 * Queue depth flag is reset only when the idle didn't succeed 3575 */ 3576 cfq_clear_cfqq_deep(cfqq); 3577 } 3578expire: 3579 cfq_slice_expired(cfqd, timed_out); 3580out_kick: 3581 cfq_schedule_dispatch(cfqd); 3582out_cont: 3583 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3584} 3585 3586static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 3587{ 3588 del_timer_sync(&cfqd->idle_slice_timer); 
3589 cancel_work_sync(&cfqd->unplug_work); 3590} 3591 3592static void cfq_put_async_queues(struct cfq_data *cfqd) 3593{ 3594 int i; 3595 3596 for (i = 0; i < IOPRIO_BE_NR; i++) { 3597 if (cfqd->async_cfqq[0][i]) 3598 cfq_put_queue(cfqd->async_cfqq[0][i]); 3599 if (cfqd->async_cfqq[1][i]) 3600 cfq_put_queue(cfqd->async_cfqq[1][i]); 3601 } 3602 3603 if (cfqd->async_idle_cfqq) 3604 cfq_put_queue(cfqd->async_idle_cfqq); 3605} 3606 3607static void cfq_cfqd_free(struct rcu_head *head) 3608{ 3609 kfree(container_of(head, struct cfq_data, rcu)); 3610} 3611 3612static void cfq_exit_queue(struct elevator_queue *e) 3613{ 3614 struct cfq_data *cfqd = e->elevator_data; 3615 struct request_queue *q = cfqd->queue; 3616 3617 cfq_shutdown_timer_wq(cfqd); 3618 3619 spin_lock_irq(q->queue_lock); 3620 3621 if (cfqd->active_queue) 3622 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3623 3624 while (!list_empty(&cfqd->cic_list)) { 3625 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3626 struct cfq_io_context, 3627 queue_list); 3628 3629 __cfq_exit_single_io_context(cfqd, cic); 3630 } 3631 3632 cfq_put_async_queues(cfqd); 3633 cfq_release_cfq_groups(cfqd); 3634 blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3635 3636 spin_unlock_irq(q->queue_lock); 3637 3638 cfq_shutdown_timer_wq(cfqd); 3639 3640 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3641 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3642} 3643 3644static void *cfq_init_queue(struct request_queue *q) 3645{ 3646 struct cfq_data *cfqd; 3647 int i, j; 3648 struct cfq_group *cfqg; 3649 struct cfq_rb_root *st; 3650 3651 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3652 if (!cfqd) 3653 return NULL; 3654 3655 /* Init root service tree */ 3656 cfqd->grp_service_tree = CFQ_RB_ROOT; 3657 3658 /* Init root group */ 3659 cfqg = &cfqd->root_group; 3660 for_each_cfqg_st(cfqg, i, j, st) 3661 *st = CFQ_RB_ROOT; 3662 RB_CLEAR_NODE(&cfqg->rb_node); 3663 3664 /* Give preference to root group over other groups */ 3665 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3666 3667#ifdef CONFIG_CFQ_GROUP_IOSCHED 3668 /* 3669 * Take a reference to root group which we never drop. This is just 3670 * to make sure that cfq_put_cfqg() does not try to kfree root group 3671 */ 3672 atomic_set(&cfqg->ref, 1); 3673 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 3674 0); 3675#endif 3676 /* 3677 * Not strictly needed (since RB_ROOT just clears the node and we 3678 * zeroed cfqd on alloc), but better be safe in case someone decides 3679 * to add magic to the rb code 3680 */ 3681 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3682 cfqd->prio_trees[i] = RB_ROOT; 3683 3684 /* 3685 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 3686 * Grab a permanent reference to it, so that the normal code flow 3687 * will not attempt to free it. 
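 *
 * The extra atomic_inc() below is that permanent reference: oom_cfqq is
 * embedded in cfqd rather than allocated from cfq_pool, so it must never
 * reach cfq_put_queue()'s kmem_cache_free() path.  Callers that were handed
 * oom_cfqq simply retry a real allocation on a later request (see
 * cfq_find_alloc_queue() and cfq_set_request()).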
3688 */ 3689 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3690 atomic_inc(&cfqd->oom_cfqq.ref); 3691 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3692 3693 INIT_LIST_HEAD(&cfqd->cic_list); 3694 3695 cfqd->queue = q; 3696 3697 init_timer(&cfqd->idle_slice_timer); 3698 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3699 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3700 3701 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3702 3703 cfqd->cfq_quantum = cfq_quantum; 3704 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3705 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3706 cfqd->cfq_back_max = cfq_back_max; 3707 cfqd->cfq_back_penalty = cfq_back_penalty; 3708 cfqd->cfq_slice[0] = cfq_slice_async; 3709 cfqd->cfq_slice[1] = cfq_slice_sync; 3710 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3711 cfqd->cfq_slice_idle = cfq_slice_idle; 3712 cfqd->cfq_latency = 1; 3713 cfqd->cfq_group_isolation = 0; 3714 cfqd->hw_tag = -1; 3715 cfqd->last_delayed_sync = jiffies - HZ; 3716 INIT_RCU_HEAD(&cfqd->rcu); 3717 return cfqd; 3718} 3719 3720static void cfq_slab_kill(void) 3721{ 3722 /* 3723 * Caller already ensured that pending RCU callbacks are completed, 3724 * so we should have no busy allocations at this point. 3725 */ 3726 if (cfq_pool) 3727 kmem_cache_destroy(cfq_pool); 3728 if (cfq_ioc_pool) 3729 kmem_cache_destroy(cfq_ioc_pool); 3730} 3731 3732static int __init cfq_slab_setup(void) 3733{ 3734 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3735 if (!cfq_pool) 3736 goto fail; 3737 3738 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3739 if (!cfq_ioc_pool) 3740 goto fail; 3741 3742 return 0; 3743fail: 3744 cfq_slab_kill(); 3745 return -ENOMEM; 3746} 3747 3748/* 3749 * sysfs parts below --> 3750 */ 3751static ssize_t 3752cfq_var_show(unsigned int var, char *page) 3753{ 3754 return sprintf(page, "%d\n", var); 3755} 3756 3757static ssize_t 3758cfq_var_store(unsigned int *var, const char *page, size_t count) 3759{ 3760 char *p = (char *) page; 3761 3762 *var = simple_strtoul(p, &p, 10); 3763 return count; 3764} 3765 3766#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 3767static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 3768{ \ 3769 struct cfq_data *cfqd = e->elevator_data; \ 3770 unsigned int __data = __VAR; \ 3771 if (__CONV) \ 3772 __data = jiffies_to_msecs(__data); \ 3773 return cfq_var_show(__data, (page)); \ 3774} 3775SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 3776SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 3777SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 3778SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 3779SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 3780SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 3781SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 3782SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3783SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3784SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3785SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 3786#undef SHOW_FUNCTION 3787 3788#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 3789static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 3790{ \ 3791 struct cfq_data *cfqd = e->elevator_data; \ 3792 unsigned int __data; \ 3793 int ret = cfq_var_store(&__data, (page), count); \ 3794 if (__data < (MIN)) \ 3795 __data = (MIN); \ 3796 else if (__data > (MAX)) \ 3797 __data = (MAX); \ 3798 if 
(__CONV) \ 3799 *(__PTR) = msecs_to_jiffies(__data); \ 3800 else \ 3801 *(__PTR) = __data; \ 3802 return ret; \ 3803} 3804STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 3805STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 3806 UINT_MAX, 1); 3807STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 3808 UINT_MAX, 1); 3809STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 3810STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 3811 UINT_MAX, 0); 3812STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 3813STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 3814STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 3815STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3816 UINT_MAX, 0); 3817STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 3818STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); 3819#undef STORE_FUNCTION 3820 3821#define CFQ_ATTR(name) \ 3822 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 3823 3824static struct elv_fs_entry cfq_attrs[] = { 3825 CFQ_ATTR(quantum), 3826 CFQ_ATTR(fifo_expire_sync), 3827 CFQ_ATTR(fifo_expire_async), 3828 CFQ_ATTR(back_seek_max), 3829 CFQ_ATTR(back_seek_penalty), 3830 CFQ_ATTR(slice_sync), 3831 CFQ_ATTR(slice_async), 3832 CFQ_ATTR(slice_async_rq), 3833 CFQ_ATTR(slice_idle), 3834 CFQ_ATTR(low_latency), 3835 CFQ_ATTR(group_isolation), 3836 __ATTR_NULL 3837}; 3838 3839static struct elevator_type iosched_cfq = { 3840 .ops = { 3841 .elevator_merge_fn = cfq_merge, 3842 .elevator_merged_fn = cfq_merged_request, 3843 .elevator_merge_req_fn = cfq_merged_requests, 3844 .elevator_allow_merge_fn = cfq_allow_merge, 3845 .elevator_dispatch_fn = cfq_dispatch_requests, 3846 .elevator_add_req_fn = cfq_insert_request, 3847 .elevator_activate_req_fn = cfq_activate_request, 3848 .elevator_deactivate_req_fn = cfq_deactivate_request, 3849 .elevator_queue_empty_fn = cfq_queue_empty, 3850 .elevator_completed_req_fn = cfq_completed_request, 3851 .elevator_former_req_fn = elv_rb_former_request, 3852 .elevator_latter_req_fn = elv_rb_latter_request, 3853 .elevator_set_req_fn = cfq_set_request, 3854 .elevator_put_req_fn = cfq_put_request, 3855 .elevator_may_queue_fn = cfq_may_queue, 3856 .elevator_init_fn = cfq_init_queue, 3857 .elevator_exit_fn = cfq_exit_queue, 3858 .trim = cfq_free_io_context, 3859 }, 3860 .elevator_attrs = cfq_attrs, 3861 .elevator_name = "cfq", 3862 .elevator_owner = THIS_MODULE, 3863}; 3864 3865#ifdef CONFIG_CFQ_GROUP_IOSCHED 3866static struct blkio_policy_type blkio_policy_cfq = { 3867 .ops = { 3868 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 3869 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 3870 }, 3871}; 3872#else 3873static struct blkio_policy_type blkio_policy_cfq; 3874#endif 3875 3876static int __init cfq_init(void) 3877{ 3878 /* 3879 * could be 0 on HZ < 1000 setups 3880 */ 3881 if (!cfq_slice_async) 3882 cfq_slice_async = 1; 3883 if (!cfq_slice_idle) 3884 cfq_slice_idle = 1; 3885 3886 if (cfq_slab_setup()) 3887 return -ENOMEM; 3888 3889 elv_register(&iosched_cfq); 3890 blkio_policy_register(&blkio_policy_cfq); 3891 3892 return 0; 3893} 3894 3895static void __exit cfq_exit(void) 3896{ 3897 DECLARE_COMPLETION_ONSTACK(all_gone); 3898 blkio_policy_unregister(&blkio_policy_cfq); 3899 elv_unregister(&iosched_cfq); 3900 ioc_gone = &all_gone; 3901 /* ioc_gone's 
update must be visible before reading ioc_count */ 3902 smp_wmb(); 3903 3904 /* 3905 * this also protects us from entering cfq_slab_kill() with 3906 * pending RCU callbacks 3907 */ 3908 if (elv_ioc_count_read(cfq_ioc_count)) 3909 wait_for_completion(&all_gone); 3910 cfq_slab_kill(); 3911} 3912 3913module_init(cfq_init); 3914module_exit(cfq_exit); 3915 3916MODULE_AUTHOR("Jens Axboe"); 3917MODULE_LICENSE("GPL"); 3918MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 3919
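/*
 * Illustrative note, a sketch of how the sysfs glue above expands under
 * ordinary C preprocessing (not additional scheduler code):
 * SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1) generates
 * roughly
 *
 *	static ssize_t cfq_slice_idle_show(struct elevator_queue *e, char *page)
 *	{
 *		struct cfq_data *cfqd = e->elevator_data;
 *		unsigned int __data = cfqd->cfq_slice_idle;
 *		if (1)
 *			__data = jiffies_to_msecs(__data);
 *		return cfq_var_show(__data, (page));
 *	}
 *
 * so slice_idle is reported in milliseconds, and the matching
 * STORE_FUNCTION converts a written value back with msecs_to_jiffies().
 * These attributes typically appear under /sys/block/<dev>/queue/iosched/
 * once "cfq" is the active elevator.
 */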