cfq-iosched.c revision 9d6a986c0b276085f7944cd8ad65f4f82aff7536
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/blkdev.h> 11#include <linux/elevator.h> 12#include <linux/jiffies.h> 13#include <linux/rbtree.h> 14#include <linux/ioprio.h> 15#include <linux/blktrace_api.h> 16#include "blk-cgroup.h" 17#include "cfq-iosched.h" 18 19/* 20 * tunables 21 */ 22/* max queue in one round of service */ 23static const int cfq_quantum = 4; 24static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 25/* maximum backwards seek, in KiB */ 26static const int cfq_back_max = 16 * 1024; 27/* penalty of a backwards seek */ 28static const int cfq_back_penalty = 2; 29static const int cfq_slice_sync = HZ / 10; 30static int cfq_slice_async = HZ / 25; 31static const int cfq_slice_async_rq = 2; 32static int cfq_slice_idle = HZ / 125; 33static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 34static const int cfq_hist_divisor = 4; 35 36/* 37 * offset from end of service tree 38 */ 39#define CFQ_IDLE_DELAY (HZ / 5) 40 41/* 42 * below this threshold, we consider thinktime immediate 43 */ 44#define CFQ_MIN_TT (2) 45 46/* 47 * Allow merged cfqqs to perform this amount of seeky I/O before 48 * deciding to break the queues up again. 49 */ 50#define CFQQ_COOP_TOUT (HZ) 51 52#define CFQ_SLICE_SCALE (5) 53#define CFQ_HW_QUEUE_MIN (5) 54#define CFQ_SERVICE_SHIFT 12 55 56#define RQ_CIC(rq) \ 57 ((struct cfq_io_context *) (rq)->elevator_private) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 59 60static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_ioc_pool; 62 63static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 64static struct completion *ioc_gone; 65static DEFINE_SPINLOCK(ioc_gone_lock); 66 67#define CFQ_PRIO_LISTS IOPRIO_BE_NR 68#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 69#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 70 71#define sample_valid(samples) ((samples) > 80) 72#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 73 74/* 75 * Most of our rbtree usage is for sorting with min extraction, so 76 * if we cache the leftmost node we don't have to walk down the tree 77 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 78 * move this into the elevator for the rq sorting as well. 79 */ 80struct cfq_rb_root { 81 struct rb_root rb; 82 struct rb_node *left; 83 unsigned count; 84 u64 min_vdisktime; 85 struct rb_node *active; 86 unsigned total_weight; 87}; 88#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } 89 90/* 91 * Per process-grouping structure 92 */ 93struct cfq_queue { 94 /* reference count */ 95 atomic_t ref; 96 /* various state flags, see below */ 97 unsigned int flags; 98 /* parent cfq_data */ 99 struct cfq_data *cfqd; 100 /* service_tree member */ 101 struct rb_node rb_node; 102 /* service_tree key */ 103 unsigned long rb_key; 104 /* prio tree member */ 105 struct rb_node p_node; 106 /* prio tree root we belong to, if any */ 107 struct rb_root *p_root; 108 /* sorted list of pending requests */ 109 struct rb_root sort_list; 110 /* if fifo isn't expired, next request to serve */ 111 struct request *next_rq; 112 /* requests queued in sort_list */ 113 int queued[2]; 114 /* currently allocated requests */ 115 int allocated[2]; 116 /* fifo list of requests in sort_list */ 117 struct list_head fifo; 118 119 /* time when queue got scheduled in to dispatch first request. */ 120 unsigned long dispatch_start; 121 unsigned int allocated_slice; 122 /* time when first request from queue completed and slice started. */ 123 unsigned long slice_start; 124 unsigned long slice_end; 125 long slice_resid; 126 unsigned int slice_dispatch; 127 128 /* pending metadata requests */ 129 int meta_pending; 130 /* number of requests that are on the dispatch list or inside driver */ 131 int dispatched; 132 133 /* io prio of this group */ 134 unsigned short ioprio, org_ioprio; 135 unsigned short ioprio_class, org_ioprio_class; 136 137 unsigned int seek_samples; 138 u64 seek_total; 139 sector_t seek_mean; 140 sector_t last_request_pos; 141 unsigned long seeky_start; 142 143 pid_t pid; 144 145 struct cfq_rb_root *service_tree; 146 struct cfq_queue *new_cfqq; 147 struct cfq_group *cfqg; 148 struct cfq_group *orig_cfqg; 149 /* Sectors dispatched in current dispatch round */ 150 unsigned long nr_sectors; 151}; 152 153/* 154 * First index in the service_trees. 155 * IDLE is handled separately, so it has negative index 156 */ 157enum wl_prio_t { 158 BE_WORKLOAD = 0, 159 RT_WORKLOAD = 1, 160 IDLE_WORKLOAD = 2, 161}; 162 163/* 164 * Second index in the service_trees. 165 */ 166enum wl_type_t { 167 ASYNC_WORKLOAD = 0, 168 SYNC_NOIDLE_WORKLOAD = 1, 169 SYNC_WORKLOAD = 2 170}; 171 172/* This is per cgroup per device grouping structure */ 173struct cfq_group { 174 /* group service_tree member */ 175 struct rb_node rb_node; 176 177 /* group service_tree key */ 178 u64 vdisktime; 179 unsigned int weight; 180 bool on_st; 181 182 /* number of cfqq currently on this group */ 183 int nr_cfqq; 184 185 /* Per group busy queus average. Useful for workload slice calc. */ 186 unsigned int busy_queues_avg[2]; 187 /* 188 * rr lists of queues with requests, onle rr for each priority class. 189 * Counts are embedded in the cfq_rb_root 190 */ 191 struct cfq_rb_root service_trees[2][3]; 192 struct cfq_rb_root service_tree_idle; 193 194 unsigned long saved_workload_slice; 195 enum wl_type_t saved_workload; 196 enum wl_prio_t saved_serving_prio; 197 struct blkio_group blkg; 198#ifdef CONFIG_CFQ_GROUP_IOSCHED 199 struct hlist_node cfqd_node; 200 atomic_t ref; 201#endif 202}; 203 204/* 205 * Per block device queue structure 206 */ 207struct cfq_data { 208 struct request_queue *queue; 209 /* Root service tree for cfq_groups */ 210 struct cfq_rb_root grp_service_tree; 211 struct cfq_group root_group; 212 /* Number of active cfq groups on group service tree */ 213 int nr_groups; 214 215 /* 216 * The priority currently being served 217 */ 218 enum wl_prio_t serving_prio; 219 enum wl_type_t serving_type; 220 unsigned long workload_expires; 221 struct cfq_group *serving_group; 222 bool noidle_tree_requires_idle; 223 224 /* 225 * Each priority tree is sorted by next_request position. These 226 * trees are used when determining if two or more queues are 227 * interleaving requests (see cfq_close_cooperator). 228 */ 229 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 230 231 unsigned int busy_queues; 232 233 int rq_in_driver[2]; 234 int sync_flight; 235 236 /* 237 * queue-depth detection 238 */ 239 int rq_queued; 240 int hw_tag; 241 /* 242 * hw_tag can be 243 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 244 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 245 * 0 => no NCQ 246 */ 247 int hw_tag_est_depth; 248 unsigned int hw_tag_samples; 249 250 /* 251 * idle window management 252 */ 253 struct timer_list idle_slice_timer; 254 struct work_struct unplug_work; 255 256 struct cfq_queue *active_queue; 257 struct cfq_io_context *active_cic; 258 259 /* 260 * async queue for each priority case 261 */ 262 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 263 struct cfq_queue *async_idle_cfqq; 264 265 sector_t last_position; 266 267 /* 268 * tunables, see top of file 269 */ 270 unsigned int cfq_quantum; 271 unsigned int cfq_fifo_expire[2]; 272 unsigned int cfq_back_penalty; 273 unsigned int cfq_back_max; 274 unsigned int cfq_slice[2]; 275 unsigned int cfq_slice_async_rq; 276 unsigned int cfq_slice_idle; 277 unsigned int cfq_latency; 278 unsigned int cfq_group_isolation; 279 280 struct list_head cic_list; 281 282 /* 283 * Fallback dummy cfqq for extreme OOM conditions 284 */ 285 struct cfq_queue oom_cfqq; 286 287 unsigned long last_end_sync_rq; 288 289 /* List of cfq groups being managed on this device*/ 290 struct hlist_head cfqg_list; 291}; 292 293static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 294 295static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 296 enum wl_prio_t prio, 297 enum wl_type_t type, 298 struct cfq_data *cfqd) 299{ 300 if (!cfqg) 301 return NULL; 302 303 if (prio == IDLE_WORKLOAD) 304 return &cfqg->service_tree_idle; 305 306 return &cfqg->service_trees[prio][type]; 307} 308 309enum cfqq_state_flags { 310 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 311 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 312 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 313 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 314 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 315 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 316 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 317 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 318 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 319 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 320 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 321 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 322 CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */ 323}; 324 325#define CFQ_CFQQ_FNS(name) \ 326static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 327{ \ 328 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 329} \ 330static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 331{ \ 332 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 333} \ 334static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 335{ \ 336 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 337} 338 339CFQ_CFQQ_FNS(on_rr); 340CFQ_CFQQ_FNS(wait_request); 341CFQ_CFQQ_FNS(must_dispatch); 342CFQ_CFQQ_FNS(must_alloc_slice); 343CFQ_CFQQ_FNS(fifo_expire); 344CFQ_CFQQ_FNS(idle_window); 345CFQ_CFQQ_FNS(prio_changed); 346CFQ_CFQQ_FNS(slice_new); 347CFQ_CFQQ_FNS(sync); 348CFQ_CFQQ_FNS(coop); 349CFQ_CFQQ_FNS(deep); 350CFQ_CFQQ_FNS(wait_busy); 351CFQ_CFQQ_FNS(wait_busy_done); 352#undef CFQ_CFQQ_FNS 353 354#ifdef CONFIG_DEBUG_CFQ_IOSCHED 355#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 356 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 357 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 358 blkg_path(&(cfqq)->cfqg->blkg), ##args); 359 360#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 361 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 362 blkg_path(&(cfqg)->blkg), ##args); \ 363 364#else 365#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 366 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 367#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 368#endif 369#define cfq_log(cfqd, fmt, args...) \ 370 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 371 372/* Traverses through cfq group service trees */ 373#define for_each_cfqg_st(cfqg, i, j, st) \ 374 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 375 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 376 : &cfqg->service_tree_idle; \ 377 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 378 (i == IDLE_WORKLOAD && j == 0); \ 379 j++, st = i < IDLE_WORKLOAD ? \ 380 &cfqg->service_trees[i][j]: NULL) \ 381 382 383static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 384{ 385 if (cfq_class_idle(cfqq)) 386 return IDLE_WORKLOAD; 387 if (cfq_class_rt(cfqq)) 388 return RT_WORKLOAD; 389 return BE_WORKLOAD; 390} 391 392 393static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 394{ 395 if (!cfq_cfqq_sync(cfqq)) 396 return ASYNC_WORKLOAD; 397 if (!cfq_cfqq_idle_window(cfqq)) 398 return SYNC_NOIDLE_WORKLOAD; 399 return SYNC_WORKLOAD; 400} 401 402static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 403 struct cfq_data *cfqd, 404 struct cfq_group *cfqg) 405{ 406 if (wl == IDLE_WORKLOAD) 407 return cfqg->service_tree_idle.count; 408 409 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 410 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 411 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 412} 413 414static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 415 struct cfq_group *cfqg) 416{ 417 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 418 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 419} 420 421static void cfq_dispatch_insert(struct request_queue *, struct request *); 422static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 423 struct io_context *, gfp_t); 424static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 425 struct io_context *); 426 427static inline int rq_in_driver(struct cfq_data *cfqd) 428{ 429 return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; 430} 431 432static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 433 bool is_sync) 434{ 435 return cic->cfqq[is_sync]; 436} 437 438static inline void cic_set_cfqq(struct cfq_io_context *cic, 439 struct cfq_queue *cfqq, bool is_sync) 440{ 441 cic->cfqq[is_sync] = cfqq; 442} 443 444/* 445 * We regard a request as SYNC, if it's either a read or has the SYNC bit 446 * set (in which case it could also be direct WRITE). 447 */ 448static inline bool cfq_bio_sync(struct bio *bio) 449{ 450 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); 451} 452 453/* 454 * scheduler run of queue, if there are requests pending and no one in the 455 * driver that will restart queueing 456 */ 457static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 458{ 459 if (cfqd->busy_queues) { 460 cfq_log(cfqd, "schedule dispatch"); 461 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 462 } 463} 464 465static int cfq_queue_empty(struct request_queue *q) 466{ 467 struct cfq_data *cfqd = q->elevator->elevator_data; 468 469 return !cfqd->rq_queued; 470} 471 472/* 473 * Scale schedule slice based on io priority. Use the sync time slice only 474 * if a queue is marked sync and has sync io queued. A sync queue with async 475 * io only, should not get full sync slice length. 476 */ 477static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 478 unsigned short prio) 479{ 480 const int base_slice = cfqd->cfq_slice[sync]; 481 482 WARN_ON(prio >= IOPRIO_BE_NR); 483 484 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 485} 486 487static inline int 488cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 489{ 490 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 491} 492 493static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 494{ 495 u64 d = delta << CFQ_SERVICE_SHIFT; 496 497 d = d * BLKIO_WEIGHT_DEFAULT; 498 do_div(d, cfqg->weight); 499 return d; 500} 501 502static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 503{ 504 s64 delta = (s64)(vdisktime - min_vdisktime); 505 if (delta > 0) 506 min_vdisktime = vdisktime; 507 508 return min_vdisktime; 509} 510 511static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 512{ 513 s64 delta = (s64)(vdisktime - min_vdisktime); 514 if (delta < 0) 515 min_vdisktime = vdisktime; 516 517 return min_vdisktime; 518} 519 520static void update_min_vdisktime(struct cfq_rb_root *st) 521{ 522 u64 vdisktime = st->min_vdisktime; 523 struct cfq_group *cfqg; 524 525 if (st->active) { 526 cfqg = rb_entry_cfqg(st->active); 527 vdisktime = cfqg->vdisktime; 528 } 529 530 if (st->left) { 531 cfqg = rb_entry_cfqg(st->left); 532 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 533 } 534 535 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 536} 537 538/* 539 * get averaged number of queues of RT/BE priority. 540 * average is updated, with a formula that gives more weight to higher numbers, 541 * to quickly follows sudden increases and decrease slowly 542 */ 543 544static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 545 struct cfq_group *cfqg, bool rt) 546{ 547 unsigned min_q, max_q; 548 unsigned mult = cfq_hist_divisor - 1; 549 unsigned round = cfq_hist_divisor / 2; 550 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 551 552 min_q = min(cfqg->busy_queues_avg[rt], busy); 553 max_q = max(cfqg->busy_queues_avg[rt], busy); 554 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 555 cfq_hist_divisor; 556 return cfqg->busy_queues_avg[rt]; 557} 558 559static inline unsigned 560cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 561{ 562 struct cfq_rb_root *st = &cfqd->grp_service_tree; 563 564 return cfq_target_latency * cfqg->weight / st->total_weight; 565} 566 567static inline void 568cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 569{ 570 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 571 if (cfqd->cfq_latency) { 572 /* 573 * interested queues (we consider only the ones with the same 574 * priority class in the cfq group) 575 */ 576 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 577 cfq_class_rt(cfqq)); 578 unsigned sync_slice = cfqd->cfq_slice[1]; 579 unsigned expect_latency = sync_slice * iq; 580 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 581 582 if (expect_latency > group_slice) { 583 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 584 /* scale low_slice according to IO priority 585 * and sync vs async */ 586 unsigned low_slice = 587 min(slice, base_low_slice * slice / sync_slice); 588 /* the adapted slice value is scaled to fit all iqs 589 * into the target latency */ 590 slice = max(slice * group_slice / expect_latency, 591 low_slice); 592 } 593 } 594 cfqq->slice_start = jiffies; 595 cfqq->slice_end = jiffies + slice; 596 cfqq->allocated_slice = slice; 597 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 598} 599 600/* 601 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 602 * isn't valid until the first request from the dispatch is activated 603 * and the slice time set. 604 */ 605static inline bool cfq_slice_used(struct cfq_queue *cfqq) 606{ 607 if (cfq_cfqq_slice_new(cfqq)) 608 return 0; 609 if (time_before(jiffies, cfqq->slice_end)) 610 return 0; 611 612 return 1; 613} 614 615/* 616 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 617 * We choose the request that is closest to the head right now. Distance 618 * behind the head is penalized and only allowed to a certain extent. 619 */ 620static struct request * 621cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 622{ 623 sector_t s1, s2, d1 = 0, d2 = 0; 624 unsigned long back_max; 625#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 626#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 627 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 628 629 if (rq1 == NULL || rq1 == rq2) 630 return rq2; 631 if (rq2 == NULL) 632 return rq1; 633 634 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 635 return rq1; 636 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 637 return rq2; 638 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 639 return rq1; 640 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 641 return rq2; 642 643 s1 = blk_rq_pos(rq1); 644 s2 = blk_rq_pos(rq2); 645 646 /* 647 * by definition, 1KiB is 2 sectors 648 */ 649 back_max = cfqd->cfq_back_max * 2; 650 651 /* 652 * Strict one way elevator _except_ in the case where we allow 653 * short backward seeks which are biased as twice the cost of a 654 * similar forward seek. 655 */ 656 if (s1 >= last) 657 d1 = s1 - last; 658 else if (s1 + back_max >= last) 659 d1 = (last - s1) * cfqd->cfq_back_penalty; 660 else 661 wrap |= CFQ_RQ1_WRAP; 662 663 if (s2 >= last) 664 d2 = s2 - last; 665 else if (s2 + back_max >= last) 666 d2 = (last - s2) * cfqd->cfq_back_penalty; 667 else 668 wrap |= CFQ_RQ2_WRAP; 669 670 /* Found required data */ 671 672 /* 673 * By doing switch() on the bit mask "wrap" we avoid having to 674 * check two variables for all permutations: --> faster! 675 */ 676 switch (wrap) { 677 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 678 if (d1 < d2) 679 return rq1; 680 else if (d2 < d1) 681 return rq2; 682 else { 683 if (s1 >= s2) 684 return rq1; 685 else 686 return rq2; 687 } 688 689 case CFQ_RQ2_WRAP: 690 return rq1; 691 case CFQ_RQ1_WRAP: 692 return rq2; 693 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 694 default: 695 /* 696 * Since both rqs are wrapped, 697 * start with the one that's further behind head 698 * (--> only *one* back seek required), 699 * since back seek takes more time than forward. 700 */ 701 if (s1 <= s2) 702 return rq1; 703 else 704 return rq2; 705 } 706} 707 708/* 709 * The below is leftmost cache rbtree addon 710 */ 711static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 712{ 713 /* Service tree is empty */ 714 if (!root->count) 715 return NULL; 716 717 if (!root->left) 718 root->left = rb_first(&root->rb); 719 720 if (root->left) 721 return rb_entry(root->left, struct cfq_queue, rb_node); 722 723 return NULL; 724} 725 726static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 727{ 728 if (!root->left) 729 root->left = rb_first(&root->rb); 730 731 if (root->left) 732 return rb_entry_cfqg(root->left); 733 734 return NULL; 735} 736 737static void rb_erase_init(struct rb_node *n, struct rb_root *root) 738{ 739 rb_erase(n, root); 740 RB_CLEAR_NODE(n); 741} 742 743static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 744{ 745 if (root->left == n) 746 root->left = NULL; 747 rb_erase_init(n, &root->rb); 748 --root->count; 749} 750 751/* 752 * would be nice to take fifo expire time into account as well 753 */ 754static struct request * 755cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 756 struct request *last) 757{ 758 struct rb_node *rbnext = rb_next(&last->rb_node); 759 struct rb_node *rbprev = rb_prev(&last->rb_node); 760 struct request *next = NULL, *prev = NULL; 761 762 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 763 764 if (rbprev) 765 prev = rb_entry_rq(rbprev); 766 767 if (rbnext) 768 next = rb_entry_rq(rbnext); 769 else { 770 rbnext = rb_first(&cfqq->sort_list); 771 if (rbnext && rbnext != &last->rb_node) 772 next = rb_entry_rq(rbnext); 773 } 774 775 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 776} 777 778static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 779 struct cfq_queue *cfqq) 780{ 781 /* 782 * just an approximation, should be ok. 783 */ 784 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 785 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 786} 787 788static inline s64 789cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 790{ 791 return cfqg->vdisktime - st->min_vdisktime; 792} 793 794static void 795__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 796{ 797 struct rb_node **node = &st->rb.rb_node; 798 struct rb_node *parent = NULL; 799 struct cfq_group *__cfqg; 800 s64 key = cfqg_key(st, cfqg); 801 int left = 1; 802 803 while (*node != NULL) { 804 parent = *node; 805 __cfqg = rb_entry_cfqg(parent); 806 807 if (key < cfqg_key(st, __cfqg)) 808 node = &parent->rb_left; 809 else { 810 node = &parent->rb_right; 811 left = 0; 812 } 813 } 814 815 if (left) 816 st->left = &cfqg->rb_node; 817 818 rb_link_node(&cfqg->rb_node, parent, node); 819 rb_insert_color(&cfqg->rb_node, &st->rb); 820} 821 822static void 823cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 824{ 825 struct cfq_rb_root *st = &cfqd->grp_service_tree; 826 struct cfq_group *__cfqg; 827 struct rb_node *n; 828 829 cfqg->nr_cfqq++; 830 if (cfqg->on_st) 831 return; 832 833 /* 834 * Currently put the group at the end. Later implement something 835 * so that groups get lesser vtime based on their weights, so that 836 * if group does not loose all if it was not continously backlogged. 837 */ 838 n = rb_last(&st->rb); 839 if (n) { 840 __cfqg = rb_entry_cfqg(n); 841 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 842 } else 843 cfqg->vdisktime = st->min_vdisktime; 844 845 __cfq_group_service_tree_add(st, cfqg); 846 cfqg->on_st = true; 847 cfqd->nr_groups++; 848 st->total_weight += cfqg->weight; 849} 850 851static void 852cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 853{ 854 struct cfq_rb_root *st = &cfqd->grp_service_tree; 855 856 if (st->active == &cfqg->rb_node) 857 st->active = NULL; 858 859 BUG_ON(cfqg->nr_cfqq < 1); 860 cfqg->nr_cfqq--; 861 862 /* If there are other cfq queues under this group, don't delete it */ 863 if (cfqg->nr_cfqq) 864 return; 865 866 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 867 cfqg->on_st = false; 868 cfqd->nr_groups--; 869 st->total_weight -= cfqg->weight; 870 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 871 cfq_rb_erase(&cfqg->rb_node, st); 872 cfqg->saved_workload_slice = 0; 873 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 874} 875 876static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 877{ 878 unsigned int slice_used; 879 880 /* 881 * Queue got expired before even a single request completed or 882 * got expired immediately after first request completion. 883 */ 884 if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 885 /* 886 * Also charge the seek time incurred to the group, otherwise 887 * if there are mutiple queues in the group, each can dispatch 888 * a single request on seeky media and cause lots of seek time 889 * and group will never know it. 890 */ 891 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 892 1); 893 } else { 894 slice_used = jiffies - cfqq->slice_start; 895 if (slice_used > cfqq->allocated_slice) 896 slice_used = cfqq->allocated_slice; 897 } 898 899 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 900 cfqq->nr_sectors); 901 return slice_used; 902} 903 904static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 905 struct cfq_queue *cfqq) 906{ 907 struct cfq_rb_root *st = &cfqd->grp_service_tree; 908 unsigned int used_sl, charge_sl; 909 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 910 - cfqg->service_tree_idle.count; 911 912 BUG_ON(nr_sync < 0); 913 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); 914 915 if (!cfq_cfqq_sync(cfqq) && !nr_sync) 916 charge_sl = cfqq->allocated_slice; 917 918 /* Can't update vdisktime while group is on service tree */ 919 cfq_rb_erase(&cfqg->rb_node, st); 920 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); 921 __cfq_group_service_tree_add(st, cfqg); 922 923 /* This group is being expired. Save the context */ 924 if (time_after(cfqd->workload_expires, jiffies)) { 925 cfqg->saved_workload_slice = cfqd->workload_expires 926 - jiffies; 927 cfqg->saved_workload = cfqd->serving_type; 928 cfqg->saved_serving_prio = cfqd->serving_prio; 929 } else 930 cfqg->saved_workload_slice = 0; 931 932 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 933 st->min_vdisktime); 934 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 935 cfqq->nr_sectors); 936} 937 938#ifdef CONFIG_CFQ_GROUP_IOSCHED 939static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 940{ 941 if (blkg) 942 return container_of(blkg, struct cfq_group, blkg); 943 return NULL; 944} 945 946void 947cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 948{ 949 cfqg_of_blkg(blkg)->weight = weight; 950} 951 952static struct cfq_group * 953cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 954{ 955 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 956 struct cfq_group *cfqg = NULL; 957 void *key = cfqd; 958 int i, j; 959 struct cfq_rb_root *st; 960 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 961 unsigned int major, minor; 962 963 /* Do we need to take this reference */ 964 if (!blkiocg_css_tryget(blkcg)) 965 return NULL;; 966 967 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 968 if (cfqg || !create) 969 goto done; 970 971 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 972 if (!cfqg) 973 goto done; 974 975 cfqg->weight = blkcg->weight; 976 for_each_cfqg_st(cfqg, i, j, st) 977 *st = CFQ_RB_ROOT; 978 RB_CLEAR_NODE(&cfqg->rb_node); 979 980 /* 981 * Take the initial reference that will be released on destroy 982 * This can be thought of a joint reference by cgroup and 983 * elevator which will be dropped by either elevator exit 984 * or cgroup deletion path depending on who is exiting first. 985 */ 986 atomic_set(&cfqg->ref, 1); 987 988 /* Add group onto cgroup list */ 989 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 990 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 991 MKDEV(major, minor)); 992 993 /* Add group on cfqd list */ 994 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 995 996done: 997 blkiocg_css_put(blkcg); 998 return cfqg; 999} 1000 1001/* 1002 * Search for the cfq group current task belongs to. If create = 1, then also 1003 * create the cfq group if it does not exist. request_queue lock must be held. 1004 */ 1005static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1006{ 1007 struct cgroup *cgroup; 1008 struct cfq_group *cfqg = NULL; 1009 1010 rcu_read_lock(); 1011 cgroup = task_cgroup(current, blkio_subsys_id); 1012 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 1013 if (!cfqg && create) 1014 cfqg = &cfqd->root_group; 1015 rcu_read_unlock(); 1016 return cfqg; 1017} 1018 1019static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1020{ 1021 /* Currently, all async queues are mapped to root group */ 1022 if (!cfq_cfqq_sync(cfqq)) 1023 cfqg = &cfqq->cfqd->root_group; 1024 1025 cfqq->cfqg = cfqg; 1026 /* cfqq reference on cfqg */ 1027 atomic_inc(&cfqq->cfqg->ref); 1028} 1029 1030static void cfq_put_cfqg(struct cfq_group *cfqg) 1031{ 1032 struct cfq_rb_root *st; 1033 int i, j; 1034 1035 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1036 if (!atomic_dec_and_test(&cfqg->ref)) 1037 return; 1038 for_each_cfqg_st(cfqg, i, j, st) 1039 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1040 kfree(cfqg); 1041} 1042 1043static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 1044{ 1045 /* Something wrong if we are trying to remove same group twice */ 1046 BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 1047 1048 hlist_del_init(&cfqg->cfqd_node); 1049 1050 /* 1051 * Put the reference taken at the time of creation so that when all 1052 * queues are gone, group can be destroyed. 1053 */ 1054 cfq_put_cfqg(cfqg); 1055} 1056 1057static void cfq_release_cfq_groups(struct cfq_data *cfqd) 1058{ 1059 struct hlist_node *pos, *n; 1060 struct cfq_group *cfqg; 1061 1062 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 1063 /* 1064 * If cgroup removal path got to blk_group first and removed 1065 * it from cgroup list, then it will take care of destroying 1066 * cfqg also. 1067 */ 1068 if (!blkiocg_del_blkio_group(&cfqg->blkg)) 1069 cfq_destroy_cfqg(cfqd, cfqg); 1070 } 1071} 1072 1073/* 1074 * Blk cgroup controller notification saying that blkio_group object is being 1075 * delinked as associated cgroup object is going away. That also means that 1076 * no new IO will come in this group. So get rid of this group as soon as 1077 * any pending IO in the group is finished. 1078 * 1079 * This function is called under rcu_read_lock(). key is the rcu protected 1080 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu 1081 * read lock. 1082 * 1083 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means 1084 * it should not be NULL as even if elevator was exiting, cgroup deltion 1085 * path got to it first. 1086 */ 1087void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1088{ 1089 unsigned long flags; 1090 struct cfq_data *cfqd = key; 1091 1092 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1093 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1094 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1095} 1096 1097#else /* GROUP_IOSCHED */ 1098static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1099{ 1100 return &cfqd->root_group; 1101} 1102static inline void 1103cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1104 cfqq->cfqg = cfqg; 1105} 1106 1107static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} 1108static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} 1109 1110#endif /* GROUP_IOSCHED */ 1111 1112/* 1113 * The cfqd->service_trees holds all pending cfq_queue's that have 1114 * requests waiting to be processed. It is sorted in the order that 1115 * we will service the queues. 1116 */ 1117static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1118 bool add_front) 1119{ 1120 struct rb_node **p, *parent; 1121 struct cfq_queue *__cfqq; 1122 unsigned long rb_key; 1123 struct cfq_rb_root *service_tree; 1124 int left; 1125 int new_cfqq = 1; 1126 int group_changed = 0; 1127 1128#ifdef CONFIG_CFQ_GROUP_IOSCHED 1129 if (!cfqd->cfq_group_isolation 1130 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD 1131 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { 1132 /* Move this cfq to root group */ 1133 cfq_log_cfqq(cfqd, cfqq, "moving to root group"); 1134 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1135 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1136 cfqq->orig_cfqg = cfqq->cfqg; 1137 cfqq->cfqg = &cfqd->root_group; 1138 atomic_inc(&cfqd->root_group.ref); 1139 group_changed = 1; 1140 } else if (!cfqd->cfq_group_isolation 1141 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 1142 /* cfqq is sequential now needs to go to its original group */ 1143 BUG_ON(cfqq->cfqg != &cfqd->root_group); 1144 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1145 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1146 cfq_put_cfqg(cfqq->cfqg); 1147 cfqq->cfqg = cfqq->orig_cfqg; 1148 cfqq->orig_cfqg = NULL; 1149 group_changed = 1; 1150 cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); 1151 } 1152#endif 1153 1154 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1155 cfqq_type(cfqq), cfqd); 1156 if (cfq_class_idle(cfqq)) { 1157 rb_key = CFQ_IDLE_DELAY; 1158 parent = rb_last(&service_tree->rb); 1159 if (parent && parent != &cfqq->rb_node) { 1160 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1161 rb_key += __cfqq->rb_key; 1162 } else 1163 rb_key += jiffies; 1164 } else if (!add_front) { 1165 /* 1166 * Get our rb key offset. Subtract any residual slice 1167 * value carried from last service. A negative resid 1168 * count indicates slice overrun, and this should position 1169 * the next service time further away in the tree. 1170 */ 1171 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 1172 rb_key -= cfqq->slice_resid; 1173 cfqq->slice_resid = 0; 1174 } else { 1175 rb_key = -HZ; 1176 __cfqq = cfq_rb_first(service_tree); 1177 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1178 } 1179 1180 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1181 new_cfqq = 0; 1182 /* 1183 * same position, nothing more to do 1184 */ 1185 if (rb_key == cfqq->rb_key && 1186 cfqq->service_tree == service_tree) 1187 return; 1188 1189 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1190 cfqq->service_tree = NULL; 1191 } 1192 1193 left = 1; 1194 parent = NULL; 1195 cfqq->service_tree = service_tree; 1196 p = &service_tree->rb.rb_node; 1197 while (*p) { 1198 struct rb_node **n; 1199 1200 parent = *p; 1201 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1202 1203 /* 1204 * sort by key, that represents service time. 1205 */ 1206 if (time_before(rb_key, __cfqq->rb_key)) 1207 n = &(*p)->rb_left; 1208 else { 1209 n = &(*p)->rb_right; 1210 left = 0; 1211 } 1212 1213 p = n; 1214 } 1215 1216 if (left) 1217 service_tree->left = &cfqq->rb_node; 1218 1219 cfqq->rb_key = rb_key; 1220 rb_link_node(&cfqq->rb_node, parent, p); 1221 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1222 service_tree->count++; 1223 if ((add_front || !new_cfqq) && !group_changed) 1224 return; 1225 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1226} 1227 1228static struct cfq_queue * 1229cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 1230 sector_t sector, struct rb_node **ret_parent, 1231 struct rb_node ***rb_link) 1232{ 1233 struct rb_node **p, *parent; 1234 struct cfq_queue *cfqq = NULL; 1235 1236 parent = NULL; 1237 p = &root->rb_node; 1238 while (*p) { 1239 struct rb_node **n; 1240 1241 parent = *p; 1242 cfqq = rb_entry(parent, struct cfq_queue, p_node); 1243 1244 /* 1245 * Sort strictly based on sector. Smallest to the left, 1246 * largest to the right. 1247 */ 1248 if (sector > blk_rq_pos(cfqq->next_rq)) 1249 n = &(*p)->rb_right; 1250 else if (sector < blk_rq_pos(cfqq->next_rq)) 1251 n = &(*p)->rb_left; 1252 else 1253 break; 1254 p = n; 1255 cfqq = NULL; 1256 } 1257 1258 *ret_parent = parent; 1259 if (rb_link) 1260 *rb_link = p; 1261 return cfqq; 1262} 1263 1264static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1265{ 1266 struct rb_node **p, *parent; 1267 struct cfq_queue *__cfqq; 1268 1269 if (cfqq->p_root) { 1270 rb_erase(&cfqq->p_node, cfqq->p_root); 1271 cfqq->p_root = NULL; 1272 } 1273 1274 if (cfq_class_idle(cfqq)) 1275 return; 1276 if (!cfqq->next_rq) 1277 return; 1278 1279 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 1280 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 1281 blk_rq_pos(cfqq->next_rq), &parent, &p); 1282 if (!__cfqq) { 1283 rb_link_node(&cfqq->p_node, parent, p); 1284 rb_insert_color(&cfqq->p_node, cfqq->p_root); 1285 } else 1286 cfqq->p_root = NULL; 1287} 1288 1289/* 1290 * Update cfqq's position in the service tree. 1291 */ 1292static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1293{ 1294 /* 1295 * Resorting requires the cfqq to be on the RR list already. 1296 */ 1297 if (cfq_cfqq_on_rr(cfqq)) { 1298 cfq_service_tree_add(cfqd, cfqq, 0); 1299 cfq_prio_tree_add(cfqd, cfqq); 1300 } 1301} 1302 1303/* 1304 * add to busy list of queues for service, trying to be fair in ordering 1305 * the pending list according to last request service 1306 */ 1307static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1308{ 1309 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 1310 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1311 cfq_mark_cfqq_on_rr(cfqq); 1312 cfqd->busy_queues++; 1313 1314 cfq_resort_rr_list(cfqd, cfqq); 1315} 1316 1317/* 1318 * Called when the cfqq no longer has requests pending, remove it from 1319 * the service tree. 1320 */ 1321static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1322{ 1323 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 1324 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1325 cfq_clear_cfqq_on_rr(cfqq); 1326 1327 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1328 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1329 cfqq->service_tree = NULL; 1330 } 1331 if (cfqq->p_root) { 1332 rb_erase(&cfqq->p_node, cfqq->p_root); 1333 cfqq->p_root = NULL; 1334 } 1335 1336 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1337 BUG_ON(!cfqd->busy_queues); 1338 cfqd->busy_queues--; 1339} 1340 1341/* 1342 * rb tree support functions 1343 */ 1344static void cfq_del_rq_rb(struct request *rq) 1345{ 1346 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1347 const int sync = rq_is_sync(rq); 1348 1349 BUG_ON(!cfqq->queued[sync]); 1350 cfqq->queued[sync]--; 1351 1352 elv_rb_del(&cfqq->sort_list, rq); 1353 1354 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 1355 /* 1356 * Queue will be deleted from service tree when we actually 1357 * expire it later. Right now just remove it from prio tree 1358 * as it is empty. 1359 */ 1360 if (cfqq->p_root) { 1361 rb_erase(&cfqq->p_node, cfqq->p_root); 1362 cfqq->p_root = NULL; 1363 } 1364 } 1365} 1366 1367static void cfq_add_rq_rb(struct request *rq) 1368{ 1369 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1370 struct cfq_data *cfqd = cfqq->cfqd; 1371 struct request *__alias, *prev; 1372 1373 cfqq->queued[rq_is_sync(rq)]++; 1374 1375 /* 1376 * looks a little odd, but the first insert might return an alias. 1377 * if that happens, put the alias on the dispatch list 1378 */ 1379 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 1380 cfq_dispatch_insert(cfqd->queue, __alias); 1381 1382 if (!cfq_cfqq_on_rr(cfqq)) 1383 cfq_add_cfqq_rr(cfqd, cfqq); 1384 1385 /* 1386 * check if this request is a better next-serve candidate 1387 */ 1388 prev = cfqq->next_rq; 1389 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 1390 1391 /* 1392 * adjust priority tree position, if ->next_rq changes 1393 */ 1394 if (prev != cfqq->next_rq) 1395 cfq_prio_tree_add(cfqd, cfqq); 1396 1397 BUG_ON(!cfqq->next_rq); 1398} 1399 1400static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 1401{ 1402 elv_rb_del(&cfqq->sort_list, rq); 1403 cfqq->queued[rq_is_sync(rq)]--; 1404 cfq_add_rq_rb(rq); 1405} 1406 1407static struct request * 1408cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1409{ 1410 struct task_struct *tsk = current; 1411 struct cfq_io_context *cic; 1412 struct cfq_queue *cfqq; 1413 1414 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1415 if (!cic) 1416 return NULL; 1417 1418 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1419 if (cfqq) { 1420 sector_t sector = bio->bi_sector + bio_sectors(bio); 1421 1422 return elv_rb_find(&cfqq->sort_list, sector); 1423 } 1424 1425 return NULL; 1426} 1427 1428static void cfq_activate_request(struct request_queue *q, struct request *rq) 1429{ 1430 struct cfq_data *cfqd = q->elevator->elevator_data; 1431 1432 cfqd->rq_in_driver[rq_is_sync(rq)]++; 1433 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 1434 rq_in_driver(cfqd)); 1435 1436 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 1437} 1438 1439static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 1440{ 1441 struct cfq_data *cfqd = q->elevator->elevator_data; 1442 const int sync = rq_is_sync(rq); 1443 1444 WARN_ON(!cfqd->rq_in_driver[sync]); 1445 cfqd->rq_in_driver[sync]--; 1446 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 1447 rq_in_driver(cfqd)); 1448} 1449 1450static void cfq_remove_request(struct request *rq) 1451{ 1452 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1453 1454 if (cfqq->next_rq == rq) 1455 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 1456 1457 list_del_init(&rq->queuelist); 1458 cfq_del_rq_rb(rq); 1459 1460 cfqq->cfqd->rq_queued--; 1461 if (rq_is_meta(rq)) { 1462 WARN_ON(!cfqq->meta_pending); 1463 cfqq->meta_pending--; 1464 } 1465} 1466 1467static int cfq_merge(struct request_queue *q, struct request **req, 1468 struct bio *bio) 1469{ 1470 struct cfq_data *cfqd = q->elevator->elevator_data; 1471 struct request *__rq; 1472 1473 __rq = cfq_find_rq_fmerge(cfqd, bio); 1474 if (__rq && elv_rq_merge_ok(__rq, bio)) { 1475 *req = __rq; 1476 return ELEVATOR_FRONT_MERGE; 1477 } 1478 1479 return ELEVATOR_NO_MERGE; 1480} 1481 1482static void cfq_merged_request(struct request_queue *q, struct request *req, 1483 int type) 1484{ 1485 if (type == ELEVATOR_FRONT_MERGE) { 1486 struct cfq_queue *cfqq = RQ_CFQQ(req); 1487 1488 cfq_reposition_rq_rb(cfqq, req); 1489 } 1490} 1491 1492static void 1493cfq_merged_requests(struct request_queue *q, struct request *rq, 1494 struct request *next) 1495{ 1496 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1497 /* 1498 * reposition in fifo if next is older than rq 1499 */ 1500 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1501 time_before(rq_fifo_time(next), rq_fifo_time(rq))) { 1502 list_move(&rq->queuelist, &next->queuelist); 1503 rq_set_fifo_time(rq, rq_fifo_time(next)); 1504 } 1505 1506 if (cfqq->next_rq == next) 1507 cfqq->next_rq = rq; 1508 cfq_remove_request(next); 1509} 1510 1511static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1512 struct bio *bio) 1513{ 1514 struct cfq_data *cfqd = q->elevator->elevator_data; 1515 struct cfq_io_context *cic; 1516 struct cfq_queue *cfqq; 1517 1518 /* Deny merge if bio and rq don't belong to same cfq group */ 1519 if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) 1520 return false; 1521 /* 1522 * Disallow merge of a sync bio into an async request. 1523 */ 1524 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 1525 return false; 1526 1527 /* 1528 * Lookup the cfqq that this bio will be queued with. Allow 1529 * merge only if rq is queued there. 1530 */ 1531 cic = cfq_cic_lookup(cfqd, current->io_context); 1532 if (!cic) 1533 return false; 1534 1535 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1536 return cfqq == RQ_CFQQ(rq); 1537} 1538 1539static void __cfq_set_active_queue(struct cfq_data *cfqd, 1540 struct cfq_queue *cfqq) 1541{ 1542 if (cfqq) { 1543 cfq_log_cfqq(cfqd, cfqq, "set_active"); 1544 cfqq->slice_start = 0; 1545 cfqq->dispatch_start = jiffies; 1546 cfqq->allocated_slice = 0; 1547 cfqq->slice_end = 0; 1548 cfqq->slice_dispatch = 0; 1549 cfqq->nr_sectors = 0; 1550 1551 cfq_clear_cfqq_wait_request(cfqq); 1552 cfq_clear_cfqq_must_dispatch(cfqq); 1553 cfq_clear_cfqq_must_alloc_slice(cfqq); 1554 cfq_clear_cfqq_fifo_expire(cfqq); 1555 cfq_mark_cfqq_slice_new(cfqq); 1556 1557 del_timer(&cfqd->idle_slice_timer); 1558 } 1559 1560 cfqd->active_queue = cfqq; 1561} 1562 1563/* 1564 * current cfqq expired its slice (or was too idle), select new one 1565 */ 1566static void 1567__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1568 bool timed_out) 1569{ 1570 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1571 1572 if (cfq_cfqq_wait_request(cfqq)) 1573 del_timer(&cfqd->idle_slice_timer); 1574 1575 cfq_clear_cfqq_wait_request(cfqq); 1576 cfq_clear_cfqq_wait_busy(cfqq); 1577 cfq_clear_cfqq_wait_busy_done(cfqq); 1578 1579 /* 1580 * store what was left of this slice, if the queue idled/timed out 1581 */ 1582 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1583 cfqq->slice_resid = cfqq->slice_end - jiffies; 1584 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1585 } 1586 1587 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 1588 1589 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1590 cfq_del_cfqq_rr(cfqd, cfqq); 1591 1592 cfq_resort_rr_list(cfqd, cfqq); 1593 1594 if (cfqq == cfqd->active_queue) 1595 cfqd->active_queue = NULL; 1596 1597 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) 1598 cfqd->grp_service_tree.active = NULL; 1599 1600 if (cfqd->active_cic) { 1601 put_io_context(cfqd->active_cic->ioc); 1602 cfqd->active_cic = NULL; 1603 } 1604} 1605 1606static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 1607{ 1608 struct cfq_queue *cfqq = cfqd->active_queue; 1609 1610 if (cfqq) 1611 __cfq_slice_expired(cfqd, cfqq, timed_out); 1612} 1613 1614/* 1615 * Get next queue for service. Unless we have a queue preemption, 1616 * we'll simply select the first cfqq in the service tree. 1617 */ 1618static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1619{ 1620 struct cfq_rb_root *service_tree = 1621 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 1622 cfqd->serving_type, cfqd); 1623 1624 if (!cfqd->rq_queued) 1625 return NULL; 1626 1627 /* There is nothing to dispatch */ 1628 if (!service_tree) 1629 return NULL; 1630 if (RB_EMPTY_ROOT(&service_tree->rb)) 1631 return NULL; 1632 return cfq_rb_first(service_tree); 1633} 1634 1635static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 1636{ 1637 struct cfq_group *cfqg; 1638 struct cfq_queue *cfqq; 1639 int i, j; 1640 struct cfq_rb_root *st; 1641 1642 if (!cfqd->rq_queued) 1643 return NULL; 1644 1645 cfqg = cfq_get_next_cfqg(cfqd); 1646 if (!cfqg) 1647 return NULL; 1648 1649 for_each_cfqg_st(cfqg, i, j, st) 1650 if ((cfqq = cfq_rb_first(st)) != NULL) 1651 return cfqq; 1652 return NULL; 1653} 1654 1655/* 1656 * Get and set a new active queue for service. 1657 */ 1658static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1659 struct cfq_queue *cfqq) 1660{ 1661 if (!cfqq) 1662 cfqq = cfq_get_next_queue(cfqd); 1663 1664 __cfq_set_active_queue(cfqd, cfqq); 1665 return cfqq; 1666} 1667 1668static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 1669 struct request *rq) 1670{ 1671 if (blk_rq_pos(rq) >= cfqd->last_position) 1672 return blk_rq_pos(rq) - cfqd->last_position; 1673 else 1674 return cfqd->last_position - blk_rq_pos(rq); 1675} 1676 1677#define CFQQ_SEEK_THR 8 * 1024 1678#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) 1679 1680static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1681 struct request *rq) 1682{ 1683 sector_t sdist = cfqq->seek_mean; 1684 1685 if (!sample_valid(cfqq->seek_samples)) 1686 sdist = CFQQ_SEEK_THR; 1687 1688 return cfq_dist_from_last(cfqd, rq) <= sdist; 1689} 1690 1691static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 1692 struct cfq_queue *cur_cfqq) 1693{ 1694 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 1695 struct rb_node *parent, *node; 1696 struct cfq_queue *__cfqq; 1697 sector_t sector = cfqd->last_position; 1698 1699 if (RB_EMPTY_ROOT(root)) 1700 return NULL; 1701 1702 /* 1703 * First, if we find a request starting at the end of the last 1704 * request, choose it. 1705 */ 1706 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 1707 if (__cfqq) 1708 return __cfqq; 1709 1710 /* 1711 * If the exact sector wasn't found, the parent of the NULL leaf 1712 * will contain the closest sector. 1713 */ 1714 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1715 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1716 return __cfqq; 1717 1718 if (blk_rq_pos(__cfqq->next_rq) < sector) 1719 node = rb_next(&__cfqq->p_node); 1720 else 1721 node = rb_prev(&__cfqq->p_node); 1722 if (!node) 1723 return NULL; 1724 1725 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1726 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1727 return __cfqq; 1728 1729 return NULL; 1730} 1731 1732/* 1733 * cfqd - obvious 1734 * cur_cfqq - passed in so that we don't decide that the current queue is 1735 * closely cooperating with itself. 1736 * 1737 * So, basically we're assuming that that cur_cfqq has dispatched at least 1738 * one request, and that cfqd->last_position reflects a position on the disk 1739 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 1740 * assumption. 1741 */ 1742static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1743 struct cfq_queue *cur_cfqq) 1744{ 1745 struct cfq_queue *cfqq; 1746 1747 if (!cfq_cfqq_sync(cur_cfqq)) 1748 return NULL; 1749 if (CFQQ_SEEKY(cur_cfqq)) 1750 return NULL; 1751 1752 /* 1753 * We should notice if some of the queues are cooperating, eg 1754 * working closely on the same area of the disk. In that case, 1755 * we can group them together and don't waste time idling. 1756 */ 1757 cfqq = cfqq_close(cfqd, cur_cfqq); 1758 if (!cfqq) 1759 return NULL; 1760 1761 /* If new queue belongs to different cfq_group, don't choose it */ 1762 if (cur_cfqq->cfqg != cfqq->cfqg) 1763 return NULL; 1764 1765 /* 1766 * It only makes sense to merge sync queues. 1767 */ 1768 if (!cfq_cfqq_sync(cfqq)) 1769 return NULL; 1770 if (CFQQ_SEEKY(cfqq)) 1771 return NULL; 1772 1773 /* 1774 * Do not merge queues of different priority classes 1775 */ 1776 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 1777 return NULL; 1778 1779 return cfqq; 1780} 1781 1782/* 1783 * Determine whether we should enforce idle window for this queue. 1784 */ 1785 1786static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1787{ 1788 enum wl_prio_t prio = cfqq_prio(cfqq); 1789 struct cfq_rb_root *service_tree = cfqq->service_tree; 1790 1791 BUG_ON(!service_tree); 1792 BUG_ON(!service_tree->count); 1793 1794 /* We never do for idle class queues. */ 1795 if (prio == IDLE_WORKLOAD) 1796 return false; 1797 1798 /* We do for queues that were marked with idle window flag. */ 1799 if (cfq_cfqq_idle_window(cfqq) && 1800 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 1801 return true; 1802 1803 /* 1804 * Otherwise, we do only if they are the last ones 1805 * in their service tree. 1806 */ 1807 return service_tree->count == 1; 1808} 1809 1810static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1811{ 1812 struct cfq_queue *cfqq = cfqd->active_queue; 1813 struct cfq_io_context *cic; 1814 unsigned long sl; 1815 1816 /* 1817 * SSD device without seek penalty, disable idling. But only do so 1818 * for devices that support queuing, otherwise we still have a problem 1819 * with sync vs async workloads. 1820 */ 1821 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1822 return; 1823 1824 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1825 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1826 1827 /* 1828 * idle is disabled, either manually or by past process history 1829 */ 1830 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) 1831 return; 1832 1833 /* 1834 * still active requests from this queue, don't idle 1835 */ 1836 if (cfqq->dispatched) 1837 return; 1838 1839 /* 1840 * task has exited, don't wait 1841 */ 1842 cic = cfqd->active_cic; 1843 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1844 return; 1845 1846 /* 1847 * If our average think time is larger than the remaining time 1848 * slice, then don't idle. This avoids overrunning the allotted 1849 * time slice. 1850 */ 1851 if (sample_valid(cic->ttime_samples) && 1852 (cfqq->slice_end - jiffies < cic->ttime_mean)) 1853 return; 1854 1855 cfq_mark_cfqq_wait_request(cfqq); 1856 1857 sl = cfqd->cfq_slice_idle; 1858 1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1861} 1862 1863/* 1864 * Move request from internal lists to the request queue dispatch list. 1865 */ 1866static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1867{ 1868 struct cfq_data *cfqd = q->elevator->elevator_data; 1869 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1870 1871 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1872 1873 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1874 cfq_remove_request(rq); 1875 cfqq->dispatched++; 1876 elv_dispatch_sort(q, rq); 1877 1878 if (cfq_cfqq_sync(cfqq)) 1879 cfqd->sync_flight++; 1880 cfqq->nr_sectors += blk_rq_sectors(rq); 1881} 1882 1883/* 1884 * return expired entry, or NULL to just start from scratch in rbtree 1885 */ 1886static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1887{ 1888 struct request *rq = NULL; 1889 1890 if (cfq_cfqq_fifo_expire(cfqq)) 1891 return NULL; 1892 1893 cfq_mark_cfqq_fifo_expire(cfqq); 1894 1895 if (list_empty(&cfqq->fifo)) 1896 return NULL; 1897 1898 rq = rq_entry_fifo(cfqq->fifo.next); 1899 if (time_before(jiffies, rq_fifo_time(rq))) 1900 rq = NULL; 1901 1902 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 1903 return rq; 1904} 1905 1906static inline int 1907cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1908{ 1909 const int base_rq = cfqd->cfq_slice_async_rq; 1910 1911 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1912 1913 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1914} 1915 1916/* 1917 * Must be called with the queue_lock held. 1918 */ 1919static int cfqq_process_refs(struct cfq_queue *cfqq) 1920{ 1921 int process_refs, io_refs; 1922 1923 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 1924 process_refs = atomic_read(&cfqq->ref) - io_refs; 1925 BUG_ON(process_refs < 0); 1926 return process_refs; 1927} 1928 1929static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 1930{ 1931 int process_refs, new_process_refs; 1932 struct cfq_queue *__cfqq; 1933 1934 /* Avoid a circular list and skip interim queue merges */ 1935 while ((__cfqq = new_cfqq->new_cfqq)) { 1936 if (__cfqq == cfqq) 1937 return; 1938 new_cfqq = __cfqq; 1939 } 1940 1941 process_refs = cfqq_process_refs(cfqq); 1942 /* 1943 * If the process for the cfqq has gone away, there is no 1944 * sense in merging the queues. 1945 */ 1946 if (process_refs == 0) 1947 return; 1948 1949 /* 1950 * Merge in the direction of the lesser amount of work. 1951 */ 1952 new_process_refs = cfqq_process_refs(new_cfqq); 1953 if (new_process_refs >= process_refs) { 1954 cfqq->new_cfqq = new_cfqq; 1955 atomic_add(process_refs, &new_cfqq->ref); 1956 } else { 1957 new_cfqq->new_cfqq = cfqq; 1958 atomic_add(new_process_refs, &cfqq->ref); 1959 } 1960} 1961 1962static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 1963 struct cfq_group *cfqg, enum wl_prio_t prio, 1964 bool prio_changed) 1965{ 1966 struct cfq_queue *queue; 1967 int i; 1968 bool key_valid = false; 1969 unsigned long lowest_key = 0; 1970 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 1971 1972 if (prio_changed) { 1973 /* 1974 * When priorities switched, we prefer starting 1975 * from SYNC_NOIDLE (first choice), or just SYNC 1976 * over ASYNC 1977 */ 1978 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) 1979 return cur_best; 1980 cur_best = SYNC_WORKLOAD; 1981 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) 1982 return cur_best; 1983 1984 return ASYNC_WORKLOAD; 1985 } 1986 1987 for (i = 0; i < 3; ++i) { 1988 /* otherwise, select the one with lowest rb_key */ 1989 queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); 1990 if (queue && 1991 (!key_valid || time_before(queue->rb_key, lowest_key))) { 1992 lowest_key = queue->rb_key; 1993 cur_best = i; 1994 key_valid = true; 1995 } 1996 } 1997 1998 return cur_best; 1999} 2000 2001static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2002{ 2003 enum wl_prio_t previous_prio = cfqd->serving_prio; 2004 bool prio_changed; 2005 unsigned slice; 2006 unsigned count; 2007 struct cfq_rb_root *st; 2008 unsigned group_slice; 2009 2010 if (!cfqg) { 2011 cfqd->serving_prio = IDLE_WORKLOAD; 2012 cfqd->workload_expires = jiffies + 1; 2013 return; 2014 } 2015 2016 /* Choose next priority. RT > BE > IDLE */ 2017 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2018 cfqd->serving_prio = RT_WORKLOAD; 2019 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2020 cfqd->serving_prio = BE_WORKLOAD; 2021 else { 2022 cfqd->serving_prio = IDLE_WORKLOAD; 2023 cfqd->workload_expires = jiffies + 1; 2024 return; 2025 } 2026 2027 /* 2028 * For RT and BE, we have to choose also the type 2029 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2030 * expiration time 2031 */ 2032 prio_changed = (cfqd->serving_prio != previous_prio); 2033 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, 2034 cfqd); 2035 count = st->count; 2036 2037 /* 2038 * If priority didn't change, check workload expiration, 2039 * and that we still have other queues ready 2040 */ 2041 if (!prio_changed && count && 2042 !time_after(jiffies, cfqd->workload_expires)) 2043 return; 2044 2045 /* otherwise select new workload type */ 2046 cfqd->serving_type = 2047 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); 2048 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, 2049 cfqd); 2050 count = st->count; 2051 2052 /* 2053 * the workload slice is computed as a fraction of target latency 2054 * proportional to the number of queues in that workload, over 2055 * all the queues in the same priority class 2056 */ 2057 group_slice = cfq_group_slice(cfqd, cfqg); 2058 2059 slice = group_slice * count / 2060 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2061 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2062 2063 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2064 unsigned int tmp; 2065 2066 /* 2067 * Async queues are currently system wide. Just taking 2068 * proportion of queues with-in same group will lead to higher 2069 * async ratio system wide as generally root group is going 2070 * to have higher weight. A more accurate thing would be to 2071 * calculate system wide asnc/sync ratio. 2072 */ 2073 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 2074 tmp = tmp/cfqd->busy_queues; 2075 slice = min_t(unsigned, slice, tmp); 2076 2077 /* async workload slice is scaled down according to 2078 * the sync/async slice ratio. */ 2079 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 2080 } else 2081 /* sync workload slice is at least 2 * cfq_slice_idle */ 2082 slice = max(slice, 2 * cfqd->cfq_slice_idle); 2083 2084 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2085 cfqd->workload_expires = jiffies + slice; 2086 cfqd->noidle_tree_requires_idle = false; 2087} 2088 2089static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2090{ 2091 struct cfq_rb_root *st = &cfqd->grp_service_tree; 2092 struct cfq_group *cfqg; 2093 2094 if (RB_EMPTY_ROOT(&st->rb)) 2095 return NULL; 2096 cfqg = cfq_rb_first_group(st); 2097 st->active = &cfqg->rb_node; 2098 update_min_vdisktime(st); 2099 return cfqg; 2100} 2101 2102static void cfq_choose_cfqg(struct cfq_data *cfqd) 2103{ 2104 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 2105 2106 cfqd->serving_group = cfqg; 2107 2108 /* Restore the workload type data */ 2109 if (cfqg->saved_workload_slice) { 2110 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 2111 cfqd->serving_type = cfqg->saved_workload; 2112 cfqd->serving_prio = cfqg->saved_serving_prio; 2113 } 2114 choose_service_tree(cfqd, cfqg); 2115} 2116 2117/* 2118 * Select a queue for service. If we have a current active queue, 2119 * check whether to continue servicing it, or retrieve and set a new one. 2120 */ 2121static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 2122{ 2123 struct cfq_queue *cfqq, *new_cfqq = NULL; 2124 2125 cfqq = cfqd->active_queue; 2126 if (!cfqq) 2127 goto new_queue; 2128 2129 if (!cfqd->rq_queued) 2130 return NULL; 2131 /* 2132 * The active queue has run out of time, expire it and select new. 2133 */ 2134 if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) 2135 && !cfq_cfqq_must_dispatch(cfqq)) 2136 goto expire; 2137 2138 /* 2139 * The active queue has requests and isn't expired, allow it to 2140 * dispatch. 2141 */ 2142 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 2143 goto keep_queue; 2144 2145 /* 2146 * If another queue has a request waiting within our mean seek 2147 * distance, let it run. The expire code will check for close 2148 * cooperators and put the close queue at the front of the service 2149 * tree. If possible, merge the expiring queue with the new cfqq. 2150 */ 2151 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 2152 if (new_cfqq) { 2153 if (!cfqq->new_cfqq) 2154 cfq_setup_merge(cfqq, new_cfqq); 2155 goto expire; 2156 } 2157 2158 /* 2159 * No requests pending. If the active queue still has requests in 2160 * flight or is idling for a new request, allow either of these 2161 * conditions to happen (or time out) before selecting a new queue. 2162 */ 2163 if (timer_pending(&cfqd->idle_slice_timer) || 2164 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { 2165 cfqq = NULL; 2166 goto keep_queue; 2167 } 2168 2169expire: 2170 cfq_slice_expired(cfqd, 0); 2171new_queue: 2172 /* 2173 * Current queue expired. Check if we have to switch to a new 2174 * service tree 2175 */ 2176 if (!new_cfqq) 2177 cfq_choose_cfqg(cfqd); 2178 2179 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2180keep_queue: 2181 return cfqq; 2182} 2183 2184static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 2185{ 2186 int dispatched = 0; 2187 2188 while (cfqq->next_rq) { 2189 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 2190 dispatched++; 2191 } 2192 2193 BUG_ON(!list_empty(&cfqq->fifo)); 2194 2195 /* By default cfqq is not expired if it is empty. Do it explicitly */ 2196 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2197 return dispatched; 2198} 2199 2200/* 2201 * Drain our current requests. Used for barriers and when switching 2202 * io schedulers on-the-fly. 2203 */ 2204static int cfq_forced_dispatch(struct cfq_data *cfqd) 2205{ 2206 struct cfq_queue *cfqq; 2207 int dispatched = 0; 2208 2209 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) 2210 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2211 2212 cfq_slice_expired(cfqd, 0); 2213 BUG_ON(cfqd->busy_queues); 2214 2215 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2216 return dispatched; 2217} 2218 2219static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2220{ 2221 unsigned int max_dispatch; 2222 2223 /* 2224 * Drain async requests before we start sync IO 2225 */ 2226 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 2227 return false; 2228 2229 /* 2230 * If this is an async queue and we have sync IO in flight, let it wait 2231 */ 2232 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 2233 return false; 2234 2235 max_dispatch = cfqd->cfq_quantum; 2236 if (cfq_class_idle(cfqq)) 2237 max_dispatch = 1; 2238 2239 /* 2240 * Does this cfqq already have too much IO in flight? 2241 */ 2242 if (cfqq->dispatched >= max_dispatch) { 2243 /* 2244 * idle queue must always only have a single IO in flight 2245 */ 2246 if (cfq_class_idle(cfqq)) 2247 return false; 2248 2249 /* 2250 * We have other queues, don't allow more IO from this one 2251 */ 2252 if (cfqd->busy_queues > 1) 2253 return false; 2254 2255 /* 2256 * Sole queue user, no limit 2257 */ 2258 max_dispatch = -1; 2259 } 2260 2261 /* 2262 * Async queues must wait a bit before being allowed dispatch. 2263 * We also ramp up the dispatch depth gradually for async IO, 2264 * based on the last sync IO we serviced 2265 */ 2266 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2267 unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; 2268 unsigned int depth; 2269 2270 depth = last_sync / cfqd->cfq_slice[1]; 2271 if (!depth && !cfqq->dispatched) 2272 depth = 1; 2273 if (depth < max_dispatch) 2274 max_dispatch = depth; 2275 } 2276 2277 /* 2278 * If we're below the current max, allow a dispatch 2279 */ 2280 return cfqq->dispatched < max_dispatch; 2281} 2282 2283/* 2284 * Dispatch a request from cfqq, moving them to the request queue 2285 * dispatch list. 2286 */ 2287static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2288{ 2289 struct request *rq; 2290 2291 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 2292 2293 if (!cfq_may_dispatch(cfqd, cfqq)) 2294 return false; 2295 2296 /* 2297 * follow expired path, else get first next available 2298 */ 2299 rq = cfq_check_fifo(cfqq); 2300 if (!rq) 2301 rq = cfqq->next_rq; 2302 2303 /* 2304 * insert request into driver dispatch list 2305 */ 2306 cfq_dispatch_insert(cfqd->queue, rq); 2307 2308 if (!cfqd->active_cic) { 2309 struct cfq_io_context *cic = RQ_CIC(rq); 2310 2311 atomic_long_inc(&cic->ioc->refcount); 2312 cfqd->active_cic = cic; 2313 } 2314 2315 return true; 2316} 2317 2318/* 2319 * Find the cfqq that we need to service and move a request from that to the 2320 * dispatch list 2321 */ 2322static int cfq_dispatch_requests(struct request_queue *q, int force) 2323{ 2324 struct cfq_data *cfqd = q->elevator->elevator_data; 2325 struct cfq_queue *cfqq; 2326 2327 if (!cfqd->busy_queues) 2328 return 0; 2329 2330 if (unlikely(force)) 2331 return cfq_forced_dispatch(cfqd); 2332 2333 cfqq = cfq_select_queue(cfqd); 2334 if (!cfqq) 2335 return 0; 2336 2337 /* 2338 * Dispatch a request from this cfqq, if it is allowed 2339 */ 2340 if (!cfq_dispatch_request(cfqd, cfqq)) 2341 return 0; 2342 2343 cfqq->slice_dispatch++; 2344 cfq_clear_cfqq_must_dispatch(cfqq); 2345 2346 /* 2347 * expire an async queue immediately if it has used up its slice. idle 2348 * queue always expire after 1 dispatch round. 2349 */ 2350 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 2351 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 2352 cfq_class_idle(cfqq))) { 2353 cfqq->slice_end = jiffies + 1; 2354 cfq_slice_expired(cfqd, 0); 2355 } 2356 2357 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 2358 return 1; 2359} 2360 2361/* 2362 * task holds one reference to the queue, dropped when task exits. each rq 2363 * in-flight on this queue also holds a reference, dropped when rq is freed. 2364 * 2365 * Each cfq queue took a reference on the parent group. Drop it now. 2366 * queue lock must be held here. 2367 */ 2368static void cfq_put_queue(struct cfq_queue *cfqq) 2369{ 2370 struct cfq_data *cfqd = cfqq->cfqd; 2371 struct cfq_group *cfqg; 2372 2373 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2374 2375 if (!atomic_dec_and_test(&cfqq->ref)) 2376 return; 2377 2378 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2379 BUG_ON(rb_first(&cfqq->sort_list)); 2380 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2381 cfqg = cfqq->cfqg; 2382 2383 if (unlikely(cfqd->active_queue == cfqq)) { 2384 __cfq_slice_expired(cfqd, cfqq, 0); 2385 cfq_schedule_dispatch(cfqd); 2386 } 2387 2388 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2389 kmem_cache_free(cfq_pool, cfqq); 2390 cfq_put_cfqg(cfqg); 2391 if (cfqq->orig_cfqg) 2392 cfq_put_cfqg(cfqq->orig_cfqg); 2393} 2394 2395/* 2396 * Must always be called with the rcu_read_lock() held 2397 */ 2398static void 2399__call_for_each_cic(struct io_context *ioc, 2400 void (*func)(struct io_context *, struct cfq_io_context *)) 2401{ 2402 struct cfq_io_context *cic; 2403 struct hlist_node *n; 2404 2405 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2406 func(ioc, cic); 2407} 2408 2409/* 2410 * Call func for each cic attached to this ioc. 2411 */ 2412static void 2413call_for_each_cic(struct io_context *ioc, 2414 void (*func)(struct io_context *, struct cfq_io_context *)) 2415{ 2416 rcu_read_lock(); 2417 __call_for_each_cic(ioc, func); 2418 rcu_read_unlock(); 2419} 2420 2421static void cfq_cic_free_rcu(struct rcu_head *head) 2422{ 2423 struct cfq_io_context *cic; 2424 2425 cic = container_of(head, struct cfq_io_context, rcu_head); 2426 2427 kmem_cache_free(cfq_ioc_pool, cic); 2428 elv_ioc_count_dec(cfq_ioc_count); 2429 2430 if (ioc_gone) { 2431 /* 2432 * CFQ scheduler is exiting, grab exit lock and check 2433 * the pending io context count. If it hits zero, 2434 * complete ioc_gone and set it back to NULL 2435 */ 2436 spin_lock(&ioc_gone_lock); 2437 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2438 complete(ioc_gone); 2439 ioc_gone = NULL; 2440 } 2441 spin_unlock(&ioc_gone_lock); 2442 } 2443} 2444 2445static void cfq_cic_free(struct cfq_io_context *cic) 2446{ 2447 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2448} 2449 2450static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2451{ 2452 unsigned long flags; 2453 2454 BUG_ON(!cic->dead_key); 2455 2456 spin_lock_irqsave(&ioc->lock, flags); 2457 radix_tree_delete(&ioc->radix_root, cic->dead_key); 2458 hlist_del_rcu(&cic->cic_list); 2459 spin_unlock_irqrestore(&ioc->lock, flags); 2460 2461 cfq_cic_free(cic); 2462} 2463 2464/* 2465 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 2466 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2467 * and ->trim() which is called with the task lock held 2468 */ 2469static void cfq_free_io_context(struct io_context *ioc) 2470{ 2471 /* 2472 * ioc->refcount is zero here, or we are called from elv_unregister(), 2473 * so no more cic's are allowed to be linked into this ioc. So it 2474 * should be ok to iterate over the known list, we will see all cic's 2475 * since no new ones are added. 2476 */ 2477 __call_for_each_cic(ioc, cic_free_func); 2478} 2479 2480static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2481{ 2482 struct cfq_queue *__cfqq, *next; 2483 2484 if (unlikely(cfqq == cfqd->active_queue)) { 2485 __cfq_slice_expired(cfqd, cfqq, 0); 2486 cfq_schedule_dispatch(cfqd); 2487 } 2488 2489 /* 2490 * If this queue was scheduled to merge with another queue, be 2491 * sure to drop the reference taken on that queue (and others in 2492 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 2493 */ 2494 __cfqq = cfqq->new_cfqq; 2495 while (__cfqq) { 2496 if (__cfqq == cfqq) { 2497 WARN(1, "cfqq->new_cfqq loop detected\n"); 2498 break; 2499 } 2500 next = __cfqq->new_cfqq; 2501 cfq_put_queue(__cfqq); 2502 __cfqq = next; 2503 } 2504 2505 cfq_put_queue(cfqq); 2506} 2507 2508static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2509 struct cfq_io_context *cic) 2510{ 2511 struct io_context *ioc = cic->ioc; 2512 2513 list_del_init(&cic->queue_list); 2514 2515 /* 2516 * Make sure key == NULL is seen for dead queues 2517 */ 2518 smp_wmb(); 2519 cic->dead_key = (unsigned long) cic->key; 2520 cic->key = NULL; 2521 2522 if (ioc->ioc_data == cic) 2523 rcu_assign_pointer(ioc->ioc_data, NULL); 2524 2525 if (cic->cfqq[BLK_RW_ASYNC]) { 2526 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2527 cic->cfqq[BLK_RW_ASYNC] = NULL; 2528 } 2529 2530 if (cic->cfqq[BLK_RW_SYNC]) { 2531 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2532 cic->cfqq[BLK_RW_SYNC] = NULL; 2533 } 2534} 2535 2536static void cfq_exit_single_io_context(struct io_context *ioc, 2537 struct cfq_io_context *cic) 2538{ 2539 struct cfq_data *cfqd = cic->key; 2540 2541 if (cfqd) { 2542 struct request_queue *q = cfqd->queue; 2543 unsigned long flags; 2544 2545 spin_lock_irqsave(q->queue_lock, flags); 2546 2547 /* 2548 * Ensure we get a fresh copy of the ->key to prevent 2549 * race between exiting task and queue 2550 */ 2551 smp_read_barrier_depends(); 2552 if (cic->key) 2553 __cfq_exit_single_io_context(cfqd, cic); 2554 2555 spin_unlock_irqrestore(q->queue_lock, flags); 2556 } 2557} 2558 2559/* 2560 * The process that ioc belongs to has exited, we need to clean up 2561 * and put the internal structures we have that belongs to that process. 2562 */ 2563static void cfq_exit_io_context(struct io_context *ioc) 2564{ 2565 call_for_each_cic(ioc, cfq_exit_single_io_context); 2566} 2567 2568static struct cfq_io_context * 2569cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2570{ 2571 struct cfq_io_context *cic; 2572 2573 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2574 cfqd->queue->node); 2575 if (cic) { 2576 cic->last_end_request = jiffies; 2577 INIT_LIST_HEAD(&cic->queue_list); 2578 INIT_HLIST_NODE(&cic->cic_list); 2579 cic->dtor = cfq_free_io_context; 2580 cic->exit = cfq_exit_io_context; 2581 elv_ioc_count_inc(cfq_ioc_count); 2582 } 2583 2584 return cic; 2585} 2586 2587static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2588{ 2589 struct task_struct *tsk = current; 2590 int ioprio_class; 2591 2592 if (!cfq_cfqq_prio_changed(cfqq)) 2593 return; 2594 2595 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 2596 switch (ioprio_class) { 2597 default: 2598 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2599 case IOPRIO_CLASS_NONE: 2600 /* 2601 * no prio set, inherit CPU scheduling settings 2602 */ 2603 cfqq->ioprio = task_nice_ioprio(tsk); 2604 cfqq->ioprio_class = task_nice_ioclass(tsk); 2605 break; 2606 case IOPRIO_CLASS_RT: 2607 cfqq->ioprio = task_ioprio(ioc); 2608 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2609 break; 2610 case IOPRIO_CLASS_BE: 2611 cfqq->ioprio = task_ioprio(ioc); 2612 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2613 break; 2614 case IOPRIO_CLASS_IDLE: 2615 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 2616 cfqq->ioprio = 7; 2617 cfq_clear_cfqq_idle_window(cfqq); 2618 break; 2619 } 2620 2621 /* 2622 * keep track of original prio settings in case we have to temporarily 2623 * elevate the priority of this queue 2624 */ 2625 cfqq->org_ioprio = cfqq->ioprio; 2626 cfqq->org_ioprio_class = cfqq->ioprio_class; 2627 cfq_clear_cfqq_prio_changed(cfqq); 2628} 2629 2630static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2631{ 2632 struct cfq_data *cfqd = cic->key; 2633 struct cfq_queue *cfqq; 2634 unsigned long flags; 2635 2636 if (unlikely(!cfqd)) 2637 return; 2638 2639 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2640 2641 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2642 if (cfqq) { 2643 struct cfq_queue *new_cfqq; 2644 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2645 GFP_ATOMIC); 2646 if (new_cfqq) { 2647 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2648 cfq_put_queue(cfqq); 2649 } 2650 } 2651 2652 cfqq = cic->cfqq[BLK_RW_SYNC]; 2653 if (cfqq) 2654 cfq_mark_cfqq_prio_changed(cfqq); 2655 2656 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2657} 2658 2659static void cfq_ioc_set_ioprio(struct io_context *ioc) 2660{ 2661 call_for_each_cic(ioc, changed_ioprio); 2662 ioc->ioprio_changed = 0; 2663} 2664 2665static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2666 pid_t pid, bool is_sync) 2667{ 2668 RB_CLEAR_NODE(&cfqq->rb_node); 2669 RB_CLEAR_NODE(&cfqq->p_node); 2670 INIT_LIST_HEAD(&cfqq->fifo); 2671 2672 atomic_set(&cfqq->ref, 0); 2673 cfqq->cfqd = cfqd; 2674 2675 cfq_mark_cfqq_prio_changed(cfqq); 2676 2677 if (is_sync) { 2678 if (!cfq_class_idle(cfqq)) 2679 cfq_mark_cfqq_idle_window(cfqq); 2680 cfq_mark_cfqq_sync(cfqq); 2681 } 2682 cfqq->pid = pid; 2683} 2684 2685#ifdef CONFIG_CFQ_GROUP_IOSCHED 2686static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2687{ 2688 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2689 struct cfq_data *cfqd = cic->key; 2690 unsigned long flags; 2691 struct request_queue *q; 2692 2693 if (unlikely(!cfqd)) 2694 return; 2695 2696 q = cfqd->queue; 2697 2698 spin_lock_irqsave(q->queue_lock, flags); 2699 2700 if (sync_cfqq) { 2701 /* 2702 * Drop reference to sync queue. A new sync queue will be 2703 * assigned in new group upon arrival of a fresh request. 2704 */ 2705 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 2706 cic_set_cfqq(cic, NULL, 1); 2707 cfq_put_queue(sync_cfqq); 2708 } 2709 2710 spin_unlock_irqrestore(q->queue_lock, flags); 2711} 2712 2713static void cfq_ioc_set_cgroup(struct io_context *ioc) 2714{ 2715 call_for_each_cic(ioc, changed_cgroup); 2716 ioc->cgroup_changed = 0; 2717} 2718#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2719 2720static struct cfq_queue * 2721cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2722 struct io_context *ioc, gfp_t gfp_mask) 2723{ 2724 struct cfq_queue *cfqq, *new_cfqq = NULL; 2725 struct cfq_io_context *cic; 2726 struct cfq_group *cfqg; 2727 2728retry: 2729 cfqg = cfq_get_cfqg(cfqd, 1); 2730 cic = cfq_cic_lookup(cfqd, ioc); 2731 /* cic always exists here */ 2732 cfqq = cic_to_cfqq(cic, is_sync); 2733 2734 /* 2735 * Always try a new alloc if we fell back to the OOM cfqq 2736 * originally, since it should just be a temporary situation. 2737 */ 2738 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2739 cfqq = NULL; 2740 if (new_cfqq) { 2741 cfqq = new_cfqq; 2742 new_cfqq = NULL; 2743 } else if (gfp_mask & __GFP_WAIT) { 2744 spin_unlock_irq(cfqd->queue->queue_lock); 2745 new_cfqq = kmem_cache_alloc_node(cfq_pool, 2746 gfp_mask | __GFP_ZERO, 2747 cfqd->queue->node); 2748 spin_lock_irq(cfqd->queue->queue_lock); 2749 if (new_cfqq) 2750 goto retry; 2751 } else { 2752 cfqq = kmem_cache_alloc_node(cfq_pool, 2753 gfp_mask | __GFP_ZERO, 2754 cfqd->queue->node); 2755 } 2756 2757 if (cfqq) { 2758 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2759 cfq_init_prio_data(cfqq, ioc); 2760 cfq_link_cfqq_cfqg(cfqq, cfqg); 2761 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2762 } else 2763 cfqq = &cfqd->oom_cfqq; 2764 } 2765 2766 if (new_cfqq) 2767 kmem_cache_free(cfq_pool, new_cfqq); 2768 2769 return cfqq; 2770} 2771 2772static struct cfq_queue ** 2773cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 2774{ 2775 switch (ioprio_class) { 2776 case IOPRIO_CLASS_RT: 2777 return &cfqd->async_cfqq[0][ioprio]; 2778 case IOPRIO_CLASS_BE: 2779 return &cfqd->async_cfqq[1][ioprio]; 2780 case IOPRIO_CLASS_IDLE: 2781 return &cfqd->async_idle_cfqq; 2782 default: 2783 BUG(); 2784 } 2785} 2786 2787static struct cfq_queue * 2788cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 2789 gfp_t gfp_mask) 2790{ 2791 const int ioprio = task_ioprio(ioc); 2792 const int ioprio_class = task_ioprio_class(ioc); 2793 struct cfq_queue **async_cfqq = NULL; 2794 struct cfq_queue *cfqq = NULL; 2795 2796 if (!is_sync) { 2797 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 2798 cfqq = *async_cfqq; 2799 } 2800 2801 if (!cfqq) 2802 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 2803 2804 /* 2805 * pin the queue now that it's allocated, scheduler exit will prune it 2806 */ 2807 if (!is_sync && !(*async_cfqq)) { 2808 atomic_inc(&cfqq->ref); 2809 *async_cfqq = cfqq; 2810 } 2811 2812 atomic_inc(&cfqq->ref); 2813 return cfqq; 2814} 2815 2816/* 2817 * We drop cfq io contexts lazily, so we may find a dead one. 2818 */ 2819static void 2820cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2821 struct cfq_io_context *cic) 2822{ 2823 unsigned long flags; 2824 2825 WARN_ON(!list_empty(&cic->queue_list)); 2826 2827 spin_lock_irqsave(&ioc->lock, flags); 2828 2829 BUG_ON(ioc->ioc_data == cic); 2830 2831 radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); 2832 hlist_del_rcu(&cic->cic_list); 2833 spin_unlock_irqrestore(&ioc->lock, flags); 2834 2835 cfq_cic_free(cic); 2836} 2837 2838static struct cfq_io_context * 2839cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 2840{ 2841 struct cfq_io_context *cic; 2842 unsigned long flags; 2843 void *k; 2844 2845 if (unlikely(!ioc)) 2846 return NULL; 2847 2848 rcu_read_lock(); 2849 2850 /* 2851 * we maintain a last-hit cache, to avoid browsing over the tree 2852 */ 2853 cic = rcu_dereference(ioc->ioc_data); 2854 if (cic && cic->key == cfqd) { 2855 rcu_read_unlock(); 2856 return cic; 2857 } 2858 2859 do { 2860 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); 2861 rcu_read_unlock(); 2862 if (!cic) 2863 break; 2864 /* ->key must be copied to avoid race with cfq_exit_queue() */ 2865 k = cic->key; 2866 if (unlikely(!k)) { 2867 cfq_drop_dead_cic(cfqd, ioc, cic); 2868 rcu_read_lock(); 2869 continue; 2870 } 2871 2872 spin_lock_irqsave(&ioc->lock, flags); 2873 rcu_assign_pointer(ioc->ioc_data, cic); 2874 spin_unlock_irqrestore(&ioc->lock, flags); 2875 break; 2876 } while (1); 2877 2878 return cic; 2879} 2880 2881/* 2882 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 2883 * the process specific cfq io context when entered from the block layer. 2884 * Also adds the cic to a per-cfqd list, used when this queue is removed. 2885 */ 2886static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 2887 struct cfq_io_context *cic, gfp_t gfp_mask) 2888{ 2889 unsigned long flags; 2890 int ret; 2891 2892 ret = radix_tree_preload(gfp_mask); 2893 if (!ret) { 2894 cic->ioc = ioc; 2895 cic->key = cfqd; 2896 2897 spin_lock_irqsave(&ioc->lock, flags); 2898 ret = radix_tree_insert(&ioc->radix_root, 2899 (unsigned long) cfqd, cic); 2900 if (!ret) 2901 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 2902 spin_unlock_irqrestore(&ioc->lock, flags); 2903 2904 radix_tree_preload_end(); 2905 2906 if (!ret) { 2907 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2908 list_add(&cic->queue_list, &cfqd->cic_list); 2909 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2910 } 2911 } 2912 2913 if (ret) 2914 printk(KERN_ERR "cfq: cic link failed!\n"); 2915 2916 return ret; 2917} 2918 2919/* 2920 * Setup general io context and cfq io context. There can be several cfq 2921 * io contexts per general io context, if this process is doing io to more 2922 * than one device managed by cfq. 2923 */ 2924static struct cfq_io_context * 2925cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2926{ 2927 struct io_context *ioc = NULL; 2928 struct cfq_io_context *cic; 2929 2930 might_sleep_if(gfp_mask & __GFP_WAIT); 2931 2932 ioc = get_io_context(gfp_mask, cfqd->queue->node); 2933 if (!ioc) 2934 return NULL; 2935 2936 cic = cfq_cic_lookup(cfqd, ioc); 2937 if (cic) 2938 goto out; 2939 2940 cic = cfq_alloc_io_context(cfqd, gfp_mask); 2941 if (cic == NULL) 2942 goto err; 2943 2944 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 2945 goto err_free; 2946 2947out: 2948 smp_read_barrier_depends(); 2949 if (unlikely(ioc->ioprio_changed)) 2950 cfq_ioc_set_ioprio(ioc); 2951 2952#ifdef CONFIG_CFQ_GROUP_IOSCHED 2953 if (unlikely(ioc->cgroup_changed)) 2954 cfq_ioc_set_cgroup(ioc); 2955#endif 2956 return cic; 2957err_free: 2958 cfq_cic_free(cic); 2959err: 2960 put_io_context(ioc); 2961 return NULL; 2962} 2963 2964static void 2965cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 2966{ 2967 unsigned long elapsed = jiffies - cic->last_end_request; 2968 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 2969 2970 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 2971 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 2972 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 2973} 2974 2975static void 2976cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2977 struct request *rq) 2978{ 2979 sector_t sdist; 2980 u64 total; 2981 2982 if (!cfqq->last_request_pos) 2983 sdist = 0; 2984 else if (cfqq->last_request_pos < blk_rq_pos(rq)) 2985 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 2986 else 2987 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 2988 2989 /* 2990 * Don't allow the seek distance to get too large from the 2991 * odd fragment, pagein, etc 2992 */ 2993 if (cfqq->seek_samples <= 60) /* second&third seek */ 2994 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); 2995 else 2996 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); 2997 2998 cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; 2999 cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; 3000 total = cfqq->seek_total + (cfqq->seek_samples/2); 3001 do_div(total, cfqq->seek_samples); 3002 cfqq->seek_mean = (sector_t)total; 3003 3004 /* 3005 * If this cfqq is shared between multiple processes, check to 3006 * make sure that those processes are still issuing I/Os within 3007 * the mean seek distance. If not, it may be time to break the 3008 * queues apart again. 3009 */ 3010 if (cfq_cfqq_coop(cfqq)) { 3011 if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) 3012 cfqq->seeky_start = jiffies; 3013 else if (!CFQQ_SEEKY(cfqq)) 3014 cfqq->seeky_start = 0; 3015 } 3016} 3017 3018/* 3019 * Disable idle window if the process thinks too long or seeks so much that 3020 * it doesn't matter 3021 */ 3022static void 3023cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3024 struct cfq_io_context *cic) 3025{ 3026 int old_idle, enable_idle; 3027 3028 /* 3029 * Don't idle for async or idle io prio class 3030 */ 3031 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3032 return; 3033 3034 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3035 3036 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3037 cfq_mark_cfqq_deep(cfqq); 3038 3039 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3040 (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) 3041 && CFQQ_SEEKY(cfqq))) 3042 enable_idle = 0; 3043 else if (sample_valid(cic->ttime_samples)) { 3044 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3045 enable_idle = 0; 3046 else 3047 enable_idle = 1; 3048 } 3049 3050 if (old_idle != enable_idle) { 3051 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3052 if (enable_idle) 3053 cfq_mark_cfqq_idle_window(cfqq); 3054 else 3055 cfq_clear_cfqq_idle_window(cfqq); 3056 } 3057} 3058 3059/* 3060 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3061 * no or if we aren't sure, a 1 will cause a preempt. 3062 */ 3063static bool 3064cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3065 struct request *rq) 3066{ 3067 struct cfq_queue *cfqq; 3068 3069 cfqq = cfqd->active_queue; 3070 if (!cfqq) 3071 return false; 3072 3073 if (cfq_class_idle(new_cfqq)) 3074 return false; 3075 3076 if (cfq_class_idle(cfqq)) 3077 return true; 3078 3079 /* 3080 * if the new request is sync, but the currently running queue is 3081 * not, let the sync request have priority. 3082 */ 3083 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3084 return true; 3085 3086 if (new_cfqq->cfqg != cfqq->cfqg) 3087 return false; 3088 3089 if (cfq_slice_used(cfqq)) 3090 return true; 3091 3092 /* Allow preemption only if we are idling on sync-noidle tree */ 3093 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3094 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3095 new_cfqq->service_tree->count == 2 && 3096 RB_EMPTY_ROOT(&cfqq->sort_list)) 3097 return true; 3098 3099 /* 3100 * So both queues are sync. Let the new request get disk time if 3101 * it's a metadata request and the current queue is doing regular IO. 3102 */ 3103 if (rq_is_meta(rq) && !cfqq->meta_pending) 3104 return true; 3105 3106 /* 3107 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3108 */ 3109 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3110 return true; 3111 3112 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3113 return false; 3114 3115 /* 3116 * if this request is as-good as one we would expect from the 3117 * current cfqq, let it preempt 3118 */ 3119 if (cfq_rq_close(cfqd, cfqq, rq)) 3120 return true; 3121 3122 return false; 3123} 3124 3125/* 3126 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3127 * let it have half of its nominal slice. 3128 */ 3129static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3130{ 3131 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3132 cfq_slice_expired(cfqd, 1); 3133 3134 /* 3135 * Put the new queue at the front of the of the current list, 3136 * so we know that it will be selected next. 3137 */ 3138 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3139 3140 cfq_service_tree_add(cfqd, cfqq, 1); 3141 3142 cfqq->slice_end = 0; 3143 cfq_mark_cfqq_slice_new(cfqq); 3144} 3145 3146/* 3147 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3148 * something we should do about it 3149 */ 3150static void 3151cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3152 struct request *rq) 3153{ 3154 struct cfq_io_context *cic = RQ_CIC(rq); 3155 3156 cfqd->rq_queued++; 3157 if (rq_is_meta(rq)) 3158 cfqq->meta_pending++; 3159 3160 cfq_update_io_thinktime(cfqd, cic); 3161 cfq_update_io_seektime(cfqd, cfqq, rq); 3162 cfq_update_idle_window(cfqd, cfqq, cic); 3163 3164 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3165 3166 if (cfqq == cfqd->active_queue) { 3167 if (cfq_cfqq_wait_busy(cfqq)) { 3168 cfq_clear_cfqq_wait_busy(cfqq); 3169 cfq_mark_cfqq_wait_busy_done(cfqq); 3170 } 3171 /* 3172 * Remember that we saw a request from this process, but 3173 * don't start queuing just yet. Otherwise we risk seeing lots 3174 * of tiny requests, because we disrupt the normal plugging 3175 * and merging. If the request is already larger than a single 3176 * page, let it rip immediately. For that case we assume that 3177 * merging is already done. Ditto for a busy system that 3178 * has other work pending, don't risk delaying until the 3179 * idle timer unplug to continue working. 3180 */ 3181 if (cfq_cfqq_wait_request(cfqq)) { 3182 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3183 cfqd->busy_queues > 1) { 3184 del_timer(&cfqd->idle_slice_timer); 3185 __blk_run_queue(cfqd->queue); 3186 } else 3187 cfq_mark_cfqq_must_dispatch(cfqq); 3188 } 3189 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3190 /* 3191 * not the active queue - expire current slice if it is 3192 * idle and has expired it's mean thinktime or this new queue 3193 * has some old slice time left and is of higher priority or 3194 * this new queue is RT and the current one is BE 3195 */ 3196 cfq_preempt_queue(cfqd, cfqq); 3197 __blk_run_queue(cfqd->queue); 3198 } 3199} 3200 3201static void cfq_insert_request(struct request_queue *q, struct request *rq) 3202{ 3203 struct cfq_data *cfqd = q->elevator->elevator_data; 3204 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3205 3206 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3207 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3208 3209 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3210 list_add_tail(&rq->queuelist, &cfqq->fifo); 3211 cfq_add_rq_rb(rq); 3212 3213 cfq_rq_enqueued(cfqd, cfqq, rq); 3214} 3215 3216/* 3217 * Update hw_tag based on peak queue depth over 50 samples under 3218 * sufficient load. 3219 */ 3220static void cfq_update_hw_tag(struct cfq_data *cfqd) 3221{ 3222 struct cfq_queue *cfqq = cfqd->active_queue; 3223 3224 if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) 3225 cfqd->hw_tag_est_depth = rq_in_driver(cfqd); 3226 3227 if (cfqd->hw_tag == 1) 3228 return; 3229 3230 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3231 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 3232 return; 3233 3234 /* 3235 * If active queue hasn't enough requests and can idle, cfq might not 3236 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 3237 * case 3238 */ 3239 if (cfqq && cfq_cfqq_idle_window(cfqq) && 3240 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 3241 CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) 3242 return; 3243 3244 if (cfqd->hw_tag_samples++ < 50) 3245 return; 3246 3247 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3248 cfqd->hw_tag = 1; 3249 else 3250 cfqd->hw_tag = 0; 3251} 3252 3253static void cfq_completed_request(struct request_queue *q, struct request *rq) 3254{ 3255 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3256 struct cfq_data *cfqd = cfqq->cfqd; 3257 const int sync = rq_is_sync(rq); 3258 unsigned long now; 3259 3260 now = jiffies; 3261 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); 3262 3263 cfq_update_hw_tag(cfqd); 3264 3265 WARN_ON(!cfqd->rq_in_driver[sync]); 3266 WARN_ON(!cfqq->dispatched); 3267 cfqd->rq_in_driver[sync]--; 3268 cfqq->dispatched--; 3269 3270 if (cfq_cfqq_sync(cfqq)) 3271 cfqd->sync_flight--; 3272 3273 if (sync) { 3274 RQ_CIC(rq)->last_end_request = now; 3275 cfqd->last_end_sync_rq = now; 3276 } 3277 3278 /* 3279 * If this is the active queue, check if it needs to be expired, 3280 * or if we want to idle in case it has no pending requests. 3281 */ 3282 if (cfqd->active_queue == cfqq) { 3283 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3284 3285 if (cfq_cfqq_slice_new(cfqq)) { 3286 cfq_set_prio_slice(cfqd, cfqq); 3287 cfq_clear_cfqq_slice_new(cfqq); 3288 } 3289 3290 /* 3291 * If this queue consumed its slice and this is last queue 3292 * in the group, wait for next request before we expire 3293 * the queue 3294 */ 3295 if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { 3296 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3297 cfq_mark_cfqq_wait_busy(cfqq); 3298 } 3299 3300 /* 3301 * Idling is not enabled on: 3302 * - expired queues 3303 * - idle-priority queues 3304 * - async queues 3305 * - queues with still some requests queued 3306 * - when there is a close cooperator 3307 */ 3308 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3309 cfq_slice_expired(cfqd, 1); 3310 else if (sync && cfqq_empty && 3311 !cfq_close_cooperator(cfqd, cfqq)) { 3312 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); 3313 /* 3314 * Idling is enabled for SYNC_WORKLOAD. 3315 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3316 * only if we processed at least one !rq_noidle request 3317 */ 3318 if (cfqd->serving_type == SYNC_WORKLOAD 3319 || cfqd->noidle_tree_requires_idle 3320 || cfqq->cfqg->nr_cfqq == 1) 3321 cfq_arm_slice_timer(cfqd); 3322 } 3323 } 3324 3325 if (!rq_in_driver(cfqd)) 3326 cfq_schedule_dispatch(cfqd); 3327} 3328 3329/* 3330 * we temporarily boost lower priority queues if they are holding fs exclusive 3331 * resources. they are boosted to normal prio (CLASS_BE/4) 3332 */ 3333static void cfq_prio_boost(struct cfq_queue *cfqq) 3334{ 3335 if (has_fs_excl()) { 3336 /* 3337 * boost idle prio on transactions that would lock out other 3338 * users of the filesystem 3339 */ 3340 if (cfq_class_idle(cfqq)) 3341 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3342 if (cfqq->ioprio > IOPRIO_NORM) 3343 cfqq->ioprio = IOPRIO_NORM; 3344 } else { 3345 /* 3346 * unboost the queue (if needed) 3347 */ 3348 cfqq->ioprio_class = cfqq->org_ioprio_class; 3349 cfqq->ioprio = cfqq->org_ioprio; 3350 } 3351} 3352 3353static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3354{ 3355 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3356 cfq_mark_cfqq_must_alloc_slice(cfqq); 3357 return ELV_MQUEUE_MUST; 3358 } 3359 3360 return ELV_MQUEUE_MAY; 3361} 3362 3363static int cfq_may_queue(struct request_queue *q, int rw) 3364{ 3365 struct cfq_data *cfqd = q->elevator->elevator_data; 3366 struct task_struct *tsk = current; 3367 struct cfq_io_context *cic; 3368 struct cfq_queue *cfqq; 3369 3370 /* 3371 * don't force setup of a queue from here, as a call to may_queue 3372 * does not necessarily imply that a request actually will be queued. 3373 * so just lookup a possibly existing queue, or return 'may queue' 3374 * if that fails 3375 */ 3376 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3377 if (!cic) 3378 return ELV_MQUEUE_MAY; 3379 3380 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3381 if (cfqq) { 3382 cfq_init_prio_data(cfqq, cic->ioc); 3383 cfq_prio_boost(cfqq); 3384 3385 return __cfq_may_queue(cfqq); 3386 } 3387 3388 return ELV_MQUEUE_MAY; 3389} 3390 3391/* 3392 * queue lock held here 3393 */ 3394static void cfq_put_request(struct request *rq) 3395{ 3396 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3397 3398 if (cfqq) { 3399 const int rw = rq_data_dir(rq); 3400 3401 BUG_ON(!cfqq->allocated[rw]); 3402 cfqq->allocated[rw]--; 3403 3404 put_io_context(RQ_CIC(rq)->ioc); 3405 3406 rq->elevator_private = NULL; 3407 rq->elevator_private2 = NULL; 3408 3409 cfq_put_queue(cfqq); 3410 } 3411} 3412 3413static struct cfq_queue * 3414cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3415 struct cfq_queue *cfqq) 3416{ 3417 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3418 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3419 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3420 cfq_put_queue(cfqq); 3421 return cic_to_cfqq(cic, 1); 3422} 3423 3424static int should_split_cfqq(struct cfq_queue *cfqq) 3425{ 3426 if (cfqq->seeky_start && 3427 time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) 3428 return 1; 3429 return 0; 3430} 3431 3432/* 3433 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3434 * was the last process referring to said cfqq. 3435 */ 3436static struct cfq_queue * 3437split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3438{ 3439 if (cfqq_process_refs(cfqq) == 1) { 3440 cfqq->seeky_start = 0; 3441 cfqq->pid = current->pid; 3442 cfq_clear_cfqq_coop(cfqq); 3443 return cfqq; 3444 } 3445 3446 cic_set_cfqq(cic, NULL, 1); 3447 cfq_put_queue(cfqq); 3448 return NULL; 3449} 3450/* 3451 * Allocate cfq data structures associated with this request. 3452 */ 3453static int 3454cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3455{ 3456 struct cfq_data *cfqd = q->elevator->elevator_data; 3457 struct cfq_io_context *cic; 3458 const int rw = rq_data_dir(rq); 3459 const bool is_sync = rq_is_sync(rq); 3460 struct cfq_queue *cfqq; 3461 unsigned long flags; 3462 3463 might_sleep_if(gfp_mask & __GFP_WAIT); 3464 3465 cic = cfq_get_io_context(cfqd, gfp_mask); 3466 3467 spin_lock_irqsave(q->queue_lock, flags); 3468 3469 if (!cic) 3470 goto queue_fail; 3471 3472new_queue: 3473 cfqq = cic_to_cfqq(cic, is_sync); 3474 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3475 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3476 cic_set_cfqq(cic, cfqq, is_sync); 3477 } else { 3478 /* 3479 * If the queue was seeky for too long, break it apart. 3480 */ 3481 if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { 3482 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3483 cfqq = split_cfqq(cic, cfqq); 3484 if (!cfqq) 3485 goto new_queue; 3486 } 3487 3488 /* 3489 * Check to see if this queue is scheduled to merge with 3490 * another, closely cooperating queue. The merging of 3491 * queues happens here as it must be done in process context. 3492 * The reference on new_cfqq was taken in merge_cfqqs. 3493 */ 3494 if (cfqq->new_cfqq) 3495 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3496 } 3497 3498 cfqq->allocated[rw]++; 3499 atomic_inc(&cfqq->ref); 3500 3501 spin_unlock_irqrestore(q->queue_lock, flags); 3502 3503 rq->elevator_private = cic; 3504 rq->elevator_private2 = cfqq; 3505 return 0; 3506 3507queue_fail: 3508 if (cic) 3509 put_io_context(cic->ioc); 3510 3511 cfq_schedule_dispatch(cfqd); 3512 spin_unlock_irqrestore(q->queue_lock, flags); 3513 cfq_log(cfqd, "set_request fail"); 3514 return 1; 3515} 3516 3517static void cfq_kick_queue(struct work_struct *work) 3518{ 3519 struct cfq_data *cfqd = 3520 container_of(work, struct cfq_data, unplug_work); 3521 struct request_queue *q = cfqd->queue; 3522 3523 spin_lock_irq(q->queue_lock); 3524 __blk_run_queue(cfqd->queue); 3525 spin_unlock_irq(q->queue_lock); 3526} 3527 3528/* 3529 * Timer running if the active_queue is currently idling inside its time slice 3530 */ 3531static void cfq_idle_slice_timer(unsigned long data) 3532{ 3533 struct cfq_data *cfqd = (struct cfq_data *) data; 3534 struct cfq_queue *cfqq; 3535 unsigned long flags; 3536 int timed_out = 1; 3537 3538 cfq_log(cfqd, "idle timer fired"); 3539 3540 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3541 3542 cfqq = cfqd->active_queue; 3543 if (cfqq) { 3544 timed_out = 0; 3545 3546 /* 3547 * We saw a request before the queue expired, let it through 3548 */ 3549 if (cfq_cfqq_must_dispatch(cfqq)) 3550 goto out_kick; 3551 3552 /* 3553 * expired 3554 */ 3555 if (cfq_slice_used(cfqq)) 3556 goto expire; 3557 3558 /* 3559 * only expire and reinvoke request handler, if there are 3560 * other queues with pending requests 3561 */ 3562 if (!cfqd->busy_queues) 3563 goto out_cont; 3564 3565 /* 3566 * not expired and it has a request pending, let it dispatch 3567 */ 3568 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3569 goto out_kick; 3570 3571 /* 3572 * Queue depth flag is reset only when the idle didn't succeed 3573 */ 3574 cfq_clear_cfqq_deep(cfqq); 3575 } 3576expire: 3577 cfq_slice_expired(cfqd, timed_out); 3578out_kick: 3579 cfq_schedule_dispatch(cfqd); 3580out_cont: 3581 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3582} 3583 3584static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 3585{ 3586 del_timer_sync(&cfqd->idle_slice_timer); 3587 cancel_work_sync(&cfqd->unplug_work); 3588} 3589 3590static void cfq_put_async_queues(struct cfq_data *cfqd) 3591{ 3592 int i; 3593 3594 for (i = 0; i < IOPRIO_BE_NR; i++) { 3595 if (cfqd->async_cfqq[0][i]) 3596 cfq_put_queue(cfqd->async_cfqq[0][i]); 3597 if (cfqd->async_cfqq[1][i]) 3598 cfq_put_queue(cfqd->async_cfqq[1][i]); 3599 } 3600 3601 if (cfqd->async_idle_cfqq) 3602 cfq_put_queue(cfqd->async_idle_cfqq); 3603} 3604 3605static void cfq_exit_queue(struct elevator_queue *e) 3606{ 3607 struct cfq_data *cfqd = e->elevator_data; 3608 struct request_queue *q = cfqd->queue; 3609 3610 cfq_shutdown_timer_wq(cfqd); 3611 3612 spin_lock_irq(q->queue_lock); 3613 3614 if (cfqd->active_queue) 3615 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3616 3617 while (!list_empty(&cfqd->cic_list)) { 3618 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3619 struct cfq_io_context, 3620 queue_list); 3621 3622 __cfq_exit_single_io_context(cfqd, cic); 3623 } 3624 3625 cfq_put_async_queues(cfqd); 3626 cfq_release_cfq_groups(cfqd); 3627 blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3628 3629 spin_unlock_irq(q->queue_lock); 3630 3631 cfq_shutdown_timer_wq(cfqd); 3632 3633 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3634 synchronize_rcu(); 3635 kfree(cfqd); 3636} 3637 3638static void *cfq_init_queue(struct request_queue *q) 3639{ 3640 struct cfq_data *cfqd; 3641 int i, j; 3642 struct cfq_group *cfqg; 3643 struct cfq_rb_root *st; 3644 3645 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3646 if (!cfqd) 3647 return NULL; 3648 3649 /* Init root service tree */ 3650 cfqd->grp_service_tree = CFQ_RB_ROOT; 3651 3652 /* Init root group */ 3653 cfqg = &cfqd->root_group; 3654 for_each_cfqg_st(cfqg, i, j, st) 3655 *st = CFQ_RB_ROOT; 3656 RB_CLEAR_NODE(&cfqg->rb_node); 3657 3658 /* Give preference to root group over other groups */ 3659 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3660 3661#ifdef CONFIG_CFQ_GROUP_IOSCHED 3662 /* 3663 * Take a reference to root group which we never drop. This is just 3664 * to make sure that cfq_put_cfqg() does not try to kfree root group 3665 */ 3666 atomic_set(&cfqg->ref, 1); 3667 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 3668 0); 3669#endif 3670 /* 3671 * Not strictly needed (since RB_ROOT just clears the node and we 3672 * zeroed cfqd on alloc), but better be safe in case someone decides 3673 * to add magic to the rb code 3674 */ 3675 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3676 cfqd->prio_trees[i] = RB_ROOT; 3677 3678 /* 3679 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 3680 * Grab a permanent reference to it, so that the normal code flow 3681 * will not attempt to free it. 3682 */ 3683 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3684 atomic_inc(&cfqd->oom_cfqq.ref); 3685 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3686 3687 INIT_LIST_HEAD(&cfqd->cic_list); 3688 3689 cfqd->queue = q; 3690 3691 init_timer(&cfqd->idle_slice_timer); 3692 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3693 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3694 3695 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3696 3697 cfqd->cfq_quantum = cfq_quantum; 3698 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3699 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3700 cfqd->cfq_back_max = cfq_back_max; 3701 cfqd->cfq_back_penalty = cfq_back_penalty; 3702 cfqd->cfq_slice[0] = cfq_slice_async; 3703 cfqd->cfq_slice[1] = cfq_slice_sync; 3704 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3705 cfqd->cfq_slice_idle = cfq_slice_idle; 3706 cfqd->cfq_latency = 1; 3707 cfqd->cfq_group_isolation = 0; 3708 cfqd->hw_tag = -1; 3709 cfqd->last_end_sync_rq = jiffies; 3710 return cfqd; 3711} 3712 3713static void cfq_slab_kill(void) 3714{ 3715 /* 3716 * Caller already ensured that pending RCU callbacks are completed, 3717 * so we should have no busy allocations at this point. 3718 */ 3719 if (cfq_pool) 3720 kmem_cache_destroy(cfq_pool); 3721 if (cfq_ioc_pool) 3722 kmem_cache_destroy(cfq_ioc_pool); 3723} 3724 3725static int __init cfq_slab_setup(void) 3726{ 3727 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3728 if (!cfq_pool) 3729 goto fail; 3730 3731 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3732 if (!cfq_ioc_pool) 3733 goto fail; 3734 3735 return 0; 3736fail: 3737 cfq_slab_kill(); 3738 return -ENOMEM; 3739} 3740 3741/* 3742 * sysfs parts below --> 3743 */ 3744static ssize_t 3745cfq_var_show(unsigned int var, char *page) 3746{ 3747 return sprintf(page, "%d\n", var); 3748} 3749 3750static ssize_t 3751cfq_var_store(unsigned int *var, const char *page, size_t count) 3752{ 3753 char *p = (char *) page; 3754 3755 *var = simple_strtoul(p, &p, 10); 3756 return count; 3757} 3758 3759#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 3760static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 3761{ \ 3762 struct cfq_data *cfqd = e->elevator_data; \ 3763 unsigned int __data = __VAR; \ 3764 if (__CONV) \ 3765 __data = jiffies_to_msecs(__data); \ 3766 return cfq_var_show(__data, (page)); \ 3767} 3768SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 3769SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 3770SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 3771SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 3772SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 3773SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 3774SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 3775SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3776SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3777SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3778SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 3779#undef SHOW_FUNCTION 3780 3781#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 3782static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 3783{ \ 3784 struct cfq_data *cfqd = e->elevator_data; \ 3785 unsigned int __data; \ 3786 int ret = cfq_var_store(&__data, (page), count); \ 3787 if (__data < (MIN)) \ 3788 __data = (MIN); \ 3789 else if (__data > (MAX)) \ 3790 __data = (MAX); \ 3791 if (__CONV) \ 3792 *(__PTR) = msecs_to_jiffies(__data); \ 3793 else \ 3794 *(__PTR) = __data; \ 3795 return ret; \ 3796} 3797STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 3798STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 3799 UINT_MAX, 1); 3800STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 3801 UINT_MAX, 1); 3802STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 3803STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 3804 UINT_MAX, 0); 3805STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 3806STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 3807STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 3808STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3809 UINT_MAX, 0); 3810STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 3811STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); 3812#undef STORE_FUNCTION 3813 3814#define CFQ_ATTR(name) \ 3815 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 3816 3817static struct elv_fs_entry cfq_attrs[] = { 3818 CFQ_ATTR(quantum), 3819 CFQ_ATTR(fifo_expire_sync), 3820 CFQ_ATTR(fifo_expire_async), 3821 CFQ_ATTR(back_seek_max), 3822 CFQ_ATTR(back_seek_penalty), 3823 CFQ_ATTR(slice_sync), 3824 CFQ_ATTR(slice_async), 3825 CFQ_ATTR(slice_async_rq), 3826 CFQ_ATTR(slice_idle), 3827 CFQ_ATTR(low_latency), 3828 CFQ_ATTR(group_isolation), 3829 __ATTR_NULL 3830}; 3831 3832static struct elevator_type iosched_cfq = { 3833 .ops = { 3834 .elevator_merge_fn = cfq_merge, 3835 .elevator_merged_fn = cfq_merged_request, 3836 .elevator_merge_req_fn = cfq_merged_requests, 3837 .elevator_allow_merge_fn = cfq_allow_merge, 3838 .elevator_dispatch_fn = cfq_dispatch_requests, 3839 .elevator_add_req_fn = cfq_insert_request, 3840 .elevator_activate_req_fn = cfq_activate_request, 3841 .elevator_deactivate_req_fn = cfq_deactivate_request, 3842 .elevator_queue_empty_fn = cfq_queue_empty, 3843 .elevator_completed_req_fn = cfq_completed_request, 3844 .elevator_former_req_fn = elv_rb_former_request, 3845 .elevator_latter_req_fn = elv_rb_latter_request, 3846 .elevator_set_req_fn = cfq_set_request, 3847 .elevator_put_req_fn = cfq_put_request, 3848 .elevator_may_queue_fn = cfq_may_queue, 3849 .elevator_init_fn = cfq_init_queue, 3850 .elevator_exit_fn = cfq_exit_queue, 3851 .trim = cfq_free_io_context, 3852 }, 3853 .elevator_attrs = cfq_attrs, 3854 .elevator_name = "cfq", 3855 .elevator_owner = THIS_MODULE, 3856}; 3857 3858static int __init cfq_init(void) 3859{ 3860 /* 3861 * could be 0 on HZ < 1000 setups 3862 */ 3863 if (!cfq_slice_async) 3864 cfq_slice_async = 1; 3865 if (!cfq_slice_idle) 3866 cfq_slice_idle = 1; 3867 3868 if (cfq_slab_setup()) 3869 return -ENOMEM; 3870 3871 elv_register(&iosched_cfq); 3872 3873 return 0; 3874} 3875 3876static void __exit cfq_exit(void) 3877{ 3878 DECLARE_COMPLETION_ONSTACK(all_gone); 3879 elv_unregister(&iosched_cfq); 3880 ioc_gone = &all_gone; 3881 /* ioc_gone's update must be visible before reading ioc_count */ 3882 smp_wmb(); 3883 3884 /* 3885 * this also protects us from entering cfq_slab_kill() with 3886 * pending RCU callbacks 3887 */ 3888 if (elv_ioc_count_read(cfq_ioc_count)) 3889 wait_for_completion(&all_gone); 3890 cfq_slab_kill(); 3891} 3892 3893module_init(cfq_init); 3894module_exit(cfq_exit); 3895 3896MODULE_AUTHOR("Jens Axboe"); 3897MODULE_LICENSE("GPL"); 3898MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 3899