cfq-iosched.c revision c4e7893ebc3a5c507b53f59b9de448db20849944
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/slab.h> 11#include <linux/blkdev.h> 12#include <linux/elevator.h> 13#include <linux/jiffies.h> 14#include <linux/rbtree.h> 15#include <linux/ioprio.h> 16#include <linux/blktrace_api.h> 17#include "cfq.h" 18 19/* 20 * tunables 21 */ 22/* max queue in one round of service */ 23static const int cfq_quantum = 8; 24static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 25/* maximum backwards seek, in KiB */ 26static const int cfq_back_max = 16 * 1024; 27/* penalty of a backwards seek */ 28static const int cfq_back_penalty = 2; 29static const int cfq_slice_sync = HZ / 10; 30static int cfq_slice_async = HZ / 25; 31static const int cfq_slice_async_rq = 2; 32static int cfq_slice_idle = HZ / 125; 33static int cfq_group_idle = HZ / 125; 34static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 35static const int cfq_hist_divisor = 4; 36 37/* 38 * offset from end of service tree 39 */ 40#define CFQ_IDLE_DELAY (HZ / 5) 41 42/* 43 * below this threshold, we consider thinktime immediate 44 */ 45#define CFQ_MIN_TT (2) 46 47#define CFQ_SLICE_SCALE (5) 48#define CFQ_HW_QUEUE_MIN (5) 49#define CFQ_SERVICE_SHIFT 12 50 51#define CFQQ_SEEK_THR (sector_t)(8 * 100) 52#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) 53#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 55 56#define RQ_CIC(rq) \ 57 ((struct cfq_io_context *) (rq)->elevator_private) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 60 61static struct kmem_cache *cfq_pool; 62static struct kmem_cache *cfq_ioc_pool; 63 64static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 65static struct completion *ioc_gone; 66static DEFINE_SPINLOCK(ioc_gone_lock); 67 68static DEFINE_SPINLOCK(cic_index_lock); 69static DEFINE_IDA(cic_index_ida); 70 71#define CFQ_PRIO_LISTS IOPRIO_BE_NR 72#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 73#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 74 75#define sample_valid(samples) ((samples) > 80) 76#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 77 78/* 79 * Most of our rbtree usage is for sorting with min extraction, so 80 * if we cache the leftmost node we don't have to walk down the tree 81 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 82 * move this into the elevator for the rq sorting as well. 
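 *
 * For example, cfq_rb_first() below returns the cached ->left node
 * directly when it is set instead of walking the tree with rb_first()
 * on every lookup; the cache is only cleared and rebuilt when the
 * leftmost node is erased (see cfq_rb_erase()).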
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	unsigned total_weight;
	u64 min_vdisktime;
	struct rb_node *active;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_vdisktime = 0, }

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending metadata requests */
	int meta_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	struct cfq_group *orig_cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;
	unsigned int weight;
	bool on_st;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/* Per group busy queues average. Useful for workload slice calc. */
	unsigned int busy_queues_avg[2];
	/*
	 * rr lists of queues with requests, one rr for each priority class.
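 *
 * The array is indexed as service_trees[wl_prio_t][wl_type_t]: the first
 * index is BE_WORKLOAD or RT_WORKLOAD, the second is ASYNC_WORKLOAD,
 * SYNC_NOIDLE_WORKLOAD or SYNC_WORKLOAD (see service_tree_for()). The
 * idle class lives on service_tree_idle instead.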
191 * Counts are embedded in the cfq_rb_root 192 */ 193 struct cfq_rb_root service_trees[2][3]; 194 struct cfq_rb_root service_tree_idle; 195 196 unsigned long saved_workload_slice; 197 enum wl_type_t saved_workload; 198 enum wl_prio_t saved_serving_prio; 199 struct blkio_group blkg; 200#ifdef CONFIG_CFQ_GROUP_IOSCHED 201 struct hlist_node cfqd_node; 202 atomic_t ref; 203#endif 204 /* number of requests that are on the dispatch list or inside driver */ 205 int dispatched; 206}; 207 208/* 209 * Per block device queue structure 210 */ 211struct cfq_data { 212 struct request_queue *queue; 213 /* Root service tree for cfq_groups */ 214 struct cfq_rb_root grp_service_tree; 215 struct cfq_group root_group; 216 217 /* 218 * The priority currently being served 219 */ 220 enum wl_prio_t serving_prio; 221 enum wl_type_t serving_type; 222 unsigned long workload_expires; 223 struct cfq_group *serving_group; 224 bool noidle_tree_requires_idle; 225 226 /* 227 * Each priority tree is sorted by next_request position. These 228 * trees are used when determining if two or more queues are 229 * interleaving requests (see cfq_close_cooperator). 230 */ 231 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 232 233 unsigned int busy_queues; 234 235 int rq_in_driver; 236 int rq_in_flight[2]; 237 238 /* 239 * queue-depth detection 240 */ 241 int rq_queued; 242 int hw_tag; 243 /* 244 * hw_tag can be 245 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 246 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 247 * 0 => no NCQ 248 */ 249 int hw_tag_est_depth; 250 unsigned int hw_tag_samples; 251 252 /* 253 * idle window management 254 */ 255 struct timer_list idle_slice_timer; 256 struct work_struct unplug_work; 257 258 struct cfq_queue *active_queue; 259 struct cfq_io_context *active_cic; 260 261 /* 262 * async queue for each priority case 263 */ 264 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 265 struct cfq_queue *async_idle_cfqq; 266 267 sector_t last_position; 268 269 /* 270 * tunables, see top of file 271 */ 272 unsigned int cfq_quantum; 273 unsigned int cfq_fifo_expire[2]; 274 unsigned int cfq_back_penalty; 275 unsigned int cfq_back_max; 276 unsigned int cfq_slice[2]; 277 unsigned int cfq_slice_async_rq; 278 unsigned int cfq_slice_idle; 279 unsigned int cfq_group_idle; 280 unsigned int cfq_latency; 281 unsigned int cfq_group_isolation; 282 283 unsigned int cic_index; 284 struct list_head cic_list; 285 286 /* 287 * Fallback dummy cfqq for extreme OOM conditions 288 */ 289 struct cfq_queue oom_cfqq; 290 291 unsigned long last_delayed_sync; 292 293 /* List of cfq groups being managed on this device*/ 294 struct hlist_head cfqg_list; 295 struct rcu_head rcu; 296}; 297 298static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 299 300static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 301 enum wl_prio_t prio, 302 enum wl_type_t type) 303{ 304 if (!cfqg) 305 return NULL; 306 307 if (prio == IDLE_WORKLOAD) 308 return &cfqg->service_tree_idle; 309 310 return &cfqg->service_trees[prio][type]; 311} 312 313enum cfqq_state_flags { 314 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 315 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 316 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 317 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 318 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 319 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 320 CFQ_CFQQ_FLAG_prio_changed, /* task 
priority has changed */ 321 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 322 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 323 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 324 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 325 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 326 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 327}; 328 329#define CFQ_CFQQ_FNS(name) \ 330static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 331{ \ 332 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 333} \ 334static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 335{ \ 336 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 337} \ 338static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 339{ \ 340 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 341} 342 343CFQ_CFQQ_FNS(on_rr); 344CFQ_CFQQ_FNS(wait_request); 345CFQ_CFQQ_FNS(must_dispatch); 346CFQ_CFQQ_FNS(must_alloc_slice); 347CFQ_CFQQ_FNS(fifo_expire); 348CFQ_CFQQ_FNS(idle_window); 349CFQ_CFQQ_FNS(prio_changed); 350CFQ_CFQQ_FNS(slice_new); 351CFQ_CFQQ_FNS(sync); 352CFQ_CFQQ_FNS(coop); 353CFQ_CFQQ_FNS(split_coop); 354CFQ_CFQQ_FNS(deep); 355CFQ_CFQQ_FNS(wait_busy); 356#undef CFQ_CFQQ_FNS 357 358#ifdef CONFIG_CFQ_GROUP_IOSCHED 359#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 360 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 361 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 362 blkg_path(&(cfqq)->cfqg->blkg), ##args); 363 364#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 365 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 366 blkg_path(&(cfqg)->blkg), ##args); \ 367 368#else 369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 370 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 371#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 372#endif 373#define cfq_log(cfqd, fmt, args...) \ 374 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 375 376/* Traverses through cfq group service trees */ 377#define for_each_cfqg_st(cfqg, i, j, st) \ 378 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 379 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 380 : &cfqg->service_tree_idle; \ 381 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 382 (i == IDLE_WORKLOAD && j == 0); \ 383 j++, st = i < IDLE_WORKLOAD ? \ 384 &cfqg->service_trees[i][j]: NULL) \ 385 386 387static inline bool iops_mode(struct cfq_data *cfqd) 388{ 389 /* 390 * If we are not idling on queues and it is a NCQ drive, parallel 391 * execution of requests is on and measuring time is not possible 392 * in most of the cases until and unless we drive shallower queue 393 * depths and that becomes a performance bottleneck. In such cases 394 * switch to start providing fairness in terms of number of IOs. 
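 *
 * In practice this changes what the group is charged for a queue's
 * slice in cfq_group_served(): in iops_mode() the charge is the number
 * of requests the queue dispatched in its slice (cfqq->slice_dispatch)
 * rather than the wall-clock time the slice consumed.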
395 */ 396 if (!cfqd->cfq_slice_idle && cfqd->hw_tag) 397 return true; 398 else 399 return false; 400} 401 402static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 403{ 404 if (cfq_class_idle(cfqq)) 405 return IDLE_WORKLOAD; 406 if (cfq_class_rt(cfqq)) 407 return RT_WORKLOAD; 408 return BE_WORKLOAD; 409} 410 411 412static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 413{ 414 if (!cfq_cfqq_sync(cfqq)) 415 return ASYNC_WORKLOAD; 416 if (!cfq_cfqq_idle_window(cfqq)) 417 return SYNC_NOIDLE_WORKLOAD; 418 return SYNC_WORKLOAD; 419} 420 421static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 422 struct cfq_data *cfqd, 423 struct cfq_group *cfqg) 424{ 425 if (wl == IDLE_WORKLOAD) 426 return cfqg->service_tree_idle.count; 427 428 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 429 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 430 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 431} 432 433static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 434 struct cfq_group *cfqg) 435{ 436 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 437 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 438} 439 440static void cfq_dispatch_insert(struct request_queue *, struct request *); 441static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 442 struct io_context *, gfp_t); 443static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 444 struct io_context *); 445 446static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 447 bool is_sync) 448{ 449 return cic->cfqq[is_sync]; 450} 451 452static inline void cic_set_cfqq(struct cfq_io_context *cic, 453 struct cfq_queue *cfqq, bool is_sync) 454{ 455 cic->cfqq[is_sync] = cfqq; 456} 457 458#define CIC_DEAD_KEY 1ul 459#define CIC_DEAD_INDEX_SHIFT 1 460 461static inline void *cfqd_dead_key(struct cfq_data *cfqd) 462{ 463 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 464} 465 466static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 467{ 468 struct cfq_data *cfqd = cic->key; 469 470 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) 471 return NULL; 472 473 return cfqd; 474} 475 476/* 477 * We regard a request as SYNC, if it's either a read or has the SYNC bit 478 * set (in which case it could also be direct WRITE). 479 */ 480static inline bool cfq_bio_sync(struct bio *bio) 481{ 482 return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC); 483} 484 485/* 486 * scheduler run of queue, if there are requests pending and no one in the 487 * driver that will restart queueing 488 */ 489static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 490{ 491 if (cfqd->busy_queues) { 492 cfq_log(cfqd, "schedule dispatch"); 493 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 494 } 495} 496 497static int cfq_queue_empty(struct request_queue *q) 498{ 499 struct cfq_data *cfqd = q->elevator->elevator_data; 500 501 return !cfqd->rq_queued; 502} 503 504/* 505 * Scale schedule slice based on io priority. Use the sync time slice only 506 * if a queue is marked sync and has sync io queued. A sync queue with async 507 * io only, should not get full sync slice length. 
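 *
 * For example, with the default sync slice of HZ/10 (100ms) and
 * CFQ_SLICE_SCALE of 5, a sync queue at ioprio 0 gets roughly 180ms,
 * ioprio 4 (the default) gets 100ms and ioprio 7 gets roughly 40ms,
 * ignoring jiffy rounding.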
508 */ 509static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 510 unsigned short prio) 511{ 512 const int base_slice = cfqd->cfq_slice[sync]; 513 514 WARN_ON(prio >= IOPRIO_BE_NR); 515 516 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 517} 518 519static inline int 520cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 521{ 522 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 523} 524 525static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 526{ 527 u64 d = delta << CFQ_SERVICE_SHIFT; 528 529 d = d * BLKIO_WEIGHT_DEFAULT; 530 do_div(d, cfqg->weight); 531 return d; 532} 533 534static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 535{ 536 s64 delta = (s64)(vdisktime - min_vdisktime); 537 if (delta > 0) 538 min_vdisktime = vdisktime; 539 540 return min_vdisktime; 541} 542 543static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 544{ 545 s64 delta = (s64)(vdisktime - min_vdisktime); 546 if (delta < 0) 547 min_vdisktime = vdisktime; 548 549 return min_vdisktime; 550} 551 552static void update_min_vdisktime(struct cfq_rb_root *st) 553{ 554 u64 vdisktime = st->min_vdisktime; 555 struct cfq_group *cfqg; 556 557 if (st->active) { 558 cfqg = rb_entry_cfqg(st->active); 559 vdisktime = cfqg->vdisktime; 560 } 561 562 if (st->left) { 563 cfqg = rb_entry_cfqg(st->left); 564 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 565 } 566 567 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 568} 569 570/* 571 * get averaged number of queues of RT/BE priority. 572 * average is updated, with a formula that gives more weight to higher numbers, 573 * to quickly follows sudden increases and decrease slowly 574 */ 575 576static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 577 struct cfq_group *cfqg, bool rt) 578{ 579 unsigned min_q, max_q; 580 unsigned mult = cfq_hist_divisor - 1; 581 unsigned round = cfq_hist_divisor / 2; 582 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 583 584 min_q = min(cfqg->busy_queues_avg[rt], busy); 585 max_q = max(cfqg->busy_queues_avg[rt], busy); 586 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 587 cfq_hist_divisor; 588 return cfqg->busy_queues_avg[rt]; 589} 590 591static inline unsigned 592cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 593{ 594 struct cfq_rb_root *st = &cfqd->grp_service_tree; 595 596 return cfq_target_latency * cfqg->weight / st->total_weight; 597} 598 599static inline void 600cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 601{ 602 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 603 if (cfqd->cfq_latency) { 604 /* 605 * interested queues (we consider only the ones with the same 606 * priority class in the cfq group) 607 */ 608 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 609 cfq_class_rt(cfqq)); 610 unsigned sync_slice = cfqd->cfq_slice[1]; 611 unsigned expect_latency = sync_slice * iq; 612 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 613 614 if (expect_latency > group_slice) { 615 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 616 /* scale low_slice according to IO priority 617 * and sync vs async */ 618 unsigned low_slice = 619 min(slice, base_low_slice * slice / sync_slice); 620 /* the adapted slice value is scaled to fit all iqs 621 * into the target latency */ 622 slice = max(slice * group_slice / expect_latency, 623 low_slice); 624 } 625 } 626 cfqq->slice_start = jiffies; 627 cfqq->slice_end = jiffies + slice; 628 
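	/*
	 * Worked example of the cfq_latency scaling above, assuming default
	 * tunables, HZ == 1000 and a single group (so group_slice ==
	 * cfq_target_latency == 300ms) with six busy sync queues of this
	 * class: iq == 6, expect_latency == 6 * 100ms == 600ms > 300ms, so
	 * each 100ms slice is cut to 100 * 300/600 == 50ms, though never
	 * below low_slice (here min(100, 2 * 8 * 100/100) == 16ms).
	 */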
cfqq->allocated_slice = slice; 629 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 630} 631 632/* 633 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 634 * isn't valid until the first request from the dispatch is activated 635 * and the slice time set. 636 */ 637static inline bool cfq_slice_used(struct cfq_queue *cfqq) 638{ 639 if (cfq_cfqq_slice_new(cfqq)) 640 return 0; 641 if (time_before(jiffies, cfqq->slice_end)) 642 return 0; 643 644 return 1; 645} 646 647/* 648 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 649 * We choose the request that is closest to the head right now. Distance 650 * behind the head is penalized and only allowed to a certain extent. 651 */ 652static struct request * 653cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 654{ 655 sector_t s1, s2, d1 = 0, d2 = 0; 656 unsigned long back_max; 657#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 658#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 659 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 660 661 if (rq1 == NULL || rq1 == rq2) 662 return rq2; 663 if (rq2 == NULL) 664 return rq1; 665 666 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 667 return rq1; 668 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 669 return rq2; 670 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) 671 return rq1; 672 else if ((rq2->cmd_flags & REQ_META) && 673 !(rq1->cmd_flags & REQ_META)) 674 return rq2; 675 676 s1 = blk_rq_pos(rq1); 677 s2 = blk_rq_pos(rq2); 678 679 /* 680 * by definition, 1KiB is 2 sectors 681 */ 682 back_max = cfqd->cfq_back_max * 2; 683 684 /* 685 * Strict one way elevator _except_ in the case where we allow 686 * short backward seeks which are biased as twice the cost of a 687 * similar forward seek. 688 */ 689 if (s1 >= last) 690 d1 = s1 - last; 691 else if (s1 + back_max >= last) 692 d1 = (last - s1) * cfqd->cfq_back_penalty; 693 else 694 wrap |= CFQ_RQ1_WRAP; 695 696 if (s2 >= last) 697 d2 = s2 - last; 698 else if (s2 + back_max >= last) 699 d2 = (last - s2) * cfqd->cfq_back_penalty; 700 else 701 wrap |= CFQ_RQ2_WRAP; 702 703 /* Found required data */ 704 705 /* 706 * By doing switch() on the bit mask "wrap" we avoid having to 707 * check two variables for all permutations: --> faster! 708 */ 709 switch (wrap) { 710 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 711 if (d1 < d2) 712 return rq1; 713 else if (d2 < d1) 714 return rq2; 715 else { 716 if (s1 >= s2) 717 return rq1; 718 else 719 return rq2; 720 } 721 722 case CFQ_RQ2_WRAP: 723 return rq1; 724 case CFQ_RQ1_WRAP: 725 return rq2; 726 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 727 default: 728 /* 729 * Since both rqs are wrapped, 730 * start with the one that's further behind head 731 * (--> only *one* back seek required), 732 * since back seek takes more time than forward. 
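 *
 * E.g. with last == 100000 and back_max == 32768, requests at
 * s1 == 1000 and s2 == 5000 are both wrapped; the test below picks
 * s1, the one further behind the head.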
733 */ 734 if (s1 <= s2) 735 return rq1; 736 else 737 return rq2; 738 } 739} 740 741/* 742 * The below is leftmost cache rbtree addon 743 */ 744static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 745{ 746 /* Service tree is empty */ 747 if (!root->count) 748 return NULL; 749 750 if (!root->left) 751 root->left = rb_first(&root->rb); 752 753 if (root->left) 754 return rb_entry(root->left, struct cfq_queue, rb_node); 755 756 return NULL; 757} 758 759static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 760{ 761 if (!root->left) 762 root->left = rb_first(&root->rb); 763 764 if (root->left) 765 return rb_entry_cfqg(root->left); 766 767 return NULL; 768} 769 770static void rb_erase_init(struct rb_node *n, struct rb_root *root) 771{ 772 rb_erase(n, root); 773 RB_CLEAR_NODE(n); 774} 775 776static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 777{ 778 if (root->left == n) 779 root->left = NULL; 780 rb_erase_init(n, &root->rb); 781 --root->count; 782} 783 784/* 785 * would be nice to take fifo expire time into account as well 786 */ 787static struct request * 788cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 789 struct request *last) 790{ 791 struct rb_node *rbnext = rb_next(&last->rb_node); 792 struct rb_node *rbprev = rb_prev(&last->rb_node); 793 struct request *next = NULL, *prev = NULL; 794 795 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 796 797 if (rbprev) 798 prev = rb_entry_rq(rbprev); 799 800 if (rbnext) 801 next = rb_entry_rq(rbnext); 802 else { 803 rbnext = rb_first(&cfqq->sort_list); 804 if (rbnext && rbnext != &last->rb_node) 805 next = rb_entry_rq(rbnext); 806 } 807 808 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 809} 810 811static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 812 struct cfq_queue *cfqq) 813{ 814 /* 815 * just an approximation, should be ok. 816 */ 817 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 818 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 819} 820 821static inline s64 822cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 823{ 824 return cfqg->vdisktime - st->min_vdisktime; 825} 826 827static void 828__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 829{ 830 struct rb_node **node = &st->rb.rb_node; 831 struct rb_node *parent = NULL; 832 struct cfq_group *__cfqg; 833 s64 key = cfqg_key(st, cfqg); 834 int left = 1; 835 836 while (*node != NULL) { 837 parent = *node; 838 __cfqg = rb_entry_cfqg(parent); 839 840 if (key < cfqg_key(st, __cfqg)) 841 node = &parent->rb_left; 842 else { 843 node = &parent->rb_right; 844 left = 0; 845 } 846 } 847 848 if (left) 849 st->left = &cfqg->rb_node; 850 851 rb_link_node(&cfqg->rb_node, parent, node); 852 rb_insert_color(&cfqg->rb_node, &st->rb); 853} 854 855static void 856cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 857{ 858 struct cfq_rb_root *st = &cfqd->grp_service_tree; 859 struct cfq_group *__cfqg; 860 struct rb_node *n; 861 862 cfqg->nr_cfqq++; 863 if (cfqg->on_st) 864 return; 865 866 /* 867 * Currently put the group at the end. Later implement something 868 * so that groups get lesser vtime based on their weights, so that 869 * if group does not loose all if it was not continously backlogged. 
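 *
 * Fairness between backlogged groups comes from cfq_group_served():
 * a group's vdisktime advances by the slice it used scaled by
 * BLKIO_WEIGHT_DEFAULT / cfqg->weight (see cfq_scale_slice()), so a
 * group with half the weight moves right twice as fast and ends up
 * with roughly half the service.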
870 */ 871 n = rb_last(&st->rb); 872 if (n) { 873 __cfqg = rb_entry_cfqg(n); 874 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 875 } else 876 cfqg->vdisktime = st->min_vdisktime; 877 878 __cfq_group_service_tree_add(st, cfqg); 879 cfqg->on_st = true; 880 st->total_weight += cfqg->weight; 881} 882 883static void 884cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 885{ 886 struct cfq_rb_root *st = &cfqd->grp_service_tree; 887 888 if (st->active == &cfqg->rb_node) 889 st->active = NULL; 890 891 BUG_ON(cfqg->nr_cfqq < 1); 892 cfqg->nr_cfqq--; 893 894 /* If there are other cfq queues under this group, don't delete it */ 895 if (cfqg->nr_cfqq) 896 return; 897 898 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 899 cfqg->on_st = false; 900 st->total_weight -= cfqg->weight; 901 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 902 cfq_rb_erase(&cfqg->rb_node, st); 903 cfqg->saved_workload_slice = 0; 904 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 905} 906 907static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 908{ 909 unsigned int slice_used; 910 911 /* 912 * Queue got expired before even a single request completed or 913 * got expired immediately after first request completion. 914 */ 915 if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 916 /* 917 * Also charge the seek time incurred to the group, otherwise 918 * if there are mutiple queues in the group, each can dispatch 919 * a single request on seeky media and cause lots of seek time 920 * and group will never know it. 921 */ 922 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 923 1); 924 } else { 925 slice_used = jiffies - cfqq->slice_start; 926 if (slice_used > cfqq->allocated_slice) 927 slice_used = cfqq->allocated_slice; 928 } 929 930 return slice_used; 931} 932 933static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 934 struct cfq_queue *cfqq) 935{ 936 struct cfq_rb_root *st = &cfqd->grp_service_tree; 937 unsigned int used_sl, charge; 938 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 939 - cfqg->service_tree_idle.count; 940 941 BUG_ON(nr_sync < 0); 942 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 943 944 if (iops_mode(cfqd)) 945 charge = cfqq->slice_dispatch; 946 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 947 charge = cfqq->allocated_slice; 948 949 /* Can't update vdisktime while group is on service tree */ 950 cfq_rb_erase(&cfqg->rb_node, st); 951 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 952 __cfq_group_service_tree_add(st, cfqg); 953 954 /* This group is being expired. 
Save the context */ 955 if (time_after(cfqd->workload_expires, jiffies)) { 956 cfqg->saved_workload_slice = cfqd->workload_expires 957 - jiffies; 958 cfqg->saved_workload = cfqd->serving_type; 959 cfqg->saved_serving_prio = cfqd->serving_prio; 960 } else 961 cfqg->saved_workload_slice = 0; 962 963 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 964 st->min_vdisktime); 965 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 966 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 967 iops_mode(cfqd), cfqq->nr_sectors); 968 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 969 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 970} 971 972#ifdef CONFIG_CFQ_GROUP_IOSCHED 973static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 974{ 975 if (blkg) 976 return container_of(blkg, struct cfq_group, blkg); 977 return NULL; 978} 979 980void 981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 982{ 983 cfqg_of_blkg(blkg)->weight = weight; 984} 985 986static struct cfq_group * 987cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 988{ 989 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 990 struct cfq_group *cfqg = NULL; 991 void *key = cfqd; 992 int i, j; 993 struct cfq_rb_root *st; 994 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 995 unsigned int major, minor; 996 997 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 998 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 999 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1000 cfqg->blkg.dev = MKDEV(major, minor); 1001 goto done; 1002 } 1003 if (cfqg || !create) 1004 goto done; 1005 1006 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 1007 if (!cfqg) 1008 goto done; 1009 1010 for_each_cfqg_st(cfqg, i, j, st) 1011 *st = CFQ_RB_ROOT; 1012 RB_CLEAR_NODE(&cfqg->rb_node); 1013 1014 /* 1015 * Take the initial reference that will be released on destroy 1016 * This can be thought of a joint reference by cgroup and 1017 * elevator which will be dropped by either elevator exit 1018 * or cgroup deletion path depending on who is exiting first. 1019 */ 1020 atomic_set(&cfqg->ref, 1); 1021 1022 /* Add group onto cgroup list */ 1023 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1024 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1025 MKDEV(major, minor)); 1026 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); 1027 1028 /* Add group on cfqd list */ 1029 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1030 1031done: 1032 return cfqg; 1033} 1034 1035/* 1036 * Search for the cfq group current task belongs to. If create = 1, then also 1037 * create the cfq group if it does not exist. request_queue lock must be held. 
 */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	struct cgroup *cgroup;
	struct cfq_group *cfqg = NULL;

	rcu_read_lock();
	cgroup = task_cgroup(current, blkio_subsys_id);
	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
	if (!cfqg && create)
		cfqg = &cfqd->root_group;
	rcu_read_unlock();
	return cfqg;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	atomic_inc(&cfqg->ref);
	return cfqg;
}

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
	/* Currently, all async queues are mapped to root group */
	if (!cfq_cfqq_sync(cfqq))
		cfqg = &cfqq->cfqd->root_group;

	cfqq->cfqg = cfqg;
	/* cfqq reference on cfqg */
	atomic_inc(&cfqq->cfqg->ref);
}

static void cfq_put_cfqg(struct cfq_group *cfqg)
{
	struct cfq_rb_root *st;
	int i, j;

	BUG_ON(atomic_read(&cfqg->ref) <= 0);
	if (!atomic_dec_and_test(&cfqg->ref))
		return;
	for_each_cfqg_st(cfqg, i, j, st)
		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
	kfree(cfqg);
}

static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));

	hlist_del_init(&cfqg->cfqd_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	cfq_put_cfqg(cfqg);
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
	struct hlist_node *pos, *n;
	struct cfq_group *cfqg;

	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * cfqg also.
		 */
		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
			cfq_destroy_cfqg(cfqd, cfqg);
	}
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * unlinked as the associated cgroup object is going away. That also means
 * that no new IO will come in this group. So get rid of this group as soon
 * as any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid cfq_data pointer as long as we hold
 * the rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if elevator was exiting, cgroup deletion
 * path got to it first.
1126 */ 1127void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1128{ 1129 unsigned long flags; 1130 struct cfq_data *cfqd = key; 1131 1132 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1133 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1134 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1135} 1136 1137#else /* GROUP_IOSCHED */ 1138static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1139{ 1140 return &cfqd->root_group; 1141} 1142 1143static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1144{ 1145 return cfqg; 1146} 1147 1148static inline void 1149cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1150 cfqq->cfqg = cfqg; 1151} 1152 1153static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} 1154static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} 1155 1156#endif /* GROUP_IOSCHED */ 1157 1158/* 1159 * The cfqd->service_trees holds all pending cfq_queue's that have 1160 * requests waiting to be processed. It is sorted in the order that 1161 * we will service the queues. 1162 */ 1163static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1164 bool add_front) 1165{ 1166 struct rb_node **p, *parent; 1167 struct cfq_queue *__cfqq; 1168 unsigned long rb_key; 1169 struct cfq_rb_root *service_tree; 1170 int left; 1171 int new_cfqq = 1; 1172 int group_changed = 0; 1173 1174#ifdef CONFIG_CFQ_GROUP_IOSCHED 1175 if (!cfqd->cfq_group_isolation 1176 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD 1177 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { 1178 /* Move this cfq to root group */ 1179 cfq_log_cfqq(cfqd, cfqq, "moving to root group"); 1180 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1181 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1182 cfqq->orig_cfqg = cfqq->cfqg; 1183 cfqq->cfqg = &cfqd->root_group; 1184 atomic_inc(&cfqd->root_group.ref); 1185 group_changed = 1; 1186 } else if (!cfqd->cfq_group_isolation 1187 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 1188 /* cfqq is sequential now needs to go to its original group */ 1189 BUG_ON(cfqq->cfqg != &cfqd->root_group); 1190 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1191 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1192 cfq_put_cfqg(cfqq->cfqg); 1193 cfqq->cfqg = cfqq->orig_cfqg; 1194 cfqq->orig_cfqg = NULL; 1195 group_changed = 1; 1196 cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); 1197 } 1198#endif 1199 1200 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1201 cfqq_type(cfqq)); 1202 if (cfq_class_idle(cfqq)) { 1203 rb_key = CFQ_IDLE_DELAY; 1204 parent = rb_last(&service_tree->rb); 1205 if (parent && parent != &cfqq->rb_node) { 1206 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1207 rb_key += __cfqq->rb_key; 1208 } else 1209 rb_key += jiffies; 1210 } else if (!add_front) { 1211 /* 1212 * Get our rb key offset. Subtract any residual slice 1213 * value carried from last service. A negative resid 1214 * count indicates slice overrun, and this should position 1215 * the next service time further away in the tree. 1216 */ 1217 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 1218 rb_key -= cfqq->slice_resid; 1219 cfqq->slice_resid = 0; 1220 } else { 1221 rb_key = -HZ; 1222 __cfqq = cfq_rb_first(service_tree); 1223 rb_key += __cfqq ? 
__cfqq->rb_key : jiffies; 1224 } 1225 1226 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1227 new_cfqq = 0; 1228 /* 1229 * same position, nothing more to do 1230 */ 1231 if (rb_key == cfqq->rb_key && 1232 cfqq->service_tree == service_tree) 1233 return; 1234 1235 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1236 cfqq->service_tree = NULL; 1237 } 1238 1239 left = 1; 1240 parent = NULL; 1241 cfqq->service_tree = service_tree; 1242 p = &service_tree->rb.rb_node; 1243 while (*p) { 1244 struct rb_node **n; 1245 1246 parent = *p; 1247 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1248 1249 /* 1250 * sort by key, that represents service time. 1251 */ 1252 if (time_before(rb_key, __cfqq->rb_key)) 1253 n = &(*p)->rb_left; 1254 else { 1255 n = &(*p)->rb_right; 1256 left = 0; 1257 } 1258 1259 p = n; 1260 } 1261 1262 if (left) 1263 service_tree->left = &cfqq->rb_node; 1264 1265 cfqq->rb_key = rb_key; 1266 rb_link_node(&cfqq->rb_node, parent, p); 1267 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1268 service_tree->count++; 1269 if ((add_front || !new_cfqq) && !group_changed) 1270 return; 1271 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1272} 1273 1274static struct cfq_queue * 1275cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 1276 sector_t sector, struct rb_node **ret_parent, 1277 struct rb_node ***rb_link) 1278{ 1279 struct rb_node **p, *parent; 1280 struct cfq_queue *cfqq = NULL; 1281 1282 parent = NULL; 1283 p = &root->rb_node; 1284 while (*p) { 1285 struct rb_node **n; 1286 1287 parent = *p; 1288 cfqq = rb_entry(parent, struct cfq_queue, p_node); 1289 1290 /* 1291 * Sort strictly based on sector. Smallest to the left, 1292 * largest to the right. 1293 */ 1294 if (sector > blk_rq_pos(cfqq->next_rq)) 1295 n = &(*p)->rb_right; 1296 else if (sector < blk_rq_pos(cfqq->next_rq)) 1297 n = &(*p)->rb_left; 1298 else 1299 break; 1300 p = n; 1301 cfqq = NULL; 1302 } 1303 1304 *ret_parent = parent; 1305 if (rb_link) 1306 *rb_link = p; 1307 return cfqq; 1308} 1309 1310static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1311{ 1312 struct rb_node **p, *parent; 1313 struct cfq_queue *__cfqq; 1314 1315 if (cfqq->p_root) { 1316 rb_erase(&cfqq->p_node, cfqq->p_root); 1317 cfqq->p_root = NULL; 1318 } 1319 1320 if (cfq_class_idle(cfqq)) 1321 return; 1322 if (!cfqq->next_rq) 1323 return; 1324 1325 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 1326 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 1327 blk_rq_pos(cfqq->next_rq), &parent, &p); 1328 if (!__cfqq) { 1329 rb_link_node(&cfqq->p_node, parent, p); 1330 rb_insert_color(&cfqq->p_node, cfqq->p_root); 1331 } else 1332 cfqq->p_root = NULL; 1333} 1334 1335/* 1336 * Update cfqq's position in the service tree. 1337 */ 1338static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1339{ 1340 /* 1341 * Resorting requires the cfqq to be on the RR list already. 
1342 */ 1343 if (cfq_cfqq_on_rr(cfqq)) { 1344 cfq_service_tree_add(cfqd, cfqq, 0); 1345 cfq_prio_tree_add(cfqd, cfqq); 1346 } 1347} 1348 1349/* 1350 * add to busy list of queues for service, trying to be fair in ordering 1351 * the pending list according to last request service 1352 */ 1353static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1354{ 1355 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 1356 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1357 cfq_mark_cfqq_on_rr(cfqq); 1358 cfqd->busy_queues++; 1359 1360 cfq_resort_rr_list(cfqd, cfqq); 1361} 1362 1363/* 1364 * Called when the cfqq no longer has requests pending, remove it from 1365 * the service tree. 1366 */ 1367static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1368{ 1369 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 1370 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1371 cfq_clear_cfqq_on_rr(cfqq); 1372 1373 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1374 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1375 cfqq->service_tree = NULL; 1376 } 1377 if (cfqq->p_root) { 1378 rb_erase(&cfqq->p_node, cfqq->p_root); 1379 cfqq->p_root = NULL; 1380 } 1381 1382 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1383 BUG_ON(!cfqd->busy_queues); 1384 cfqd->busy_queues--; 1385} 1386 1387/* 1388 * rb tree support functions 1389 */ 1390static void cfq_del_rq_rb(struct request *rq) 1391{ 1392 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1393 const int sync = rq_is_sync(rq); 1394 1395 BUG_ON(!cfqq->queued[sync]); 1396 cfqq->queued[sync]--; 1397 1398 elv_rb_del(&cfqq->sort_list, rq); 1399 1400 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 1401 /* 1402 * Queue will be deleted from service tree when we actually 1403 * expire it later. Right now just remove it from prio tree 1404 * as it is empty. 1405 */ 1406 if (cfqq->p_root) { 1407 rb_erase(&cfqq->p_node, cfqq->p_root); 1408 cfqq->p_root = NULL; 1409 } 1410 } 1411} 1412 1413static void cfq_add_rq_rb(struct request *rq) 1414{ 1415 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1416 struct cfq_data *cfqd = cfqq->cfqd; 1417 struct request *__alias, *prev; 1418 1419 cfqq->queued[rq_is_sync(rq)]++; 1420 1421 /* 1422 * looks a little odd, but the first insert might return an alias. 
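 *
 * (an "alias" is an already-queued request at exactly the same
 * sector: elv_rb_add() keys the tree by blk_rq_pos() and cannot hold
 * two requests with the same position)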
1423 * if that happens, put the alias on the dispatch list 1424 */ 1425 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 1426 cfq_dispatch_insert(cfqd->queue, __alias); 1427 1428 if (!cfq_cfqq_on_rr(cfqq)) 1429 cfq_add_cfqq_rr(cfqd, cfqq); 1430 1431 /* 1432 * check if this request is a better next-serve candidate 1433 */ 1434 prev = cfqq->next_rq; 1435 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 1436 1437 /* 1438 * adjust priority tree position, if ->next_rq changes 1439 */ 1440 if (prev != cfqq->next_rq) 1441 cfq_prio_tree_add(cfqd, cfqq); 1442 1443 BUG_ON(!cfqq->next_rq); 1444} 1445 1446static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 1447{ 1448 elv_rb_del(&cfqq->sort_list, rq); 1449 cfqq->queued[rq_is_sync(rq)]--; 1450 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1451 rq_data_dir(rq), rq_is_sync(rq)); 1452 cfq_add_rq_rb(rq); 1453 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 1454 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), 1455 rq_is_sync(rq)); 1456} 1457 1458static struct request * 1459cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1460{ 1461 struct task_struct *tsk = current; 1462 struct cfq_io_context *cic; 1463 struct cfq_queue *cfqq; 1464 1465 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1466 if (!cic) 1467 return NULL; 1468 1469 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1470 if (cfqq) { 1471 sector_t sector = bio->bi_sector + bio_sectors(bio); 1472 1473 return elv_rb_find(&cfqq->sort_list, sector); 1474 } 1475 1476 return NULL; 1477} 1478 1479static void cfq_activate_request(struct request_queue *q, struct request *rq) 1480{ 1481 struct cfq_data *cfqd = q->elevator->elevator_data; 1482 1483 cfqd->rq_in_driver++; 1484 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 1485 cfqd->rq_in_driver); 1486 1487 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 1488} 1489 1490static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 1491{ 1492 struct cfq_data *cfqd = q->elevator->elevator_data; 1493 1494 WARN_ON(!cfqd->rq_in_driver); 1495 cfqd->rq_in_driver--; 1496 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 1497 cfqd->rq_in_driver); 1498} 1499 1500static void cfq_remove_request(struct request *rq) 1501{ 1502 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1503 1504 if (cfqq->next_rq == rq) 1505 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 1506 1507 list_del_init(&rq->queuelist); 1508 cfq_del_rq_rb(rq); 1509 1510 cfqq->cfqd->rq_queued--; 1511 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1512 rq_data_dir(rq), rq_is_sync(rq)); 1513 if (rq->cmd_flags & REQ_META) { 1514 WARN_ON(!cfqq->meta_pending); 1515 cfqq->meta_pending--; 1516 } 1517} 1518 1519static int cfq_merge(struct request_queue *q, struct request **req, 1520 struct bio *bio) 1521{ 1522 struct cfq_data *cfqd = q->elevator->elevator_data; 1523 struct request *__rq; 1524 1525 __rq = cfq_find_rq_fmerge(cfqd, bio); 1526 if (__rq && elv_rq_merge_ok(__rq, bio)) { 1527 *req = __rq; 1528 return ELEVATOR_FRONT_MERGE; 1529 } 1530 1531 return ELEVATOR_NO_MERGE; 1532} 1533 1534static void cfq_merged_request(struct request_queue *q, struct request *req, 1535 int type) 1536{ 1537 if (type == ELEVATOR_FRONT_MERGE) { 1538 struct cfq_queue *cfqq = RQ_CFQQ(req); 1539 1540 cfq_reposition_rq_rb(cfqq, req); 1541 } 1542} 1543 1544static void cfq_bio_merged(struct request_queue *q, struct request *req, 1545 struct bio *bio) 1546{ 1547 
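	/*
	 * A bio was merged into an existing request of this cfqq; nothing
	 * needs repositioning here, only the blkio cgroup merge statistics
	 * are updated.
	 */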
cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, 1548 bio_data_dir(bio), cfq_bio_sync(bio)); 1549} 1550 1551static void 1552cfq_merged_requests(struct request_queue *q, struct request *rq, 1553 struct request *next) 1554{ 1555 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1556 /* 1557 * reposition in fifo if next is older than rq 1558 */ 1559 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1560 time_before(rq_fifo_time(next), rq_fifo_time(rq))) { 1561 list_move(&rq->queuelist, &next->queuelist); 1562 rq_set_fifo_time(rq, rq_fifo_time(next)); 1563 } 1564 1565 if (cfqq->next_rq == next) 1566 cfqq->next_rq = rq; 1567 cfq_remove_request(next); 1568 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, 1569 rq_data_dir(next), rq_is_sync(next)); 1570} 1571 1572static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1573 struct bio *bio) 1574{ 1575 struct cfq_data *cfqd = q->elevator->elevator_data; 1576 struct cfq_io_context *cic; 1577 struct cfq_queue *cfqq; 1578 1579 /* 1580 * Disallow merge of a sync bio into an async request. 1581 */ 1582 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 1583 return false; 1584 1585 /* 1586 * Lookup the cfqq that this bio will be queued with. Allow 1587 * merge only if rq is queued there. 1588 */ 1589 cic = cfq_cic_lookup(cfqd, current->io_context); 1590 if (!cic) 1591 return false; 1592 1593 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1594 return cfqq == RQ_CFQQ(rq); 1595} 1596 1597static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1598{ 1599 del_timer(&cfqd->idle_slice_timer); 1600 cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); 1601} 1602 1603static void __cfq_set_active_queue(struct cfq_data *cfqd, 1604 struct cfq_queue *cfqq) 1605{ 1606 if (cfqq) { 1607 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1608 cfqd->serving_prio, cfqd->serving_type); 1609 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); 1610 cfqq->slice_start = 0; 1611 cfqq->dispatch_start = jiffies; 1612 cfqq->allocated_slice = 0; 1613 cfqq->slice_end = 0; 1614 cfqq->slice_dispatch = 0; 1615 cfqq->nr_sectors = 0; 1616 1617 cfq_clear_cfqq_wait_request(cfqq); 1618 cfq_clear_cfqq_must_dispatch(cfqq); 1619 cfq_clear_cfqq_must_alloc_slice(cfqq); 1620 cfq_clear_cfqq_fifo_expire(cfqq); 1621 cfq_mark_cfqq_slice_new(cfqq); 1622 1623 cfq_del_timer(cfqd, cfqq); 1624 } 1625 1626 cfqd->active_queue = cfqq; 1627} 1628 1629/* 1630 * current cfqq expired its slice (or was too idle), select new one 1631 */ 1632static void 1633__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1634 bool timed_out) 1635{ 1636 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1637 1638 if (cfq_cfqq_wait_request(cfqq)) 1639 cfq_del_timer(cfqd, cfqq); 1640 1641 cfq_clear_cfqq_wait_request(cfqq); 1642 cfq_clear_cfqq_wait_busy(cfqq); 1643 1644 /* 1645 * If this cfqq is shared between multiple processes, check to 1646 * make sure that those processes are still issuing I/Os within 1647 * the mean seek distance. If not, it may be time to break the 1648 * queues apart again. 
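 *
 * (setting split_coop only marks the queue; the actual split happens
 * later, when the shared queue is set up for its next request)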
1649 */ 1650 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) 1651 cfq_mark_cfqq_split_coop(cfqq); 1652 1653 /* 1654 * store what was left of this slice, if the queue idled/timed out 1655 */ 1656 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1657 cfqq->slice_resid = cfqq->slice_end - jiffies; 1658 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1659 } 1660 1661 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 1662 1663 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1664 cfq_del_cfqq_rr(cfqd, cfqq); 1665 1666 cfq_resort_rr_list(cfqd, cfqq); 1667 1668 if (cfqq == cfqd->active_queue) 1669 cfqd->active_queue = NULL; 1670 1671 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) 1672 cfqd->grp_service_tree.active = NULL; 1673 1674 if (cfqd->active_cic) { 1675 put_io_context(cfqd->active_cic->ioc); 1676 cfqd->active_cic = NULL; 1677 } 1678} 1679 1680static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 1681{ 1682 struct cfq_queue *cfqq = cfqd->active_queue; 1683 1684 if (cfqq) 1685 __cfq_slice_expired(cfqd, cfqq, timed_out); 1686} 1687 1688/* 1689 * Get next queue for service. Unless we have a queue preemption, 1690 * we'll simply select the first cfqq in the service tree. 1691 */ 1692static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1693{ 1694 struct cfq_rb_root *service_tree = 1695 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 1696 cfqd->serving_type); 1697 1698 if (!cfqd->rq_queued) 1699 return NULL; 1700 1701 /* There is nothing to dispatch */ 1702 if (!service_tree) 1703 return NULL; 1704 if (RB_EMPTY_ROOT(&service_tree->rb)) 1705 return NULL; 1706 return cfq_rb_first(service_tree); 1707} 1708 1709static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 1710{ 1711 struct cfq_group *cfqg; 1712 struct cfq_queue *cfqq; 1713 int i, j; 1714 struct cfq_rb_root *st; 1715 1716 if (!cfqd->rq_queued) 1717 return NULL; 1718 1719 cfqg = cfq_get_next_cfqg(cfqd); 1720 if (!cfqg) 1721 return NULL; 1722 1723 for_each_cfqg_st(cfqg, i, j, st) 1724 if ((cfqq = cfq_rb_first(st)) != NULL) 1725 return cfqq; 1726 return NULL; 1727} 1728 1729/* 1730 * Get and set a new active queue for service. 1731 */ 1732static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1733 struct cfq_queue *cfqq) 1734{ 1735 if (!cfqq) 1736 cfqq = cfq_get_next_queue(cfqd); 1737 1738 __cfq_set_active_queue(cfqd, cfqq); 1739 return cfqq; 1740} 1741 1742static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 1743 struct request *rq) 1744{ 1745 if (blk_rq_pos(rq) >= cfqd->last_position) 1746 return blk_rq_pos(rq) - cfqd->last_position; 1747 else 1748 return cfqd->last_position - blk_rq_pos(rq); 1749} 1750 1751static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1752 struct request *rq) 1753{ 1754 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; 1755} 1756 1757static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 1758 struct cfq_queue *cur_cfqq) 1759{ 1760 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 1761 struct rb_node *parent, *node; 1762 struct cfq_queue *__cfqq; 1763 sector_t sector = cfqd->last_position; 1764 1765 if (RB_EMPTY_ROOT(root)) 1766 return NULL; 1767 1768 /* 1769 * First, if we find a request starting at the end of the last 1770 * request, choose it. 
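 *
 * "Close" here means within CFQQ_CLOSE_THR of cfqd->last_position,
 * i.e. 8 * 1024 sectors (4MiB with 512-byte sectors) in either
 * direction.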
1771 */ 1772 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 1773 if (__cfqq) 1774 return __cfqq; 1775 1776 /* 1777 * If the exact sector wasn't found, the parent of the NULL leaf 1778 * will contain the closest sector. 1779 */ 1780 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1781 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1782 return __cfqq; 1783 1784 if (blk_rq_pos(__cfqq->next_rq) < sector) 1785 node = rb_next(&__cfqq->p_node); 1786 else 1787 node = rb_prev(&__cfqq->p_node); 1788 if (!node) 1789 return NULL; 1790 1791 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1792 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1793 return __cfqq; 1794 1795 return NULL; 1796} 1797 1798/* 1799 * cfqd - obvious 1800 * cur_cfqq - passed in so that we don't decide that the current queue is 1801 * closely cooperating with itself. 1802 * 1803 * So, basically we're assuming that that cur_cfqq has dispatched at least 1804 * one request, and that cfqd->last_position reflects a position on the disk 1805 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 1806 * assumption. 1807 */ 1808static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1809 struct cfq_queue *cur_cfqq) 1810{ 1811 struct cfq_queue *cfqq; 1812 1813 if (cfq_class_idle(cur_cfqq)) 1814 return NULL; 1815 if (!cfq_cfqq_sync(cur_cfqq)) 1816 return NULL; 1817 if (CFQQ_SEEKY(cur_cfqq)) 1818 return NULL; 1819 1820 /* 1821 * Don't search priority tree if it's the only queue in the group. 1822 */ 1823 if (cur_cfqq->cfqg->nr_cfqq == 1) 1824 return NULL; 1825 1826 /* 1827 * We should notice if some of the queues are cooperating, eg 1828 * working closely on the same area of the disk. In that case, 1829 * we can group them together and don't waste time idling. 1830 */ 1831 cfqq = cfqq_close(cfqd, cur_cfqq); 1832 if (!cfqq) 1833 return NULL; 1834 1835 /* If new queue belongs to different cfq_group, don't choose it */ 1836 if (cur_cfqq->cfqg != cfqq->cfqg) 1837 return NULL; 1838 1839 /* 1840 * It only makes sense to merge sync queues. 1841 */ 1842 if (!cfq_cfqq_sync(cfqq)) 1843 return NULL; 1844 if (CFQQ_SEEKY(cfqq)) 1845 return NULL; 1846 1847 /* 1848 * Do not merge queues of different priority classes 1849 */ 1850 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 1851 return NULL; 1852 1853 return cfqq; 1854} 1855 1856/* 1857 * Determine whether we should enforce idle window for this queue. 1858 */ 1859 1860static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1861{ 1862 enum wl_prio_t prio = cfqq_prio(cfqq); 1863 struct cfq_rb_root *service_tree = cfqq->service_tree; 1864 1865 BUG_ON(!service_tree); 1866 BUG_ON(!service_tree->count); 1867 1868 if (!cfqd->cfq_slice_idle) 1869 return false; 1870 1871 /* We never do for idle class queues. */ 1872 if (prio == IDLE_WORKLOAD) 1873 return false; 1874 1875 /* We do for queues that were marked with idle window flag. */ 1876 if (cfq_cfqq_idle_window(cfqq) && 1877 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 1878 return true; 1879 1880 /* 1881 * Otherwise, we do only if they are the last ones 1882 * in their service tree. 1883 */ 1884 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1885 return 1; 1886 cfq_log_cfqq(cfqd, cfqq, "Not idling. 
st->count:%d", 1887 service_tree->count); 1888 return 0; 1889} 1890 1891static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1892{ 1893 struct cfq_queue *cfqq = cfqd->active_queue; 1894 struct cfq_io_context *cic; 1895 unsigned long sl, group_idle = 0; 1896 1897 /* 1898 * SSD device without seek penalty, disable idling. But only do so 1899 * for devices that support queuing, otherwise we still have a problem 1900 * with sync vs async workloads. 1901 */ 1902 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1903 return; 1904 1905 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1906 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1907 1908 /* 1909 * idle is disabled, either manually or by past process history 1910 */ 1911 if (!cfq_should_idle(cfqd, cfqq)) { 1912 /* no queue idling. Check for group idling */ 1913 if (cfqd->cfq_group_idle) 1914 group_idle = cfqd->cfq_group_idle; 1915 else 1916 return; 1917 } 1918 1919 /* 1920 * still active requests from this queue, don't idle 1921 */ 1922 if (cfqq->dispatched) 1923 return; 1924 1925 /* 1926 * task has exited, don't wait 1927 */ 1928 cic = cfqd->active_cic; 1929 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1930 return; 1931 1932 /* 1933 * If our average think time is larger than the remaining time 1934 * slice, then don't idle. This avoids overrunning the allotted 1935 * time slice. 1936 */ 1937 if (sample_valid(cic->ttime_samples) && 1938 (cfqq->slice_end - jiffies < cic->ttime_mean)) { 1939 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", 1940 cic->ttime_mean); 1941 return; 1942 } 1943 1944 /* There are other queues in the group, don't do group idle */ 1945 if (group_idle && cfqq->cfqg->nr_cfqq > 1) 1946 return; 1947 1948 cfq_mark_cfqq_wait_request(cfqq); 1949 1950 if (group_idle) 1951 sl = cfqd->cfq_group_idle; 1952 else 1953 sl = cfqd->cfq_slice_idle; 1954 1955 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1956 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); 1957 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 1958 group_idle ? 1 : 0); 1959} 1960 1961/* 1962 * Move request from internal lists to the request queue dispatch list. 
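 *
 * From this point the request belongs to the block layer dispatch
 * queue; CFQ only tracks it through cfqq->dispatched, rq_in_flight[]
 * and the sector/dispatch statistics updated below.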
1963 */ 1964static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1965{ 1966 struct cfq_data *cfqd = q->elevator->elevator_data; 1967 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1968 1969 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1970 1971 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1972 cfq_remove_request(rq); 1973 cfqq->dispatched++; 1974 (RQ_CFQG(rq))->dispatched++; 1975 elv_dispatch_sort(q, rq); 1976 1977 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1978 cfqq->nr_sectors += blk_rq_sectors(rq); 1979 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 1980 rq_data_dir(rq), rq_is_sync(rq)); 1981} 1982 1983/* 1984 * return expired entry, or NULL to just start from scratch in rbtree 1985 */ 1986static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1987{ 1988 struct request *rq = NULL; 1989 1990 if (cfq_cfqq_fifo_expire(cfqq)) 1991 return NULL; 1992 1993 cfq_mark_cfqq_fifo_expire(cfqq); 1994 1995 if (list_empty(&cfqq->fifo)) 1996 return NULL; 1997 1998 rq = rq_entry_fifo(cfqq->fifo.next); 1999 if (time_before(jiffies, rq_fifo_time(rq))) 2000 rq = NULL; 2001 2002 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 2003 return rq; 2004} 2005 2006static inline int 2007cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2008{ 2009 const int base_rq = cfqd->cfq_slice_async_rq; 2010 2011 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2012 2013 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2014} 2015 2016/* 2017 * Must be called with the queue_lock held. 2018 */ 2019static int cfqq_process_refs(struct cfq_queue *cfqq) 2020{ 2021 int process_refs, io_refs; 2022 2023 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2024 process_refs = atomic_read(&cfqq->ref) - io_refs; 2025 BUG_ON(process_refs < 0); 2026 return process_refs; 2027} 2028 2029static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 2030{ 2031 int process_refs, new_process_refs; 2032 struct cfq_queue *__cfqq; 2033 2034 /* 2035 * If there are no process references on the new_cfqq, then it is 2036 * unsafe to follow the ->new_cfqq chain as other cfqq's in the 2037 * chain may have dropped their last reference (not just their 2038 * last process reference). 2039 */ 2040 if (!cfqq_process_refs(new_cfqq)) 2041 return; 2042 2043 /* Avoid a circular list and skip interim queue merges */ 2044 while ((__cfqq = new_cfqq->new_cfqq)) { 2045 if (__cfqq == cfqq) 2046 return; 2047 new_cfqq = __cfqq; 2048 } 2049 2050 process_refs = cfqq_process_refs(cfqq); 2051 new_process_refs = cfqq_process_refs(new_cfqq); 2052 /* 2053 * If the process for the cfqq has gone away, there is no 2054 * sense in merging the queues. 2055 */ 2056 if (process_refs == 0 || new_process_refs == 0) 2057 return; 2058 2059 /* 2060 * Merge in the direction of the lesser amount of work. 
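 *
 * E.g. if cfqq has one process reference and new_cfqq has three, the
 * branch below sets cfqq->new_cfqq = new_cfqq and adds cfqq's single
 * process reference to new_cfqq, so the queue backed by fewer
 * processes is the one that gets folded in.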
2061 */ 2062 if (new_process_refs >= process_refs) { 2063 cfqq->new_cfqq = new_cfqq; 2064 atomic_add(process_refs, &new_cfqq->ref); 2065 } else { 2066 new_cfqq->new_cfqq = cfqq; 2067 atomic_add(new_process_refs, &cfqq->ref); 2068 } 2069} 2070 2071static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 2072 struct cfq_group *cfqg, enum wl_prio_t prio) 2073{ 2074 struct cfq_queue *queue; 2075 int i; 2076 bool key_valid = false; 2077 unsigned long lowest_key = 0; 2078 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 2079 2080 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 2081 /* select the one with lowest rb_key */ 2082 queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); 2083 if (queue && 2084 (!key_valid || time_before(queue->rb_key, lowest_key))) { 2085 lowest_key = queue->rb_key; 2086 cur_best = i; 2087 key_valid = true; 2088 } 2089 } 2090 2091 return cur_best; 2092} 2093 2094static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2095{ 2096 unsigned slice; 2097 unsigned count; 2098 struct cfq_rb_root *st; 2099 unsigned group_slice; 2100 2101 if (!cfqg) { 2102 cfqd->serving_prio = IDLE_WORKLOAD; 2103 cfqd->workload_expires = jiffies + 1; 2104 return; 2105 } 2106 2107 /* Choose next priority. RT > BE > IDLE */ 2108 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2109 cfqd->serving_prio = RT_WORKLOAD; 2110 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2111 cfqd->serving_prio = BE_WORKLOAD; 2112 else { 2113 cfqd->serving_prio = IDLE_WORKLOAD; 2114 cfqd->workload_expires = jiffies + 1; 2115 return; 2116 } 2117 2118 /* 2119 * For RT and BE, we have to choose also the type 2120 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2121 * expiration time 2122 */ 2123 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2124 count = st->count; 2125 2126 /* 2127 * check workload expiration, and that we still have other queues ready 2128 */ 2129 if (count && !time_after(jiffies, cfqd->workload_expires)) 2130 return; 2131 2132 /* otherwise select new workload type */ 2133 cfqd->serving_type = 2134 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2135 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2136 count = st->count; 2137 2138 /* 2139 * the workload slice is computed as a fraction of target latency 2140 * proportional to the number of queues in that workload, over 2141 * all the queues in the same priority class 2142 */ 2143 group_slice = cfq_group_slice(cfqd, cfqg); 2144 2145 slice = group_slice * count / 2146 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2147 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2148 2149 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2150 unsigned int tmp; 2151 2152 /* 2153 * Async queues are currently system wide. Just taking 2154 * proportion of queues with-in same group will lead to higher 2155 * async ratio system wide as generally root group is going 2156 * to have higher weight. A more accurate thing would be to 2157 * calculate system wide asnc/sync ratio. 2158 */ 2159 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 2160 tmp = tmp/cfqd->busy_queues; 2161 slice = min_t(unsigned, slice, tmp); 2162 2163 /* async workload slice is scaled down according to 2164 * the sync/async slice ratio. 
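 * With the default cfq_slice_async of HZ/25 and cfq_slice_sync of HZ/10,
 * for example, this scales the async slice to 2/5 of the value computed
 * above.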
*/ 2165 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 2166 } else 2167 /* sync workload slice is at least 2 * cfq_slice_idle */ 2168 slice = max(slice, 2 * cfqd->cfq_slice_idle); 2169 2170 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2171 cfq_log(cfqd, "workload slice:%d", slice); 2172 cfqd->workload_expires = jiffies + slice; 2173 cfqd->noidle_tree_requires_idle = false; 2174} 2175 2176static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2177{ 2178 struct cfq_rb_root *st = &cfqd->grp_service_tree; 2179 struct cfq_group *cfqg; 2180 2181 if (RB_EMPTY_ROOT(&st->rb)) 2182 return NULL; 2183 cfqg = cfq_rb_first_group(st); 2184 st->active = &cfqg->rb_node; 2185 update_min_vdisktime(st); 2186 return cfqg; 2187} 2188 2189static void cfq_choose_cfqg(struct cfq_data *cfqd) 2190{ 2191 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 2192 2193 cfqd->serving_group = cfqg; 2194 2195 /* Restore the workload type data */ 2196 if (cfqg->saved_workload_slice) { 2197 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 2198 cfqd->serving_type = cfqg->saved_workload; 2199 cfqd->serving_prio = cfqg->saved_serving_prio; 2200 } else 2201 cfqd->workload_expires = jiffies - 1; 2202 2203 choose_service_tree(cfqd, cfqg); 2204} 2205 2206/* 2207 * Select a queue for service. If we have a current active queue, 2208 * check whether to continue servicing it, or retrieve and set a new one. 2209 */ 2210static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 2211{ 2212 struct cfq_queue *cfqq, *new_cfqq = NULL; 2213 2214 cfqq = cfqd->active_queue; 2215 if (!cfqq) 2216 goto new_queue; 2217 2218 if (!cfqd->rq_queued) 2219 return NULL; 2220 2221 /* 2222 * We were waiting for group to get backlogged. Expire the queue 2223 */ 2224 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) 2225 goto expire; 2226 2227 /* 2228 * The active queue has run out of time, expire it and select new. 2229 */ 2230 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { 2231 /* 2232 * If slice had not expired at the completion of last request 2233 * we might not have turned on wait_busy flag. Don't expire 2234 * the queue yet. Allow the group to get backlogged. 2235 * 2236 * The very fact that we have used the slice, that means we 2237 * have been idling all along on this queue and it should be 2238 * ok to wait for this request to complete. 2239 */ 2240 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) 2241 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2242 cfqq = NULL; 2243 goto keep_queue; 2244 } else 2245 goto check_group_idle; 2246 } 2247 2248 /* 2249 * The active queue has requests and isn't expired, allow it to 2250 * dispatch. 2251 */ 2252 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 2253 goto keep_queue; 2254 2255 /* 2256 * If another queue has a request waiting within our mean seek 2257 * distance, let it run. The expire code will check for close 2258 * cooperators and put the close queue at the front of the service 2259 * tree. If possible, merge the expiring queue with the new cfqq. 2260 */ 2261 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 2262 if (new_cfqq) { 2263 if (!cfqq->new_cfqq) 2264 cfq_setup_merge(cfqq, new_cfqq); 2265 goto expire; 2266 } 2267 2268 /* 2269 * No requests pending. If the active queue still has requests in 2270 * flight or is idling for a new request, allow either of these 2271 * conditions to happen (or time out) before selecting a new queue. 
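 *
 * Concretely, both branches below return NULL via keep_queue: one while
 * the idle_slice_timer is still pending, the other while requests are
 * still dispatched and cfq_should_idle() says this queue is worth
 * waiting for.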
2272 */ 2273 if (timer_pending(&cfqd->idle_slice_timer)) { 2274 cfqq = NULL; 2275 goto keep_queue; 2276 } 2277 2278 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2279 cfqq = NULL; 2280 goto keep_queue; 2281 } 2282 2283 /* 2284 * If group idle is enabled and there are requests dispatched from 2285 * this group, wait for requests to complete. 2286 */ 2287check_group_idle: 2288 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 2289 && cfqq->cfqg->dispatched) { 2290 cfqq = NULL; 2291 goto keep_queue; 2292 } 2293 2294expire: 2295 cfq_slice_expired(cfqd, 0); 2296new_queue: 2297 /* 2298 * Current queue expired. Check if we have to switch to a new 2299 * service tree 2300 */ 2301 if (!new_cfqq) 2302 cfq_choose_cfqg(cfqd); 2303 2304 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2305keep_queue: 2306 return cfqq; 2307} 2308 2309static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 2310{ 2311 int dispatched = 0; 2312 2313 while (cfqq->next_rq) { 2314 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 2315 dispatched++; 2316 } 2317 2318 BUG_ON(!list_empty(&cfqq->fifo)); 2319 2320 /* By default cfqq is not expired if it is empty. Do it explicitly */ 2321 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2322 return dispatched; 2323} 2324 2325/* 2326 * Drain our current requests. Used for barriers and when switching 2327 * io schedulers on-the-fly. 2328 */ 2329static int cfq_forced_dispatch(struct cfq_data *cfqd) 2330{ 2331 struct cfq_queue *cfqq; 2332 int dispatched = 0; 2333 2334 /* Expire the timeslice of the current active queue first */ 2335 cfq_slice_expired(cfqd, 0); 2336 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { 2337 __cfq_set_active_queue(cfqd, cfqq); 2338 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2339 } 2340 2341 BUG_ON(cfqd->busy_queues); 2342 2343 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2344 return dispatched; 2345} 2346 2347static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, 2348 struct cfq_queue *cfqq) 2349{ 2350 /* the queue hasn't finished any request, can't estimate */ 2351 if (cfq_cfqq_slice_new(cfqq)) 2352 return 1; 2353 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2354 cfqq->slice_end)) 2355 return 1; 2356 2357 return 0; 2358} 2359 2360static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2361{ 2362 unsigned int max_dispatch; 2363 2364 /* 2365 * Drain async requests before we start sync IO 2366 */ 2367 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) 2368 return false; 2369 2370 /* 2371 * If this is an async queue and we have sync IO in flight, let it wait 2372 */ 2373 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) 2374 return false; 2375 2376 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); 2377 if (cfq_class_idle(cfqq)) 2378 max_dispatch = 1; 2379 2380 /* 2381 * Does this cfqq already have too much IO in flight? 2382 */ 2383 if (cfqq->dispatched >= max_dispatch) { 2384 /* 2385 * idle queue must always only have a single IO in flight 2386 */ 2387 if (cfq_class_idle(cfqq)) 2388 return false; 2389 2390 /* 2391 * We have other queues, don't allow more IO from this one 2392 */ 2393 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2394 return false; 2395 2396 /* 2397 * Sole queue user, no limit 2398 */ 2399 if (cfqd->busy_queues == 1) 2400 max_dispatch = -1; 2401 else 2402 /* 2403 * Normally we start throttling cfqq when cfq_quantum/2 2404 * requests have been dispatched. 
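 * (with the default cfq_quantum of 8, that is after four requests).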
But we can drive 2405 * deeper queue depths at the beginning of slice 2406 * subjected to upper limit of cfq_quantum. 2407 * */ 2408 max_dispatch = cfqd->cfq_quantum; 2409 } 2410 2411 /* 2412 * Async queues must wait a bit before being allowed dispatch. 2413 * We also ramp up the dispatch depth gradually for async IO, 2414 * based on the last sync IO we serviced 2415 */ 2416 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2417 unsigned long last_sync = jiffies - cfqd->last_delayed_sync; 2418 unsigned int depth; 2419 2420 depth = last_sync / cfqd->cfq_slice[1]; 2421 if (!depth && !cfqq->dispatched) 2422 depth = 1; 2423 if (depth < max_dispatch) 2424 max_dispatch = depth; 2425 } 2426 2427 /* 2428 * If we're below the current max, allow a dispatch 2429 */ 2430 return cfqq->dispatched < max_dispatch; 2431} 2432 2433/* 2434 * Dispatch a request from cfqq, moving them to the request queue 2435 * dispatch list. 2436 */ 2437static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2438{ 2439 struct request *rq; 2440 2441 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 2442 2443 if (!cfq_may_dispatch(cfqd, cfqq)) 2444 return false; 2445 2446 /* 2447 * follow expired path, else get first next available 2448 */ 2449 rq = cfq_check_fifo(cfqq); 2450 if (!rq) 2451 rq = cfqq->next_rq; 2452 2453 /* 2454 * insert request into driver dispatch list 2455 */ 2456 cfq_dispatch_insert(cfqd->queue, rq); 2457 2458 if (!cfqd->active_cic) { 2459 struct cfq_io_context *cic = RQ_CIC(rq); 2460 2461 atomic_long_inc(&cic->ioc->refcount); 2462 cfqd->active_cic = cic; 2463 } 2464 2465 return true; 2466} 2467 2468/* 2469 * Find the cfqq that we need to service and move a request from that to the 2470 * dispatch list 2471 */ 2472static int cfq_dispatch_requests(struct request_queue *q, int force) 2473{ 2474 struct cfq_data *cfqd = q->elevator->elevator_data; 2475 struct cfq_queue *cfqq; 2476 2477 if (!cfqd->busy_queues) 2478 return 0; 2479 2480 if (unlikely(force)) 2481 return cfq_forced_dispatch(cfqd); 2482 2483 cfqq = cfq_select_queue(cfqd); 2484 if (!cfqq) 2485 return 0; 2486 2487 /* 2488 * Dispatch a request from this cfqq, if it is allowed 2489 */ 2490 if (!cfq_dispatch_request(cfqd, cfqq)) 2491 return 0; 2492 2493 cfqq->slice_dispatch++; 2494 cfq_clear_cfqq_must_dispatch(cfqq); 2495 2496 /* 2497 * expire an async queue immediately if it has used up its slice. idle 2498 * queue always expire after 1 dispatch round. 2499 */ 2500 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 2501 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 2502 cfq_class_idle(cfqq))) { 2503 cfqq->slice_end = jiffies + 1; 2504 cfq_slice_expired(cfqd, 0); 2505 } 2506 2507 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 2508 return 1; 2509} 2510 2511/* 2512 * task holds one reference to the queue, dropped when task exits. each rq 2513 * in-flight on this queue also holds a reference, dropped when rq is freed. 2514 * 2515 * Each cfq queue took a reference on the parent group. Drop it now. 2516 * queue lock must be held here. 
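 *
 * Once the final reference goes away the cfqq is returned to cfq_pool and
 * the group reference(s) are dropped via cfq_put_cfqg().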
2517 */ 2518static void cfq_put_queue(struct cfq_queue *cfqq) 2519{ 2520 struct cfq_data *cfqd = cfqq->cfqd; 2521 struct cfq_group *cfqg, *orig_cfqg; 2522 2523 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2524 2525 if (!atomic_dec_and_test(&cfqq->ref)) 2526 return; 2527 2528 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2529 BUG_ON(rb_first(&cfqq->sort_list)); 2530 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2531 cfqg = cfqq->cfqg; 2532 orig_cfqg = cfqq->orig_cfqg; 2533 2534 if (unlikely(cfqd->active_queue == cfqq)) { 2535 __cfq_slice_expired(cfqd, cfqq, 0); 2536 cfq_schedule_dispatch(cfqd); 2537 } 2538 2539 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2540 kmem_cache_free(cfq_pool, cfqq); 2541 cfq_put_cfqg(cfqg); 2542 if (orig_cfqg) 2543 cfq_put_cfqg(orig_cfqg); 2544} 2545 2546/* 2547 * Must always be called with the rcu_read_lock() held 2548 */ 2549static void 2550__call_for_each_cic(struct io_context *ioc, 2551 void (*func)(struct io_context *, struct cfq_io_context *)) 2552{ 2553 struct cfq_io_context *cic; 2554 struct hlist_node *n; 2555 2556 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2557 func(ioc, cic); 2558} 2559 2560/* 2561 * Call func for each cic attached to this ioc. 2562 */ 2563static void 2564call_for_each_cic(struct io_context *ioc, 2565 void (*func)(struct io_context *, struct cfq_io_context *)) 2566{ 2567 rcu_read_lock(); 2568 __call_for_each_cic(ioc, func); 2569 rcu_read_unlock(); 2570} 2571 2572static void cfq_cic_free_rcu(struct rcu_head *head) 2573{ 2574 struct cfq_io_context *cic; 2575 2576 cic = container_of(head, struct cfq_io_context, rcu_head); 2577 2578 kmem_cache_free(cfq_ioc_pool, cic); 2579 elv_ioc_count_dec(cfq_ioc_count); 2580 2581 if (ioc_gone) { 2582 /* 2583 * CFQ scheduler is exiting, grab exit lock and check 2584 * the pending io context count. If it hits zero, 2585 * complete ioc_gone and set it back to NULL 2586 */ 2587 spin_lock(&ioc_gone_lock); 2588 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2589 complete(ioc_gone); 2590 ioc_gone = NULL; 2591 } 2592 spin_unlock(&ioc_gone_lock); 2593 } 2594} 2595 2596static void cfq_cic_free(struct cfq_io_context *cic) 2597{ 2598 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2599} 2600 2601static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2602{ 2603 unsigned long flags; 2604 unsigned long dead_key = (unsigned long) cic->key; 2605 2606 BUG_ON(!(dead_key & CIC_DEAD_KEY)); 2607 2608 spin_lock_irqsave(&ioc->lock, flags); 2609 radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); 2610 hlist_del_rcu(&cic->cic_list); 2611 spin_unlock_irqrestore(&ioc->lock, flags); 2612 2613 cfq_cic_free(cic); 2614} 2615 2616/* 2617 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 2618 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2619 * and ->trim() which is called with the task lock held 2620 */ 2621static void cfq_free_io_context(struct io_context *ioc) 2622{ 2623 /* 2624 * ioc->refcount is zero here, or we are called from elv_unregister(), 2625 * so no more cic's are allowed to be linked into this ioc. So it 2626 * should be ok to iterate over the known list, we will see all cic's 2627 * since no new ones are added. 
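 *
 * Each cic found here is unlinked from the ioc's radix tree and cic_list
 * and then freed through an RCU callback, see cic_free_func() and
 * cfq_cic_free_rcu() above.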
2628 */ 2629 __call_for_each_cic(ioc, cic_free_func); 2630} 2631 2632static void cfq_put_cooperator(struct cfq_queue *cfqq) 2633{ 2634 struct cfq_queue *__cfqq, *next; 2635 2636 /* 2637 * If this queue was scheduled to merge with another queue, be 2638 * sure to drop the reference taken on that queue (and others in 2639 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 2640 */ 2641 __cfqq = cfqq->new_cfqq; 2642 while (__cfqq) { 2643 if (__cfqq == cfqq) { 2644 WARN(1, "cfqq->new_cfqq loop detected\n"); 2645 break; 2646 } 2647 next = __cfqq->new_cfqq; 2648 cfq_put_queue(__cfqq); 2649 __cfqq = next; 2650 } 2651} 2652 2653static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2654{ 2655 if (unlikely(cfqq == cfqd->active_queue)) { 2656 __cfq_slice_expired(cfqd, cfqq, 0); 2657 cfq_schedule_dispatch(cfqd); 2658 } 2659 2660 cfq_put_cooperator(cfqq); 2661 2662 cfq_put_queue(cfqq); 2663} 2664 2665static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2666 struct cfq_io_context *cic) 2667{ 2668 struct io_context *ioc = cic->ioc; 2669 2670 list_del_init(&cic->queue_list); 2671 2672 /* 2673 * Make sure dead mark is seen for dead queues 2674 */ 2675 smp_wmb(); 2676 cic->key = cfqd_dead_key(cfqd); 2677 2678 if (ioc->ioc_data == cic) 2679 rcu_assign_pointer(ioc->ioc_data, NULL); 2680 2681 if (cic->cfqq[BLK_RW_ASYNC]) { 2682 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2683 cic->cfqq[BLK_RW_ASYNC] = NULL; 2684 } 2685 2686 if (cic->cfqq[BLK_RW_SYNC]) { 2687 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2688 cic->cfqq[BLK_RW_SYNC] = NULL; 2689 } 2690} 2691 2692static void cfq_exit_single_io_context(struct io_context *ioc, 2693 struct cfq_io_context *cic) 2694{ 2695 struct cfq_data *cfqd = cic_to_cfqd(cic); 2696 2697 if (cfqd) { 2698 struct request_queue *q = cfqd->queue; 2699 unsigned long flags; 2700 2701 spin_lock_irqsave(q->queue_lock, flags); 2702 2703 /* 2704 * Ensure we get a fresh copy of the ->key to prevent 2705 * race between exiting task and queue 2706 */ 2707 smp_read_barrier_depends(); 2708 if (cic->key == cfqd) 2709 __cfq_exit_single_io_context(cfqd, cic); 2710 2711 spin_unlock_irqrestore(q->queue_lock, flags); 2712 } 2713} 2714 2715/* 2716 * The process that ioc belongs to has exited, we need to clean up 2717 * and put the internal structures we have that belongs to that process. 
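 *
 * For each cic this expires the matching queue if it is currently active
 * and then drops the async and sync cfqqs, see
 * __cfq_exit_single_io_context() above.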
2718 */ 2719static void cfq_exit_io_context(struct io_context *ioc) 2720{ 2721 call_for_each_cic(ioc, cfq_exit_single_io_context); 2722} 2723 2724static struct cfq_io_context * 2725cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2726{ 2727 struct cfq_io_context *cic; 2728 2729 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2730 cfqd->queue->node); 2731 if (cic) { 2732 cic->last_end_request = jiffies; 2733 INIT_LIST_HEAD(&cic->queue_list); 2734 INIT_HLIST_NODE(&cic->cic_list); 2735 cic->dtor = cfq_free_io_context; 2736 cic->exit = cfq_exit_io_context; 2737 elv_ioc_count_inc(cfq_ioc_count); 2738 } 2739 2740 return cic; 2741} 2742 2743static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2744{ 2745 struct task_struct *tsk = current; 2746 int ioprio_class; 2747 2748 if (!cfq_cfqq_prio_changed(cfqq)) 2749 return; 2750 2751 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 2752 switch (ioprio_class) { 2753 default: 2754 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2755 case IOPRIO_CLASS_NONE: 2756 /* 2757 * no prio set, inherit CPU scheduling settings 2758 */ 2759 cfqq->ioprio = task_nice_ioprio(tsk); 2760 cfqq->ioprio_class = task_nice_ioclass(tsk); 2761 break; 2762 case IOPRIO_CLASS_RT: 2763 cfqq->ioprio = task_ioprio(ioc); 2764 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2765 break; 2766 case IOPRIO_CLASS_BE: 2767 cfqq->ioprio = task_ioprio(ioc); 2768 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2769 break; 2770 case IOPRIO_CLASS_IDLE: 2771 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 2772 cfqq->ioprio = 7; 2773 cfq_clear_cfqq_idle_window(cfqq); 2774 break; 2775 } 2776 2777 /* 2778 * keep track of original prio settings in case we have to temporarily 2779 * elevate the priority of this queue 2780 */ 2781 cfqq->org_ioprio = cfqq->ioprio; 2782 cfqq->org_ioprio_class = cfqq->ioprio_class; 2783 cfq_clear_cfqq_prio_changed(cfqq); 2784} 2785 2786static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2787{ 2788 struct cfq_data *cfqd = cic_to_cfqd(cic); 2789 struct cfq_queue *cfqq; 2790 unsigned long flags; 2791 2792 if (unlikely(!cfqd)) 2793 return; 2794 2795 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2796 2797 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2798 if (cfqq) { 2799 struct cfq_queue *new_cfqq; 2800 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2801 GFP_ATOMIC); 2802 if (new_cfqq) { 2803 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2804 cfq_put_queue(cfqq); 2805 } 2806 } 2807 2808 cfqq = cic->cfqq[BLK_RW_SYNC]; 2809 if (cfqq) 2810 cfq_mark_cfqq_prio_changed(cfqq); 2811 2812 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2813} 2814 2815static void cfq_ioc_set_ioprio(struct io_context *ioc) 2816{ 2817 call_for_each_cic(ioc, changed_ioprio); 2818 ioc->ioprio_changed = 0; 2819} 2820 2821static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2822 pid_t pid, bool is_sync) 2823{ 2824 RB_CLEAR_NODE(&cfqq->rb_node); 2825 RB_CLEAR_NODE(&cfqq->p_node); 2826 INIT_LIST_HEAD(&cfqq->fifo); 2827 2828 atomic_set(&cfqq->ref, 0); 2829 cfqq->cfqd = cfqd; 2830 2831 cfq_mark_cfqq_prio_changed(cfqq); 2832 2833 if (is_sync) { 2834 if (!cfq_class_idle(cfqq)) 2835 cfq_mark_cfqq_idle_window(cfqq); 2836 cfq_mark_cfqq_sync(cfqq); 2837 } 2838 cfqq->pid = pid; 2839} 2840 2841#ifdef CONFIG_CFQ_GROUP_IOSCHED 2842static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2843{ 2844 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2845 struct cfq_data *cfqd = cic_to_cfqd(cic); 2846 unsigned long flags; 2847 struct 
request_queue *q; 2848 2849 if (unlikely(!cfqd)) 2850 return; 2851 2852 q = cfqd->queue; 2853 2854 spin_lock_irqsave(q->queue_lock, flags); 2855 2856 if (sync_cfqq) { 2857 /* 2858 * Drop reference to sync queue. A new sync queue will be 2859 * assigned in new group upon arrival of a fresh request. 2860 */ 2861 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 2862 cic_set_cfqq(cic, NULL, 1); 2863 cfq_put_queue(sync_cfqq); 2864 } 2865 2866 spin_unlock_irqrestore(q->queue_lock, flags); 2867} 2868 2869static void cfq_ioc_set_cgroup(struct io_context *ioc) 2870{ 2871 call_for_each_cic(ioc, changed_cgroup); 2872 ioc->cgroup_changed = 0; 2873} 2874#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2875 2876static struct cfq_queue * 2877cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2878 struct io_context *ioc, gfp_t gfp_mask) 2879{ 2880 struct cfq_queue *cfqq, *new_cfqq = NULL; 2881 struct cfq_io_context *cic; 2882 struct cfq_group *cfqg; 2883 2884retry: 2885 cfqg = cfq_get_cfqg(cfqd, 1); 2886 cic = cfq_cic_lookup(cfqd, ioc); 2887 /* cic always exists here */ 2888 cfqq = cic_to_cfqq(cic, is_sync); 2889 2890 /* 2891 * Always try a new alloc if we fell back to the OOM cfqq 2892 * originally, since it should just be a temporary situation. 2893 */ 2894 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2895 cfqq = NULL; 2896 if (new_cfqq) { 2897 cfqq = new_cfqq; 2898 new_cfqq = NULL; 2899 } else if (gfp_mask & __GFP_WAIT) { 2900 spin_unlock_irq(cfqd->queue->queue_lock); 2901 new_cfqq = kmem_cache_alloc_node(cfq_pool, 2902 gfp_mask | __GFP_ZERO, 2903 cfqd->queue->node); 2904 spin_lock_irq(cfqd->queue->queue_lock); 2905 if (new_cfqq) 2906 goto retry; 2907 } else { 2908 cfqq = kmem_cache_alloc_node(cfq_pool, 2909 gfp_mask | __GFP_ZERO, 2910 cfqd->queue->node); 2911 } 2912 2913 if (cfqq) { 2914 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2915 cfq_init_prio_data(cfqq, ioc); 2916 cfq_link_cfqq_cfqg(cfqq, cfqg); 2917 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2918 } else 2919 cfqq = &cfqd->oom_cfqq; 2920 } 2921 2922 if (new_cfqq) 2923 kmem_cache_free(cfq_pool, new_cfqq); 2924 2925 return cfqq; 2926} 2927 2928static struct cfq_queue ** 2929cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 2930{ 2931 switch (ioprio_class) { 2932 case IOPRIO_CLASS_RT: 2933 return &cfqd->async_cfqq[0][ioprio]; 2934 case IOPRIO_CLASS_BE: 2935 return &cfqd->async_cfqq[1][ioprio]; 2936 case IOPRIO_CLASS_IDLE: 2937 return &cfqd->async_idle_cfqq; 2938 default: 2939 BUG(); 2940 } 2941} 2942 2943static struct cfq_queue * 2944cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 2945 gfp_t gfp_mask) 2946{ 2947 const int ioprio = task_ioprio(ioc); 2948 const int ioprio_class = task_ioprio_class(ioc); 2949 struct cfq_queue **async_cfqq = NULL; 2950 struct cfq_queue *cfqq = NULL; 2951 2952 if (!is_sync) { 2953 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 2954 cfqq = *async_cfqq; 2955 } 2956 2957 if (!cfqq) 2958 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 2959 2960 /* 2961 * pin the queue now that it's allocated, scheduler exit will prune it 2962 */ 2963 if (!is_sync && !(*async_cfqq)) { 2964 atomic_inc(&cfqq->ref); 2965 *async_cfqq = cfqq; 2966 } 2967 2968 atomic_inc(&cfqq->ref); 2969 return cfqq; 2970} 2971 2972/* 2973 * We drop cfq io contexts lazily, so we may find a dead one. 
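 *
 * "Dead" means the cic's key no longer matches a live cfqd;
 * cfq_cic_lookup() below prunes such entries via cfq_drop_dead_cic()
 * when it runs into them.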
2974 */ 2975static void 2976cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2977 struct cfq_io_context *cic) 2978{ 2979 unsigned long flags; 2980 2981 WARN_ON(!list_empty(&cic->queue_list)); 2982 BUG_ON(cic->key != cfqd_dead_key(cfqd)); 2983 2984 spin_lock_irqsave(&ioc->lock, flags); 2985 2986 BUG_ON(ioc->ioc_data == cic); 2987 2988 radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 2989 hlist_del_rcu(&cic->cic_list); 2990 spin_unlock_irqrestore(&ioc->lock, flags); 2991 2992 cfq_cic_free(cic); 2993} 2994 2995static struct cfq_io_context * 2996cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 2997{ 2998 struct cfq_io_context *cic; 2999 unsigned long flags; 3000 3001 if (unlikely(!ioc)) 3002 return NULL; 3003 3004 rcu_read_lock(); 3005 3006 /* 3007 * we maintain a last-hit cache, to avoid browsing over the tree 3008 */ 3009 cic = rcu_dereference(ioc->ioc_data); 3010 if (cic && cic->key == cfqd) { 3011 rcu_read_unlock(); 3012 return cic; 3013 } 3014 3015 do { 3016 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); 3017 rcu_read_unlock(); 3018 if (!cic) 3019 break; 3020 if (unlikely(cic->key != cfqd)) { 3021 cfq_drop_dead_cic(cfqd, ioc, cic); 3022 rcu_read_lock(); 3023 continue; 3024 } 3025 3026 spin_lock_irqsave(&ioc->lock, flags); 3027 rcu_assign_pointer(ioc->ioc_data, cic); 3028 spin_unlock_irqrestore(&ioc->lock, flags); 3029 break; 3030 } while (1); 3031 3032 return cic; 3033} 3034 3035/* 3036 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 3037 * the process specific cfq io context when entered from the block layer. 3038 * Also adds the cic to a per-cfqd list, used when this queue is removed. 3039 */ 3040static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 3041 struct cfq_io_context *cic, gfp_t gfp_mask) 3042{ 3043 unsigned long flags; 3044 int ret; 3045 3046 ret = radix_tree_preload(gfp_mask); 3047 if (!ret) { 3048 cic->ioc = ioc; 3049 cic->key = cfqd; 3050 3051 spin_lock_irqsave(&ioc->lock, flags); 3052 ret = radix_tree_insert(&ioc->radix_root, 3053 cfqd->cic_index, cic); 3054 if (!ret) 3055 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 3056 spin_unlock_irqrestore(&ioc->lock, flags); 3057 3058 radix_tree_preload_end(); 3059 3060 if (!ret) { 3061 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3062 list_add(&cic->queue_list, &cfqd->cic_list); 3063 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3064 } 3065 } 3066 3067 if (ret) 3068 printk(KERN_ERR "cfq: cic link failed!\n"); 3069 3070 return ret; 3071} 3072 3073/* 3074 * Setup general io context and cfq io context. There can be several cfq 3075 * io contexts per general io context, if this process is doing io to more 3076 * than one device managed by cfq. 
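 *
 * The lookup-or-create path below is get_io_context(), then
 * cfq_cic_lookup(), and only on a miss cfq_alloc_io_context() followed by
 * cfq_cic_link() to hook the new cic into the ioc and the cfqd.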
3077 */ 3078static struct cfq_io_context * 3079cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 3080{ 3081 struct io_context *ioc = NULL; 3082 struct cfq_io_context *cic; 3083 3084 might_sleep_if(gfp_mask & __GFP_WAIT); 3085 3086 ioc = get_io_context(gfp_mask, cfqd->queue->node); 3087 if (!ioc) 3088 return NULL; 3089 3090 cic = cfq_cic_lookup(cfqd, ioc); 3091 if (cic) 3092 goto out; 3093 3094 cic = cfq_alloc_io_context(cfqd, gfp_mask); 3095 if (cic == NULL) 3096 goto err; 3097 3098 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 3099 goto err_free; 3100 3101out: 3102 smp_read_barrier_depends(); 3103 if (unlikely(ioc->ioprio_changed)) 3104 cfq_ioc_set_ioprio(ioc); 3105 3106#ifdef CONFIG_CFQ_GROUP_IOSCHED 3107 if (unlikely(ioc->cgroup_changed)) 3108 cfq_ioc_set_cgroup(ioc); 3109#endif 3110 return cic; 3111err_free: 3112 cfq_cic_free(cic); 3113err: 3114 put_io_context(ioc); 3115 return NULL; 3116} 3117 3118static void 3119cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3120{ 3121 unsigned long elapsed = jiffies - cic->last_end_request; 3122 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3123 3124 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3125 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3126 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3127} 3128 3129static void 3130cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3131 struct request *rq) 3132{ 3133 sector_t sdist = 0; 3134 sector_t n_sec = blk_rq_sectors(rq); 3135 if (cfqq->last_request_pos) { 3136 if (cfqq->last_request_pos < blk_rq_pos(rq)) 3137 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 3138 else 3139 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 3140 } 3141 3142 cfqq->seek_history <<= 1; 3143 if (blk_queue_nonrot(cfqd->queue)) 3144 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); 3145 else 3146 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); 3147} 3148 3149/* 3150 * Disable idle window if the process thinks too long or seeks so much that 3151 * it doesn't matter 3152 */ 3153static void 3154cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3155 struct cfq_io_context *cic) 3156{ 3157 int old_idle, enable_idle; 3158 3159 /* 3160 * Don't idle for async or idle io prio class 3161 */ 3162 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3163 return; 3164 3165 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3166 3167 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3168 cfq_mark_cfqq_deep(cfqq); 3169 3170 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3171 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3172 enable_idle = 0; 3173 else if (sample_valid(cic->ttime_samples)) { 3174 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3175 enable_idle = 0; 3176 else 3177 enable_idle = 1; 3178 } 3179 3180 if (old_idle != enable_idle) { 3181 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3182 if (enable_idle) 3183 cfq_mark_cfqq_idle_window(cfqq); 3184 else 3185 cfq_clear_cfqq_idle_window(cfqq); 3186 } 3187} 3188 3189/* 3190 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3191 * no or if we aren't sure, a 1 will cause a preempt. 
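 *
 * The checks below never let an idle-class queue preempt, favour RT over
 * non-RT and sync over async, refuse preemption across cfq groups, and
 * otherwise fall back to slice usage, metadata requests and request
 * proximity (cfq_rq_close()).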
3192 */ 3193static bool 3194cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3195 struct request *rq) 3196{ 3197 struct cfq_queue *cfqq; 3198 3199 cfqq = cfqd->active_queue; 3200 if (!cfqq) 3201 return false; 3202 3203 if (cfq_class_idle(new_cfqq)) 3204 return false; 3205 3206 if (cfq_class_idle(cfqq)) 3207 return true; 3208 3209 /* 3210 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. 3211 */ 3212 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) 3213 return false; 3214 3215 /* 3216 * if the new request is sync, but the currently running queue is 3217 * not, let the sync request have priority. 3218 */ 3219 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3220 return true; 3221 3222 if (new_cfqq->cfqg != cfqq->cfqg) 3223 return false; 3224 3225 if (cfq_slice_used(cfqq)) 3226 return true; 3227 3228 /* Allow preemption only if we are idling on sync-noidle tree */ 3229 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3230 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3231 new_cfqq->service_tree->count == 2 && 3232 RB_EMPTY_ROOT(&cfqq->sort_list)) 3233 return true; 3234 3235 /* 3236 * So both queues are sync. Let the new request get disk time if 3237 * it's a metadata request and the current queue is doing regular IO. 3238 */ 3239 if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) 3240 return true; 3241 3242 /* 3243 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3244 */ 3245 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3246 return true; 3247 3248 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3249 return false; 3250 3251 /* 3252 * if this request is as-good as one we would expect from the 3253 * current cfqq, let it preempt 3254 */ 3255 if (cfq_rq_close(cfqd, cfqq, rq)) 3256 return true; 3257 3258 return false; 3259} 3260 3261/* 3262 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3263 * let it have half of its nominal slice. 3264 */ 3265static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3266{ 3267 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3268 cfq_slice_expired(cfqd, 1); 3269 3270 /* 3271 * Put the new queue at the front of the of the current list, 3272 * so we know that it will be selected next. 3273 */ 3274 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3275 3276 cfq_service_tree_add(cfqd, cfqq, 1); 3277 3278 cfqq->slice_end = 0; 3279 cfq_mark_cfqq_slice_new(cfqq); 3280} 3281 3282/* 3283 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3284 * something we should do about it 3285 */ 3286static void 3287cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3288 struct request *rq) 3289{ 3290 struct cfq_io_context *cic = RQ_CIC(rq); 3291 3292 cfqd->rq_queued++; 3293 if (rq->cmd_flags & REQ_META) 3294 cfqq->meta_pending++; 3295 3296 cfq_update_io_thinktime(cfqd, cic); 3297 cfq_update_io_seektime(cfqd, cfqq, rq); 3298 cfq_update_idle_window(cfqd, cfqq, cic); 3299 3300 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3301 3302 if (cfqq == cfqd->active_queue) { 3303 /* 3304 * Remember that we saw a request from this process, but 3305 * don't start queuing just yet. Otherwise we risk seeing lots 3306 * of tiny requests, because we disrupt the normal plugging 3307 * and merging. If the request is already larger than a single 3308 * page, let it rip immediately. For that case we assume that 3309 * merging is already done. 
Ditto for a busy system that 3310 * has other work pending, don't risk delaying until the 3311 * idle timer unplug to continue working. 3312 */ 3313 if (cfq_cfqq_wait_request(cfqq)) { 3314 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3315 cfqd->busy_queues > 1) { 3316 cfq_del_timer(cfqd, cfqq); 3317 cfq_clear_cfqq_wait_request(cfqq); 3318 __blk_run_queue(cfqd->queue); 3319 } else { 3320 cfq_blkiocg_update_idle_time_stats( 3321 &cfqq->cfqg->blkg); 3322 cfq_mark_cfqq_must_dispatch(cfqq); 3323 } 3324 } 3325 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3326 /* 3327 * not the active queue - expire current slice if it is 3328 * idle and has expired it's mean thinktime or this new queue 3329 * has some old slice time left and is of higher priority or 3330 * this new queue is RT and the current one is BE 3331 */ 3332 cfq_preempt_queue(cfqd, cfqq); 3333 __blk_run_queue(cfqd->queue); 3334 } 3335} 3336 3337static void cfq_insert_request(struct request_queue *q, struct request *rq) 3338{ 3339 struct cfq_data *cfqd = q->elevator->elevator_data; 3340 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3341 3342 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3343 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3344 3345 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3346 list_add_tail(&rq->queuelist, &cfqq->fifo); 3347 cfq_add_rq_rb(rq); 3348 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 3349 &cfqd->serving_group->blkg, rq_data_dir(rq), 3350 rq_is_sync(rq)); 3351 cfq_rq_enqueued(cfqd, cfqq, rq); 3352} 3353 3354/* 3355 * Update hw_tag based on peak queue depth over 50 samples under 3356 * sufficient load. 3357 */ 3358static void cfq_update_hw_tag(struct cfq_data *cfqd) 3359{ 3360 struct cfq_queue *cfqq = cfqd->active_queue; 3361 3362 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) 3363 cfqd->hw_tag_est_depth = cfqd->rq_in_driver; 3364 3365 if (cfqd->hw_tag == 1) 3366 return; 3367 3368 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3369 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 3370 return; 3371 3372 /* 3373 * If active queue hasn't enough requests and can idle, cfq might not 3374 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 3375 * case 3376 */ 3377 if (cfqq && cfq_cfqq_idle_window(cfqq) && 3378 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 3379 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) 3380 return; 3381 3382 if (cfqd->hw_tag_samples++ < 50) 3383 return; 3384 3385 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3386 cfqd->hw_tag = 1; 3387 else 3388 cfqd->hw_tag = 0; 3389} 3390 3391static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3392{ 3393 struct cfq_io_context *cic = cfqd->active_cic; 3394 3395 /* If there are other queues in the group, don't wait */ 3396 if (cfqq->cfqg->nr_cfqq > 1) 3397 return false; 3398 3399 if (cfq_slice_used(cfqq)) 3400 return true; 3401 3402 /* if slice left is less than think time, wait busy */ 3403 if (cic && sample_valid(cic->ttime_samples) 3404 && (cfqq->slice_end - jiffies < cic->ttime_mean)) 3405 return true; 3406 3407 /* 3408 * If think times is less than a jiffy than ttime_mean=0 and above 3409 * will not be true. It might happen that slice has not expired yet 3410 * but will expire soon (4-5 ns) during select_queue(). To cover the 3411 * case where think time is less than a jiffy, mark the queue wait 3412 * busy if only 1 jiffy is left in the slice. 
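 * (at HZ=250, for instance, that is a window of roughly 4ms).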
3413 */ 3414 if (cfqq->slice_end - jiffies == 1) 3415 return true; 3416 3417 return false; 3418} 3419 3420static void cfq_completed_request(struct request_queue *q, struct request *rq) 3421{ 3422 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3423 struct cfq_data *cfqd = cfqq->cfqd; 3424 const int sync = rq_is_sync(rq); 3425 unsigned long now; 3426 3427 now = jiffies; 3428 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", 3429 !!(rq->cmd_flags & REQ_NOIDLE)); 3430 3431 cfq_update_hw_tag(cfqd); 3432 3433 WARN_ON(!cfqd->rq_in_driver); 3434 WARN_ON(!cfqq->dispatched); 3435 cfqd->rq_in_driver--; 3436 cfqq->dispatched--; 3437 (RQ_CFQG(rq))->dispatched--; 3438 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3439 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3440 rq_data_dir(rq), rq_is_sync(rq)); 3441 3442 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3443 3444 if (sync) { 3445 RQ_CIC(rq)->last_end_request = now; 3446 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3447 cfqd->last_delayed_sync = now; 3448 } 3449 3450 /* 3451 * If this is the active queue, check if it needs to be expired, 3452 * or if we want to idle in case it has no pending requests. 3453 */ 3454 if (cfqd->active_queue == cfqq) { 3455 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3456 3457 if (cfq_cfqq_slice_new(cfqq)) { 3458 cfq_set_prio_slice(cfqd, cfqq); 3459 cfq_clear_cfqq_slice_new(cfqq); 3460 } 3461 3462 /* 3463 * Should we wait for next request to come in before we expire 3464 * the queue. 3465 */ 3466 if (cfq_should_wait_busy(cfqd, cfqq)) { 3467 unsigned long extend_sl = cfqd->cfq_slice_idle; 3468 if (!cfqd->cfq_slice_idle) 3469 extend_sl = cfqd->cfq_group_idle; 3470 cfqq->slice_end = jiffies + extend_sl; 3471 cfq_mark_cfqq_wait_busy(cfqq); 3472 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3473 } 3474 3475 /* 3476 * Idling is not enabled on: 3477 * - expired queues 3478 * - idle-priority queues 3479 * - async queues 3480 * - queues with still some requests queued 3481 * - when there is a close cooperator 3482 */ 3483 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3484 cfq_slice_expired(cfqd, 1); 3485 else if (sync && cfqq_empty && 3486 !cfq_close_cooperator(cfqd, cfqq)) { 3487 cfqd->noidle_tree_requires_idle |= 3488 !(rq->cmd_flags & REQ_NOIDLE); 3489 /* 3490 * Idling is enabled for SYNC_WORKLOAD. 3491 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3492 * only if we processed at least one !REQ_NOIDLE request 3493 */ 3494 if (cfqd->serving_type == SYNC_WORKLOAD 3495 || cfqd->noidle_tree_requires_idle 3496 || cfqq->cfqg->nr_cfqq == 1) 3497 cfq_arm_slice_timer(cfqd); 3498 } 3499 } 3500 3501 if (!cfqd->rq_in_driver) 3502 cfq_schedule_dispatch(cfqd); 3503} 3504 3505/* 3506 * we temporarily boost lower priority queues if they are holding fs exclusive 3507 * resources. 
they are boosted to normal prio (CLASS_BE/4) 3508 */ 3509static void cfq_prio_boost(struct cfq_queue *cfqq) 3510{ 3511 if (has_fs_excl()) { 3512 /* 3513 * boost idle prio on transactions that would lock out other 3514 * users of the filesystem 3515 */ 3516 if (cfq_class_idle(cfqq)) 3517 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3518 if (cfqq->ioprio > IOPRIO_NORM) 3519 cfqq->ioprio = IOPRIO_NORM; 3520 } else { 3521 /* 3522 * unboost the queue (if needed) 3523 */ 3524 cfqq->ioprio_class = cfqq->org_ioprio_class; 3525 cfqq->ioprio = cfqq->org_ioprio; 3526 } 3527} 3528 3529static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3530{ 3531 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3532 cfq_mark_cfqq_must_alloc_slice(cfqq); 3533 return ELV_MQUEUE_MUST; 3534 } 3535 3536 return ELV_MQUEUE_MAY; 3537} 3538 3539static int cfq_may_queue(struct request_queue *q, int rw) 3540{ 3541 struct cfq_data *cfqd = q->elevator->elevator_data; 3542 struct task_struct *tsk = current; 3543 struct cfq_io_context *cic; 3544 struct cfq_queue *cfqq; 3545 3546 /* 3547 * don't force setup of a queue from here, as a call to may_queue 3548 * does not necessarily imply that a request actually will be queued. 3549 * so just lookup a possibly existing queue, or return 'may queue' 3550 * if that fails 3551 */ 3552 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3553 if (!cic) 3554 return ELV_MQUEUE_MAY; 3555 3556 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3557 if (cfqq) { 3558 cfq_init_prio_data(cfqq, cic->ioc); 3559 cfq_prio_boost(cfqq); 3560 3561 return __cfq_may_queue(cfqq); 3562 } 3563 3564 return ELV_MQUEUE_MAY; 3565} 3566 3567/* 3568 * queue lock held here 3569 */ 3570static void cfq_put_request(struct request *rq) 3571{ 3572 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3573 3574 if (cfqq) { 3575 const int rw = rq_data_dir(rq); 3576 3577 BUG_ON(!cfqq->allocated[rw]); 3578 cfqq->allocated[rw]--; 3579 3580 put_io_context(RQ_CIC(rq)->ioc); 3581 3582 rq->elevator_private = NULL; 3583 rq->elevator_private2 = NULL; 3584 3585 /* Put down rq reference on cfqg */ 3586 cfq_put_cfqg(RQ_CFQG(rq)); 3587 rq->elevator_private3 = NULL; 3588 3589 cfq_put_queue(cfqq); 3590 } 3591} 3592 3593static struct cfq_queue * 3594cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3595 struct cfq_queue *cfqq) 3596{ 3597 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3598 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3599 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3600 cfq_put_queue(cfqq); 3601 return cic_to_cfqq(cic, 1); 3602} 3603 3604/* 3605 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3606 * was the last process referring to said cfqq. 3607 */ 3608static struct cfq_queue * 3609split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3610{ 3611 if (cfqq_process_refs(cfqq) == 1) { 3612 cfqq->pid = current->pid; 3613 cfq_clear_cfqq_coop(cfqq); 3614 cfq_clear_cfqq_split_coop(cfqq); 3615 return cfqq; 3616 } 3617 3618 cic_set_cfqq(cic, NULL, 1); 3619 3620 cfq_put_cooperator(cfqq); 3621 3622 cfq_put_queue(cfqq); 3623 return NULL; 3624} 3625/* 3626 * Allocate cfq data structures associated with this request. 
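 *
 * This is the elevator_set_req_fn hook: it looks up (or creates) the cic
 * and cfqq, splits or merges cooperating queues as needed, and stashes the
 * cic, the cfqq and a group reference in the request's elevator_private
 * fields.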
3627 */ 3628static int 3629cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3630{ 3631 struct cfq_data *cfqd = q->elevator->elevator_data; 3632 struct cfq_io_context *cic; 3633 const int rw = rq_data_dir(rq); 3634 const bool is_sync = rq_is_sync(rq); 3635 struct cfq_queue *cfqq; 3636 unsigned long flags; 3637 3638 might_sleep_if(gfp_mask & __GFP_WAIT); 3639 3640 cic = cfq_get_io_context(cfqd, gfp_mask); 3641 3642 spin_lock_irqsave(q->queue_lock, flags); 3643 3644 if (!cic) 3645 goto queue_fail; 3646 3647new_queue: 3648 cfqq = cic_to_cfqq(cic, is_sync); 3649 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3650 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3651 cic_set_cfqq(cic, cfqq, is_sync); 3652 } else { 3653 /* 3654 * If the queue was seeky for too long, break it apart. 3655 */ 3656 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 3657 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3658 cfqq = split_cfqq(cic, cfqq); 3659 if (!cfqq) 3660 goto new_queue; 3661 } 3662 3663 /* 3664 * Check to see if this queue is scheduled to merge with 3665 * another, closely cooperating queue. The merging of 3666 * queues happens here as it must be done in process context. 3667 * The reference on new_cfqq was taken in merge_cfqqs. 3668 */ 3669 if (cfqq->new_cfqq) 3670 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3671 } 3672 3673 cfqq->allocated[rw]++; 3674 atomic_inc(&cfqq->ref); 3675 3676 spin_unlock_irqrestore(q->queue_lock, flags); 3677 3678 rq->elevator_private = cic; 3679 rq->elevator_private2 = cfqq; 3680 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); 3681 return 0; 3682 3683queue_fail: 3684 if (cic) 3685 put_io_context(cic->ioc); 3686 3687 cfq_schedule_dispatch(cfqd); 3688 spin_unlock_irqrestore(q->queue_lock, flags); 3689 cfq_log(cfqd, "set_request fail"); 3690 return 1; 3691} 3692 3693static void cfq_kick_queue(struct work_struct *work) 3694{ 3695 struct cfq_data *cfqd = 3696 container_of(work, struct cfq_data, unplug_work); 3697 struct request_queue *q = cfqd->queue; 3698 3699 spin_lock_irq(q->queue_lock); 3700 __blk_run_queue(cfqd->queue); 3701 spin_unlock_irq(q->queue_lock); 3702} 3703 3704/* 3705 * Timer running if the active_queue is currently idling inside its time slice 3706 */ 3707static void cfq_idle_slice_timer(unsigned long data) 3708{ 3709 struct cfq_data *cfqd = (struct cfq_data *) data; 3710 struct cfq_queue *cfqq; 3711 unsigned long flags; 3712 int timed_out = 1; 3713 3714 cfq_log(cfqd, "idle timer fired"); 3715 3716 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3717 3718 cfqq = cfqd->active_queue; 3719 if (cfqq) { 3720 timed_out = 0; 3721 3722 /* 3723 * We saw a request before the queue expired, let it through 3724 */ 3725 if (cfq_cfqq_must_dispatch(cfqq)) 3726 goto out_kick; 3727 3728 /* 3729 * expired 3730 */ 3731 if (cfq_slice_used(cfqq)) 3732 goto expire; 3733 3734 /* 3735 * only expire and reinvoke request handler, if there are 3736 * other queues with pending requests 3737 */ 3738 if (!cfqd->busy_queues) 3739 goto out_cont; 3740 3741 /* 3742 * not expired and it has a request pending, let it dispatch 3743 */ 3744 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3745 goto out_kick; 3746 3747 /* 3748 * Queue depth flag is reset only when the idle didn't succeed 3749 */ 3750 cfq_clear_cfqq_deep(cfqq); 3751 } 3752expire: 3753 cfq_slice_expired(cfqd, timed_out); 3754out_kick: 3755 cfq_schedule_dispatch(cfqd); 3756out_cont: 3757 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3758} 3759 3760static void cfq_shutdown_timer_wq(struct 
cfq_data *cfqd) 3761{ 3762 del_timer_sync(&cfqd->idle_slice_timer); 3763 cancel_work_sync(&cfqd->unplug_work); 3764} 3765 3766static void cfq_put_async_queues(struct cfq_data *cfqd) 3767{ 3768 int i; 3769 3770 for (i = 0; i < IOPRIO_BE_NR; i++) { 3771 if (cfqd->async_cfqq[0][i]) 3772 cfq_put_queue(cfqd->async_cfqq[0][i]); 3773 if (cfqd->async_cfqq[1][i]) 3774 cfq_put_queue(cfqd->async_cfqq[1][i]); 3775 } 3776 3777 if (cfqd->async_idle_cfqq) 3778 cfq_put_queue(cfqd->async_idle_cfqq); 3779} 3780 3781static void cfq_cfqd_free(struct rcu_head *head) 3782{ 3783 kfree(container_of(head, struct cfq_data, rcu)); 3784} 3785 3786static void cfq_exit_queue(struct elevator_queue *e) 3787{ 3788 struct cfq_data *cfqd = e->elevator_data; 3789 struct request_queue *q = cfqd->queue; 3790 3791 cfq_shutdown_timer_wq(cfqd); 3792 3793 spin_lock_irq(q->queue_lock); 3794 3795 if (cfqd->active_queue) 3796 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3797 3798 while (!list_empty(&cfqd->cic_list)) { 3799 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3800 struct cfq_io_context, 3801 queue_list); 3802 3803 __cfq_exit_single_io_context(cfqd, cic); 3804 } 3805 3806 cfq_put_async_queues(cfqd); 3807 cfq_release_cfq_groups(cfqd); 3808 cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3809 3810 spin_unlock_irq(q->queue_lock); 3811 3812 cfq_shutdown_timer_wq(cfqd); 3813 3814 spin_lock(&cic_index_lock); 3815 ida_remove(&cic_index_ida, cfqd->cic_index); 3816 spin_unlock(&cic_index_lock); 3817 3818 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3819 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3820} 3821 3822static int cfq_alloc_cic_index(void) 3823{ 3824 int index, error; 3825 3826 do { 3827 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3828 return -ENOMEM; 3829 3830 spin_lock(&cic_index_lock); 3831 error = ida_get_new(&cic_index_ida, &index); 3832 spin_unlock(&cic_index_lock); 3833 if (error && error != -EAGAIN) 3834 return error; 3835 } while (error); 3836 3837 return index; 3838} 3839 3840static void *cfq_init_queue(struct request_queue *q) 3841{ 3842 struct cfq_data *cfqd; 3843 int i, j; 3844 struct cfq_group *cfqg; 3845 struct cfq_rb_root *st; 3846 3847 i = cfq_alloc_cic_index(); 3848 if (i < 0) 3849 return NULL; 3850 3851 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3852 if (!cfqd) 3853 return NULL; 3854 3855 cfqd->cic_index = i; 3856 3857 /* Init root service tree */ 3858 cfqd->grp_service_tree = CFQ_RB_ROOT; 3859 3860 /* Init root group */ 3861 cfqg = &cfqd->root_group; 3862 for_each_cfqg_st(cfqg, i, j, st) 3863 *st = CFQ_RB_ROOT; 3864 RB_CLEAR_NODE(&cfqg->rb_node); 3865 3866 /* Give preference to root group over other groups */ 3867 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3868 3869#ifdef CONFIG_CFQ_GROUP_IOSCHED 3870 /* 3871 * Take a reference to root group which we never drop. This is just 3872 * to make sure that cfq_put_cfqg() does not try to kfree root group 3873 */ 3874 atomic_set(&cfqg->ref, 1); 3875 rcu_read_lock(); 3876 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 3877 (void *)cfqd, 0); 3878 rcu_read_unlock(); 3879#endif 3880 /* 3881 * Not strictly needed (since RB_ROOT just clears the node and we 3882 * zeroed cfqd on alloc), but better be safe in case someone decides 3883 * to add magic to the rb code 3884 */ 3885 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3886 cfqd->prio_trees[i] = RB_ROOT; 3887 3888 /* 3889 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 
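 * (it is embedded in struct cfq_data, so it must never be handed back to
 * cfq_pool).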
3890 * Grab a permanent reference to it, so that the normal code flow 3891 * will not attempt to free it. 3892 */ 3893 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3894 atomic_inc(&cfqd->oom_cfqq.ref); 3895 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3896 3897 INIT_LIST_HEAD(&cfqd->cic_list); 3898 3899 cfqd->queue = q; 3900 3901 init_timer(&cfqd->idle_slice_timer); 3902 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3903 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3904 3905 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3906 3907 cfqd->cfq_quantum = cfq_quantum; 3908 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3909 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3910 cfqd->cfq_back_max = cfq_back_max; 3911 cfqd->cfq_back_penalty = cfq_back_penalty; 3912 cfqd->cfq_slice[0] = cfq_slice_async; 3913 cfqd->cfq_slice[1] = cfq_slice_sync; 3914 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3915 cfqd->cfq_slice_idle = cfq_slice_idle; 3916 cfqd->cfq_group_idle = cfq_group_idle; 3917 cfqd->cfq_latency = 1; 3918 cfqd->cfq_group_isolation = 0; 3919 cfqd->hw_tag = -1; 3920 /* 3921 * we optimistically start assuming sync ops weren't delayed in last 3922 * second, in order to have larger depth for async operations. 3923 */ 3924 cfqd->last_delayed_sync = jiffies - HZ; 3925 return cfqd; 3926} 3927 3928static void cfq_slab_kill(void) 3929{ 3930 /* 3931 * Caller already ensured that pending RCU callbacks are completed, 3932 * so we should have no busy allocations at this point. 3933 */ 3934 if (cfq_pool) 3935 kmem_cache_destroy(cfq_pool); 3936 if (cfq_ioc_pool) 3937 kmem_cache_destroy(cfq_ioc_pool); 3938} 3939 3940static int __init cfq_slab_setup(void) 3941{ 3942 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3943 if (!cfq_pool) 3944 goto fail; 3945 3946 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3947 if (!cfq_ioc_pool) 3948 goto fail; 3949 3950 return 0; 3951fail: 3952 cfq_slab_kill(); 3953 return -ENOMEM; 3954} 3955 3956/* 3957 * sysfs parts below --> 3958 */ 3959static ssize_t 3960cfq_var_show(unsigned int var, char *page) 3961{ 3962 return sprintf(page, "%d\n", var); 3963} 3964 3965static ssize_t 3966cfq_var_store(unsigned int *var, const char *page, size_t count) 3967{ 3968 char *p = (char *) page; 3969 3970 *var = simple_strtoul(p, &p, 10); 3971 return count; 3972} 3973 3974#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 3975static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 3976{ \ 3977 struct cfq_data *cfqd = e->elevator_data; \ 3978 unsigned int __data = __VAR; \ 3979 if (__CONV) \ 3980 __data = jiffies_to_msecs(__data); \ 3981 return cfq_var_show(__data, (page)); \ 3982} 3983SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 3984SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 3985SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 3986SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 3987SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 3988SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 3989SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); 3990SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 3991SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3992SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3993SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3994SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 3995#undef SHOW_FUNCTION 3996 3997#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 
3998static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 3999{ \ 4000 struct cfq_data *cfqd = e->elevator_data; \ 4001 unsigned int __data; \ 4002 int ret = cfq_var_store(&__data, (page), count); \ 4003 if (__data < (MIN)) \ 4004 __data = (MIN); \ 4005 else if (__data > (MAX)) \ 4006 __data = (MAX); \ 4007 if (__CONV) \ 4008 *(__PTR) = msecs_to_jiffies(__data); \ 4009 else \ 4010 *(__PTR) = __data; \ 4011 return ret; \ 4012} 4013STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 4014STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 4015 UINT_MAX, 1); 4016STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 4017 UINT_MAX, 1); 4018STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 4019STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 4020 UINT_MAX, 0); 4021STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 4022STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); 4023STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 4024STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 4025STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4026 UINT_MAX, 0); 4027STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4028STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); 4029#undef STORE_FUNCTION 4030 4031#define CFQ_ATTR(name) \ 4032 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 4033 4034static struct elv_fs_entry cfq_attrs[] = { 4035 CFQ_ATTR(quantum), 4036 CFQ_ATTR(fifo_expire_sync), 4037 CFQ_ATTR(fifo_expire_async), 4038 CFQ_ATTR(back_seek_max), 4039 CFQ_ATTR(back_seek_penalty), 4040 CFQ_ATTR(slice_sync), 4041 CFQ_ATTR(slice_async), 4042 CFQ_ATTR(slice_async_rq), 4043 CFQ_ATTR(slice_idle), 4044 CFQ_ATTR(group_idle), 4045 CFQ_ATTR(low_latency), 4046 CFQ_ATTR(group_isolation), 4047 __ATTR_NULL 4048}; 4049 4050static struct elevator_type iosched_cfq = { 4051 .ops = { 4052 .elevator_merge_fn = cfq_merge, 4053 .elevator_merged_fn = cfq_merged_request, 4054 .elevator_merge_req_fn = cfq_merged_requests, 4055 .elevator_allow_merge_fn = cfq_allow_merge, 4056 .elevator_bio_merged_fn = cfq_bio_merged, 4057 .elevator_dispatch_fn = cfq_dispatch_requests, 4058 .elevator_add_req_fn = cfq_insert_request, 4059 .elevator_activate_req_fn = cfq_activate_request, 4060 .elevator_deactivate_req_fn = cfq_deactivate_request, 4061 .elevator_queue_empty_fn = cfq_queue_empty, 4062 .elevator_completed_req_fn = cfq_completed_request, 4063 .elevator_former_req_fn = elv_rb_former_request, 4064 .elevator_latter_req_fn = elv_rb_latter_request, 4065 .elevator_set_req_fn = cfq_set_request, 4066 .elevator_put_req_fn = cfq_put_request, 4067 .elevator_may_queue_fn = cfq_may_queue, 4068 .elevator_init_fn = cfq_init_queue, 4069 .elevator_exit_fn = cfq_exit_queue, 4070 .trim = cfq_free_io_context, 4071 }, 4072 .elevator_attrs = cfq_attrs, 4073 .elevator_name = "cfq", 4074 .elevator_owner = THIS_MODULE, 4075}; 4076 4077#ifdef CONFIG_CFQ_GROUP_IOSCHED 4078static struct blkio_policy_type blkio_policy_cfq = { 4079 .ops = { 4080 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4081 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4082 }, 4083}; 4084#else 4085static struct blkio_policy_type blkio_policy_cfq; 4086#endif 4087 4088static int __init cfq_init(void) 4089{ 4090 /* 4091 * could be 0 on HZ < 1000 setups 4092 
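 * (for example at HZ=100 the HZ/125 defaults for cfq_slice_idle and
 * cfq_group_idle truncate to 0, so they are bumped to one jiffy below).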
*/ 4093 if (!cfq_slice_async) 4094 cfq_slice_async = 1; 4095 if (!cfq_slice_idle) 4096 cfq_slice_idle = 1; 4097 4098#ifdef CONFIG_CFQ_GROUP_IOSCHED 4099 if (!cfq_group_idle) 4100 cfq_group_idle = 1; 4101#else 4102 cfq_group_idle = 0; 4103#endif 4104 if (cfq_slab_setup()) 4105 return -ENOMEM; 4106 4107 elv_register(&iosched_cfq); 4108 blkio_policy_register(&blkio_policy_cfq); 4109 4110 return 0; 4111} 4112 4113static void __exit cfq_exit(void) 4114{ 4115 DECLARE_COMPLETION_ONSTACK(all_gone); 4116 blkio_policy_unregister(&blkio_policy_cfq); 4117 elv_unregister(&iosched_cfq); 4118 ioc_gone = &all_gone; 4119 /* ioc_gone's update must be visible before reading ioc_count */ 4120 smp_wmb(); 4121 4122 /* 4123 * this also protects us from entering cfq_slab_kill() with 4124 * pending RCU callbacks 4125 */ 4126 if (elv_ioc_count_read(cfq_ioc_count)) 4127 wait_for_completion(&all_gone); 4128 ida_destroy(&cic_index_ida); 4129 cfq_slab_kill(); 4130} 4131 4132module_init(cfq_init); 4133module_exit(cfq_exit); 4134 4135MODULE_AUTHOR("Jens Axboe"); 4136MODULE_LICENSE("GPL"); 4137MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 4138