cfq-iosched.c revision 8e1ac6655104bc6e1e79d67e2df88cc8fa9b6e07
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/slab.h> 11#include <linux/blkdev.h> 12#include <linux/elevator.h> 13#include <linux/jiffies.h> 14#include <linux/rbtree.h> 15#include <linux/ioprio.h> 16#include <linux/blktrace_api.h> 17#include "cfq.h" 18 19/* 20 * tunables 21 */ 22/* max queue in one round of service */ 23static const int cfq_quantum = 8; 24static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 25/* maximum backwards seek, in KiB */ 26static const int cfq_back_max = 16 * 1024; 27/* penalty of a backwards seek */ 28static const int cfq_back_penalty = 2; 29static const int cfq_slice_sync = HZ / 10; 30static int cfq_slice_async = HZ / 25; 31static const int cfq_slice_async_rq = 2; 32static int cfq_slice_idle = HZ / 125; 33static int cfq_group_idle = HZ / 125; 34static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 35static const int cfq_hist_divisor = 4; 36 37/* 38 * offset from end of service tree 39 */ 40#define CFQ_IDLE_DELAY (HZ / 5) 41 42/* 43 * below this threshold, we consider thinktime immediate 44 */ 45#define CFQ_MIN_TT (2) 46 47#define CFQ_SLICE_SCALE (5) 48#define CFQ_HW_QUEUE_MIN (5) 49#define CFQ_SERVICE_SHIFT 12 50 51#define CFQQ_SEEK_THR (sector_t)(8 * 100) 52#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) 53#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 55 56#define RQ_CIC(rq) \ 57 ((struct cfq_io_context *) (rq)->elevator_private) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 60 61static struct kmem_cache *cfq_pool; 62static struct kmem_cache *cfq_ioc_pool; 63 64static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 65static struct completion *ioc_gone; 66static DEFINE_SPINLOCK(ioc_gone_lock); 67 68static DEFINE_SPINLOCK(cic_index_lock); 69static DEFINE_IDA(cic_index_ida); 70 71#define CFQ_PRIO_LISTS IOPRIO_BE_NR 72#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 73#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 74 75#define sample_valid(samples) ((samples) > 80) 76#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 77 78/* 79 * Most of our rbtree usage is for sorting with min extraction, so 80 * if we cache the leftmost node we don't have to walk down the tree 81 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 82 * move this into the elevator for the rq sorting as well. 83 */ 84struct cfq_rb_root { 85 struct rb_root rb; 86 struct rb_node *left; 87 unsigned count; 88 unsigned total_weight; 89 u64 min_vdisktime; 90 struct rb_node *active; 91}; 92#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 93 .count = 0, .min_vdisktime = 0, } 94 95/* 96 * Per process-grouping structure 97 */ 98struct cfq_queue { 99 /* reference count */ 100 atomic_t ref; 101 /* various state flags, see below */ 102 unsigned int flags; 103 /* parent cfq_data */ 104 struct cfq_data *cfqd; 105 /* service_tree member */ 106 struct rb_node rb_node; 107 /* service_tree key */ 108 unsigned long rb_key; 109 /* prio tree member */ 110 struct rb_node p_node; 111 /* prio tree root we belong to, if any */ 112 struct rb_root *p_root; 113 /* sorted list of pending requests */ 114 struct rb_root sort_list; 115 /* if fifo isn't expired, next request to serve */ 116 struct request *next_rq; 117 /* requests queued in sort_list */ 118 int queued[2]; 119 /* currently allocated requests */ 120 int allocated[2]; 121 /* fifo list of requests in sort_list */ 122 struct list_head fifo; 123 124 /* time when queue got scheduled in to dispatch first request. */ 125 unsigned long dispatch_start; 126 unsigned int allocated_slice; 127 unsigned int slice_dispatch; 128 /* time when first request from queue completed and slice started. */ 129 unsigned long slice_start; 130 unsigned long slice_end; 131 long slice_resid; 132 133 /* pending metadata requests */ 134 int meta_pending; 135 /* number of requests that are on the dispatch list or inside driver */ 136 int dispatched; 137 138 /* io prio of this group */ 139 unsigned short ioprio, org_ioprio; 140 unsigned short ioprio_class, org_ioprio_class; 141 142 pid_t pid; 143 144 u32 seek_history; 145 sector_t last_request_pos; 146 147 struct cfq_rb_root *service_tree; 148 struct cfq_queue *new_cfqq; 149 struct cfq_group *cfqg; 150 struct cfq_group *orig_cfqg; 151 /* Number of sectors dispatched from queue in single dispatch round */ 152 unsigned long nr_sectors; 153}; 154 155/* 156 * First index in the service_trees. 157 * IDLE is handled separately, so it has negative index 158 */ 159enum wl_prio_t { 160 BE_WORKLOAD = 0, 161 RT_WORKLOAD = 1, 162 IDLE_WORKLOAD = 2, 163}; 164 165/* 166 * Second index in the service_trees. 167 */ 168enum wl_type_t { 169 ASYNC_WORKLOAD = 0, 170 SYNC_NOIDLE_WORKLOAD = 1, 171 SYNC_WORKLOAD = 2 172}; 173 174/* This is per cgroup per device grouping structure */ 175struct cfq_group { 176 /* group service_tree member */ 177 struct rb_node rb_node; 178 179 /* group service_tree key */ 180 u64 vdisktime; 181 unsigned int weight; 182 bool on_st; 183 184 /* number of cfqq currently on this group */ 185 int nr_cfqq; 186 187 /* Per group busy queus average. Useful for workload slice calc. */ 188 unsigned int busy_queues_avg[2]; 189 /* 190 * rr lists of queues with requests, onle rr for each priority class. 191 * Counts are embedded in the cfq_rb_root 192 */ 193 struct cfq_rb_root service_trees[2][3]; 194 struct cfq_rb_root service_tree_idle; 195 196 unsigned long saved_workload_slice; 197 enum wl_type_t saved_workload; 198 enum wl_prio_t saved_serving_prio; 199 struct blkio_group blkg; 200#ifdef CONFIG_CFQ_GROUP_IOSCHED 201 struct hlist_node cfqd_node; 202 atomic_t ref; 203#endif 204 /* number of requests that are on the dispatch list or inside driver */ 205 int dispatched; 206}; 207 208/* 209 * Per block device queue structure 210 */ 211struct cfq_data { 212 struct request_queue *queue; 213 /* Root service tree for cfq_groups */ 214 struct cfq_rb_root grp_service_tree; 215 struct cfq_group root_group; 216 217 /* 218 * The priority currently being served 219 */ 220 enum wl_prio_t serving_prio; 221 enum wl_type_t serving_type; 222 unsigned long workload_expires; 223 struct cfq_group *serving_group; 224 bool noidle_tree_requires_idle; 225 226 /* 227 * Each priority tree is sorted by next_request position. These 228 * trees are used when determining if two or more queues are 229 * interleaving requests (see cfq_close_cooperator). 230 */ 231 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 232 233 unsigned int busy_queues; 234 235 int rq_in_driver; 236 int rq_in_flight[2]; 237 238 /* 239 * queue-depth detection 240 */ 241 int rq_queued; 242 int hw_tag; 243 /* 244 * hw_tag can be 245 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 246 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 247 * 0 => no NCQ 248 */ 249 int hw_tag_est_depth; 250 unsigned int hw_tag_samples; 251 252 /* 253 * idle window management 254 */ 255 struct timer_list idle_slice_timer; 256 struct work_struct unplug_work; 257 258 struct cfq_queue *active_queue; 259 struct cfq_io_context *active_cic; 260 261 /* 262 * async queue for each priority case 263 */ 264 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 265 struct cfq_queue *async_idle_cfqq; 266 267 sector_t last_position; 268 269 /* 270 * tunables, see top of file 271 */ 272 unsigned int cfq_quantum; 273 unsigned int cfq_fifo_expire[2]; 274 unsigned int cfq_back_penalty; 275 unsigned int cfq_back_max; 276 unsigned int cfq_slice[2]; 277 unsigned int cfq_slice_async_rq; 278 unsigned int cfq_slice_idle; 279 unsigned int cfq_group_idle; 280 unsigned int cfq_latency; 281 unsigned int cfq_group_isolation; 282 283 unsigned int cic_index; 284 struct list_head cic_list; 285 286 /* 287 * Fallback dummy cfqq for extreme OOM conditions 288 */ 289 struct cfq_queue oom_cfqq; 290 291 unsigned long last_delayed_sync; 292 293 /* List of cfq groups being managed on this device*/ 294 struct hlist_head cfqg_list; 295 struct rcu_head rcu; 296}; 297 298static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 299 300static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 301 enum wl_prio_t prio, 302 enum wl_type_t type) 303{ 304 if (!cfqg) 305 return NULL; 306 307 if (prio == IDLE_WORKLOAD) 308 return &cfqg->service_tree_idle; 309 310 return &cfqg->service_trees[prio][type]; 311} 312 313enum cfqq_state_flags { 314 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 315 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 316 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 317 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 318 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 319 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 320 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 321 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 322 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 323 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 324 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 325 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 326 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 327}; 328 329#define CFQ_CFQQ_FNS(name) \ 330static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 331{ \ 332 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 333} \ 334static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 335{ \ 336 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 337} \ 338static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 339{ \ 340 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 341} 342 343CFQ_CFQQ_FNS(on_rr); 344CFQ_CFQQ_FNS(wait_request); 345CFQ_CFQQ_FNS(must_dispatch); 346CFQ_CFQQ_FNS(must_alloc_slice); 347CFQ_CFQQ_FNS(fifo_expire); 348CFQ_CFQQ_FNS(idle_window); 349CFQ_CFQQ_FNS(prio_changed); 350CFQ_CFQQ_FNS(slice_new); 351CFQ_CFQQ_FNS(sync); 352CFQ_CFQQ_FNS(coop); 353CFQ_CFQQ_FNS(split_coop); 354CFQ_CFQQ_FNS(deep); 355CFQ_CFQQ_FNS(wait_busy); 356#undef CFQ_CFQQ_FNS 357 358#ifdef CONFIG_CFQ_GROUP_IOSCHED 359#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 360 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 361 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 362 blkg_path(&(cfqq)->cfqg->blkg), ##args); 363 364#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 365 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 366 blkg_path(&(cfqg)->blkg), ##args); \ 367 368#else 369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 370 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 371#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 372#endif 373#define cfq_log(cfqd, fmt, args...) \ 374 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 375 376/* Traverses through cfq group service trees */ 377#define for_each_cfqg_st(cfqg, i, j, st) \ 378 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 379 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 380 : &cfqg->service_tree_idle; \ 381 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 382 (i == IDLE_WORKLOAD && j == 0); \ 383 j++, st = i < IDLE_WORKLOAD ? \ 384 &cfqg->service_trees[i][j]: NULL) \ 385 386 387static inline bool iops_mode(struct cfq_data *cfqd) 388{ 389 /* 390 * If we are not idling on queues and it is a NCQ drive, parallel 391 * execution of requests is on and measuring time is not possible 392 * in most of the cases until and unless we drive shallower queue 393 * depths and that becomes a performance bottleneck. In such cases 394 * switch to start providing fairness in terms of number of IOs. 395 */ 396 if (!cfqd->cfq_slice_idle && cfqd->hw_tag) 397 return true; 398 else 399 return false; 400} 401 402static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 403{ 404 if (cfq_class_idle(cfqq)) 405 return IDLE_WORKLOAD; 406 if (cfq_class_rt(cfqq)) 407 return RT_WORKLOAD; 408 return BE_WORKLOAD; 409} 410 411 412static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 413{ 414 if (!cfq_cfqq_sync(cfqq)) 415 return ASYNC_WORKLOAD; 416 if (!cfq_cfqq_idle_window(cfqq)) 417 return SYNC_NOIDLE_WORKLOAD; 418 return SYNC_WORKLOAD; 419} 420 421static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 422 struct cfq_data *cfqd, 423 struct cfq_group *cfqg) 424{ 425 if (wl == IDLE_WORKLOAD) 426 return cfqg->service_tree_idle.count; 427 428 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 429 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 430 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 431} 432 433static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 434 struct cfq_group *cfqg) 435{ 436 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 437 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 438} 439 440static void cfq_dispatch_insert(struct request_queue *, struct request *); 441static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 442 struct io_context *, gfp_t); 443static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 444 struct io_context *); 445 446static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 447 bool is_sync) 448{ 449 return cic->cfqq[is_sync]; 450} 451 452static inline void cic_set_cfqq(struct cfq_io_context *cic, 453 struct cfq_queue *cfqq, bool is_sync) 454{ 455 cic->cfqq[is_sync] = cfqq; 456} 457 458#define CIC_DEAD_KEY 1ul 459#define CIC_DEAD_INDEX_SHIFT 1 460 461static inline void *cfqd_dead_key(struct cfq_data *cfqd) 462{ 463 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 464} 465 466static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 467{ 468 struct cfq_data *cfqd = cic->key; 469 470 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) 471 return NULL; 472 473 return cfqd; 474} 475 476/* 477 * We regard a request as SYNC, if it's either a read or has the SYNC bit 478 * set (in which case it could also be direct WRITE). 479 */ 480static inline bool cfq_bio_sync(struct bio *bio) 481{ 482 return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC); 483} 484 485/* 486 * scheduler run of queue, if there are requests pending and no one in the 487 * driver that will restart queueing 488 */ 489static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 490{ 491 if (cfqd->busy_queues) { 492 cfq_log(cfqd, "schedule dispatch"); 493 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 494 } 495} 496 497static int cfq_queue_empty(struct request_queue *q) 498{ 499 struct cfq_data *cfqd = q->elevator->elevator_data; 500 501 return !cfqd->rq_queued; 502} 503 504/* 505 * Scale schedule slice based on io priority. Use the sync time slice only 506 * if a queue is marked sync and has sync io queued. A sync queue with async 507 * io only, should not get full sync slice length. 508 */ 509static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 510 unsigned short prio) 511{ 512 const int base_slice = cfqd->cfq_slice[sync]; 513 514 WARN_ON(prio >= IOPRIO_BE_NR); 515 516 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 517} 518 519static inline int 520cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 521{ 522 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 523} 524 525static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 526{ 527 u64 d = delta << CFQ_SERVICE_SHIFT; 528 529 d = d * BLKIO_WEIGHT_DEFAULT; 530 do_div(d, cfqg->weight); 531 return d; 532} 533 534static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 535{ 536 s64 delta = (s64)(vdisktime - min_vdisktime); 537 if (delta > 0) 538 min_vdisktime = vdisktime; 539 540 return min_vdisktime; 541} 542 543static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 544{ 545 s64 delta = (s64)(vdisktime - min_vdisktime); 546 if (delta < 0) 547 min_vdisktime = vdisktime; 548 549 return min_vdisktime; 550} 551 552static void update_min_vdisktime(struct cfq_rb_root *st) 553{ 554 u64 vdisktime = st->min_vdisktime; 555 struct cfq_group *cfqg; 556 557 if (st->active) { 558 cfqg = rb_entry_cfqg(st->active); 559 vdisktime = cfqg->vdisktime; 560 } 561 562 if (st->left) { 563 cfqg = rb_entry_cfqg(st->left); 564 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 565 } 566 567 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 568} 569 570/* 571 * get averaged number of queues of RT/BE priority. 572 * average is updated, with a formula that gives more weight to higher numbers, 573 * to quickly follows sudden increases and decrease slowly 574 */ 575 576static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 577 struct cfq_group *cfqg, bool rt) 578{ 579 unsigned min_q, max_q; 580 unsigned mult = cfq_hist_divisor - 1; 581 unsigned round = cfq_hist_divisor / 2; 582 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 583 584 min_q = min(cfqg->busy_queues_avg[rt], busy); 585 max_q = max(cfqg->busy_queues_avg[rt], busy); 586 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 587 cfq_hist_divisor; 588 return cfqg->busy_queues_avg[rt]; 589} 590 591static inline unsigned 592cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 593{ 594 struct cfq_rb_root *st = &cfqd->grp_service_tree; 595 596 return cfq_target_latency * cfqg->weight / st->total_weight; 597} 598 599static inline void 600cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 601{ 602 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 603 if (cfqd->cfq_latency) { 604 /* 605 * interested queues (we consider only the ones with the same 606 * priority class in the cfq group) 607 */ 608 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 609 cfq_class_rt(cfqq)); 610 unsigned sync_slice = cfqd->cfq_slice[1]; 611 unsigned expect_latency = sync_slice * iq; 612 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 613 614 if (expect_latency > group_slice) { 615 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 616 /* scale low_slice according to IO priority 617 * and sync vs async */ 618 unsigned low_slice = 619 min(slice, base_low_slice * slice / sync_slice); 620 /* the adapted slice value is scaled to fit all iqs 621 * into the target latency */ 622 slice = max(slice * group_slice / expect_latency, 623 low_slice); 624 } 625 } 626 cfqq->slice_start = jiffies; 627 cfqq->slice_end = jiffies + slice; 628 cfqq->allocated_slice = slice; 629 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 630} 631 632/* 633 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 634 * isn't valid until the first request from the dispatch is activated 635 * and the slice time set. 636 */ 637static inline bool cfq_slice_used(struct cfq_queue *cfqq) 638{ 639 if (cfq_cfqq_slice_new(cfqq)) 640 return false; 641 if (time_before(jiffies, cfqq->slice_end)) 642 return false; 643 644 return true; 645} 646 647/* 648 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 649 * We choose the request that is closest to the head right now. Distance 650 * behind the head is penalized and only allowed to a certain extent. 651 */ 652static struct request * 653cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 654{ 655 sector_t s1, s2, d1 = 0, d2 = 0; 656 unsigned long back_max; 657#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 658#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 659 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 660 661 if (rq1 == NULL || rq1 == rq2) 662 return rq2; 663 if (rq2 == NULL) 664 return rq1; 665 666 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 667 return rq1; 668 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 669 return rq2; 670 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) 671 return rq1; 672 else if ((rq2->cmd_flags & REQ_META) && 673 !(rq1->cmd_flags & REQ_META)) 674 return rq2; 675 676 s1 = blk_rq_pos(rq1); 677 s2 = blk_rq_pos(rq2); 678 679 /* 680 * by definition, 1KiB is 2 sectors 681 */ 682 back_max = cfqd->cfq_back_max * 2; 683 684 /* 685 * Strict one way elevator _except_ in the case where we allow 686 * short backward seeks which are biased as twice the cost of a 687 * similar forward seek. 688 */ 689 if (s1 >= last) 690 d1 = s1 - last; 691 else if (s1 + back_max >= last) 692 d1 = (last - s1) * cfqd->cfq_back_penalty; 693 else 694 wrap |= CFQ_RQ1_WRAP; 695 696 if (s2 >= last) 697 d2 = s2 - last; 698 else if (s2 + back_max >= last) 699 d2 = (last - s2) * cfqd->cfq_back_penalty; 700 else 701 wrap |= CFQ_RQ2_WRAP; 702 703 /* Found required data */ 704 705 /* 706 * By doing switch() on the bit mask "wrap" we avoid having to 707 * check two variables for all permutations: --> faster! 708 */ 709 switch (wrap) { 710 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 711 if (d1 < d2) 712 return rq1; 713 else if (d2 < d1) 714 return rq2; 715 else { 716 if (s1 >= s2) 717 return rq1; 718 else 719 return rq2; 720 } 721 722 case CFQ_RQ2_WRAP: 723 return rq1; 724 case CFQ_RQ1_WRAP: 725 return rq2; 726 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 727 default: 728 /* 729 * Since both rqs are wrapped, 730 * start with the one that's further behind head 731 * (--> only *one* back seek required), 732 * since back seek takes more time than forward. 733 */ 734 if (s1 <= s2) 735 return rq1; 736 else 737 return rq2; 738 } 739} 740 741/* 742 * The below is leftmost cache rbtree addon 743 */ 744static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 745{ 746 /* Service tree is empty */ 747 if (!root->count) 748 return NULL; 749 750 if (!root->left) 751 root->left = rb_first(&root->rb); 752 753 if (root->left) 754 return rb_entry(root->left, struct cfq_queue, rb_node); 755 756 return NULL; 757} 758 759static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 760{ 761 if (!root->left) 762 root->left = rb_first(&root->rb); 763 764 if (root->left) 765 return rb_entry_cfqg(root->left); 766 767 return NULL; 768} 769 770static void rb_erase_init(struct rb_node *n, struct rb_root *root) 771{ 772 rb_erase(n, root); 773 RB_CLEAR_NODE(n); 774} 775 776static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 777{ 778 if (root->left == n) 779 root->left = NULL; 780 rb_erase_init(n, &root->rb); 781 --root->count; 782} 783 784/* 785 * would be nice to take fifo expire time into account as well 786 */ 787static struct request * 788cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 789 struct request *last) 790{ 791 struct rb_node *rbnext = rb_next(&last->rb_node); 792 struct rb_node *rbprev = rb_prev(&last->rb_node); 793 struct request *next = NULL, *prev = NULL; 794 795 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 796 797 if (rbprev) 798 prev = rb_entry_rq(rbprev); 799 800 if (rbnext) 801 next = rb_entry_rq(rbnext); 802 else { 803 rbnext = rb_first(&cfqq->sort_list); 804 if (rbnext && rbnext != &last->rb_node) 805 next = rb_entry_rq(rbnext); 806 } 807 808 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 809} 810 811static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 812 struct cfq_queue *cfqq) 813{ 814 /* 815 * just an approximation, should be ok. 816 */ 817 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 818 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 819} 820 821static inline s64 822cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 823{ 824 return cfqg->vdisktime - st->min_vdisktime; 825} 826 827static void 828__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 829{ 830 struct rb_node **node = &st->rb.rb_node; 831 struct rb_node *parent = NULL; 832 struct cfq_group *__cfqg; 833 s64 key = cfqg_key(st, cfqg); 834 int left = 1; 835 836 while (*node != NULL) { 837 parent = *node; 838 __cfqg = rb_entry_cfqg(parent); 839 840 if (key < cfqg_key(st, __cfqg)) 841 node = &parent->rb_left; 842 else { 843 node = &parent->rb_right; 844 left = 0; 845 } 846 } 847 848 if (left) 849 st->left = &cfqg->rb_node; 850 851 rb_link_node(&cfqg->rb_node, parent, node); 852 rb_insert_color(&cfqg->rb_node, &st->rb); 853} 854 855static void 856cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 857{ 858 struct cfq_rb_root *st = &cfqd->grp_service_tree; 859 struct cfq_group *__cfqg; 860 struct rb_node *n; 861 862 cfqg->nr_cfqq++; 863 if (cfqg->on_st) 864 return; 865 866 /* 867 * Currently put the group at the end. Later implement something 868 * so that groups get lesser vtime based on their weights, so that 869 * if group does not loose all if it was not continously backlogged. 870 */ 871 n = rb_last(&st->rb); 872 if (n) { 873 __cfqg = rb_entry_cfqg(n); 874 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 875 } else 876 cfqg->vdisktime = st->min_vdisktime; 877 878 __cfq_group_service_tree_add(st, cfqg); 879 cfqg->on_st = true; 880 st->total_weight += cfqg->weight; 881} 882 883static void 884cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 885{ 886 struct cfq_rb_root *st = &cfqd->grp_service_tree; 887 888 if (st->active == &cfqg->rb_node) 889 st->active = NULL; 890 891 BUG_ON(cfqg->nr_cfqq < 1); 892 cfqg->nr_cfqq--; 893 894 /* If there are other cfq queues under this group, don't delete it */ 895 if (cfqg->nr_cfqq) 896 return; 897 898 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 899 cfqg->on_st = false; 900 st->total_weight -= cfqg->weight; 901 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 902 cfq_rb_erase(&cfqg->rb_node, st); 903 cfqg->saved_workload_slice = 0; 904 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 905} 906 907static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 908{ 909 unsigned int slice_used; 910 911 /* 912 * Queue got expired before even a single request completed or 913 * got expired immediately after first request completion. 914 */ 915 if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 916 /* 917 * Also charge the seek time incurred to the group, otherwise 918 * if there are mutiple queues in the group, each can dispatch 919 * a single request on seeky media and cause lots of seek time 920 * and group will never know it. 921 */ 922 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 923 1); 924 } else { 925 slice_used = jiffies - cfqq->slice_start; 926 if (slice_used > cfqq->allocated_slice) 927 slice_used = cfqq->allocated_slice; 928 } 929 930 return slice_used; 931} 932 933static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 934 struct cfq_queue *cfqq) 935{ 936 struct cfq_rb_root *st = &cfqd->grp_service_tree; 937 unsigned int used_sl, charge; 938 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 939 - cfqg->service_tree_idle.count; 940 941 BUG_ON(nr_sync < 0); 942 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 943 944 if (iops_mode(cfqd)) 945 charge = cfqq->slice_dispatch; 946 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 947 charge = cfqq->allocated_slice; 948 949 /* Can't update vdisktime while group is on service tree */ 950 cfq_rb_erase(&cfqg->rb_node, st); 951 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 952 __cfq_group_service_tree_add(st, cfqg); 953 954 /* This group is being expired. Save the context */ 955 if (time_after(cfqd->workload_expires, jiffies)) { 956 cfqg->saved_workload_slice = cfqd->workload_expires 957 - jiffies; 958 cfqg->saved_workload = cfqd->serving_type; 959 cfqg->saved_serving_prio = cfqd->serving_prio; 960 } else 961 cfqg->saved_workload_slice = 0; 962 963 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 964 st->min_vdisktime); 965 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 966 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 967 iops_mode(cfqd), cfqq->nr_sectors); 968 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 969 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 970} 971 972#ifdef CONFIG_CFQ_GROUP_IOSCHED 973static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 974{ 975 if (blkg) 976 return container_of(blkg, struct cfq_group, blkg); 977 return NULL; 978} 979 980void 981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 982{ 983 cfqg_of_blkg(blkg)->weight = weight; 984} 985 986static struct cfq_group * 987cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 988{ 989 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 990 struct cfq_group *cfqg = NULL; 991 void *key = cfqd; 992 int i, j; 993 struct cfq_rb_root *st; 994 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 995 unsigned int major, minor; 996 997 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 998 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 999 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1000 cfqg->blkg.dev = MKDEV(major, minor); 1001 goto done; 1002 } 1003 if (cfqg || !create) 1004 goto done; 1005 1006 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 1007 if (!cfqg) 1008 goto done; 1009 1010 for_each_cfqg_st(cfqg, i, j, st) 1011 *st = CFQ_RB_ROOT; 1012 RB_CLEAR_NODE(&cfqg->rb_node); 1013 1014 /* 1015 * Take the initial reference that will be released on destroy 1016 * This can be thought of a joint reference by cgroup and 1017 * elevator which will be dropped by either elevator exit 1018 * or cgroup deletion path depending on who is exiting first. 1019 */ 1020 atomic_set(&cfqg->ref, 1); 1021 1022 /* 1023 * Add group onto cgroup list. It might happen that bdi->dev is 1024 * not initiliazed yet. Initialize this new group without major 1025 * and minor info and this info will be filled in once a new thread 1026 * comes for IO. See code above. 1027 */ 1028 if (bdi->dev) { 1029 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1030 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1031 MKDEV(major, minor)); 1032 } else 1033 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1034 0); 1035 1036 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); 1037 1038 /* Add group on cfqd list */ 1039 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1040 1041done: 1042 return cfqg; 1043} 1044 1045/* 1046 * Search for the cfq group current task belongs to. If create = 1, then also 1047 * create the cfq group if it does not exist. request_queue lock must be held. 1048 */ 1049static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1050{ 1051 struct cgroup *cgroup; 1052 struct cfq_group *cfqg = NULL; 1053 1054 rcu_read_lock(); 1055 cgroup = task_cgroup(current, blkio_subsys_id); 1056 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 1057 if (!cfqg && create) 1058 cfqg = &cfqd->root_group; 1059 rcu_read_unlock(); 1060 return cfqg; 1061} 1062 1063static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1064{ 1065 atomic_inc(&cfqg->ref); 1066 return cfqg; 1067} 1068 1069static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1070{ 1071 /* Currently, all async queues are mapped to root group */ 1072 if (!cfq_cfqq_sync(cfqq)) 1073 cfqg = &cfqq->cfqd->root_group; 1074 1075 cfqq->cfqg = cfqg; 1076 /* cfqq reference on cfqg */ 1077 atomic_inc(&cfqq->cfqg->ref); 1078} 1079 1080static void cfq_put_cfqg(struct cfq_group *cfqg) 1081{ 1082 struct cfq_rb_root *st; 1083 int i, j; 1084 1085 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1086 if (!atomic_dec_and_test(&cfqg->ref)) 1087 return; 1088 for_each_cfqg_st(cfqg, i, j, st) 1089 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1090 kfree(cfqg); 1091} 1092 1093static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 1094{ 1095 /* Something wrong if we are trying to remove same group twice */ 1096 BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 1097 1098 hlist_del_init(&cfqg->cfqd_node); 1099 1100 /* 1101 * Put the reference taken at the time of creation so that when all 1102 * queues are gone, group can be destroyed. 1103 */ 1104 cfq_put_cfqg(cfqg); 1105} 1106 1107static void cfq_release_cfq_groups(struct cfq_data *cfqd) 1108{ 1109 struct hlist_node *pos, *n; 1110 struct cfq_group *cfqg; 1111 1112 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 1113 /* 1114 * If cgroup removal path got to blk_group first and removed 1115 * it from cgroup list, then it will take care of destroying 1116 * cfqg also. 1117 */ 1118 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) 1119 cfq_destroy_cfqg(cfqd, cfqg); 1120 } 1121} 1122 1123/* 1124 * Blk cgroup controller notification saying that blkio_group object is being 1125 * delinked as associated cgroup object is going away. That also means that 1126 * no new IO will come in this group. So get rid of this group as soon as 1127 * any pending IO in the group is finished. 1128 * 1129 * This function is called under rcu_read_lock(). key is the rcu protected 1130 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu 1131 * read lock. 1132 * 1133 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means 1134 * it should not be NULL as even if elevator was exiting, cgroup deltion 1135 * path got to it first. 1136 */ 1137void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1138{ 1139 unsigned long flags; 1140 struct cfq_data *cfqd = key; 1141 1142 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1143 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1144 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1145} 1146 1147#else /* GROUP_IOSCHED */ 1148static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1149{ 1150 return &cfqd->root_group; 1151} 1152 1153static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1154{ 1155 return cfqg; 1156} 1157 1158static inline void 1159cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1160 cfqq->cfqg = cfqg; 1161} 1162 1163static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} 1164static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} 1165 1166#endif /* GROUP_IOSCHED */ 1167 1168/* 1169 * The cfqd->service_trees holds all pending cfq_queue's that have 1170 * requests waiting to be processed. It is sorted in the order that 1171 * we will service the queues. 1172 */ 1173static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1174 bool add_front) 1175{ 1176 struct rb_node **p, *parent; 1177 struct cfq_queue *__cfqq; 1178 unsigned long rb_key; 1179 struct cfq_rb_root *service_tree; 1180 int left; 1181 int new_cfqq = 1; 1182 int group_changed = 0; 1183 1184#ifdef CONFIG_CFQ_GROUP_IOSCHED 1185 if (!cfqd->cfq_group_isolation 1186 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD 1187 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { 1188 /* Move this cfq to root group */ 1189 cfq_log_cfqq(cfqd, cfqq, "moving to root group"); 1190 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1191 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1192 cfqq->orig_cfqg = cfqq->cfqg; 1193 cfqq->cfqg = &cfqd->root_group; 1194 atomic_inc(&cfqd->root_group.ref); 1195 group_changed = 1; 1196 } else if (!cfqd->cfq_group_isolation 1197 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 1198 /* cfqq is sequential now needs to go to its original group */ 1199 BUG_ON(cfqq->cfqg != &cfqd->root_group); 1200 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1201 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1202 cfq_put_cfqg(cfqq->cfqg); 1203 cfqq->cfqg = cfqq->orig_cfqg; 1204 cfqq->orig_cfqg = NULL; 1205 group_changed = 1; 1206 cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); 1207 } 1208#endif 1209 1210 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1211 cfqq_type(cfqq)); 1212 if (cfq_class_idle(cfqq)) { 1213 rb_key = CFQ_IDLE_DELAY; 1214 parent = rb_last(&service_tree->rb); 1215 if (parent && parent != &cfqq->rb_node) { 1216 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1217 rb_key += __cfqq->rb_key; 1218 } else 1219 rb_key += jiffies; 1220 } else if (!add_front) { 1221 /* 1222 * Get our rb key offset. Subtract any residual slice 1223 * value carried from last service. A negative resid 1224 * count indicates slice overrun, and this should position 1225 * the next service time further away in the tree. 1226 */ 1227 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 1228 rb_key -= cfqq->slice_resid; 1229 cfqq->slice_resid = 0; 1230 } else { 1231 rb_key = -HZ; 1232 __cfqq = cfq_rb_first(service_tree); 1233 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1234 } 1235 1236 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1237 new_cfqq = 0; 1238 /* 1239 * same position, nothing more to do 1240 */ 1241 if (rb_key == cfqq->rb_key && 1242 cfqq->service_tree == service_tree) 1243 return; 1244 1245 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1246 cfqq->service_tree = NULL; 1247 } 1248 1249 left = 1; 1250 parent = NULL; 1251 cfqq->service_tree = service_tree; 1252 p = &service_tree->rb.rb_node; 1253 while (*p) { 1254 struct rb_node **n; 1255 1256 parent = *p; 1257 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1258 1259 /* 1260 * sort by key, that represents service time. 1261 */ 1262 if (time_before(rb_key, __cfqq->rb_key)) 1263 n = &(*p)->rb_left; 1264 else { 1265 n = &(*p)->rb_right; 1266 left = 0; 1267 } 1268 1269 p = n; 1270 } 1271 1272 if (left) 1273 service_tree->left = &cfqq->rb_node; 1274 1275 cfqq->rb_key = rb_key; 1276 rb_link_node(&cfqq->rb_node, parent, p); 1277 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1278 service_tree->count++; 1279 if ((add_front || !new_cfqq) && !group_changed) 1280 return; 1281 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1282} 1283 1284static struct cfq_queue * 1285cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 1286 sector_t sector, struct rb_node **ret_parent, 1287 struct rb_node ***rb_link) 1288{ 1289 struct rb_node **p, *parent; 1290 struct cfq_queue *cfqq = NULL; 1291 1292 parent = NULL; 1293 p = &root->rb_node; 1294 while (*p) { 1295 struct rb_node **n; 1296 1297 parent = *p; 1298 cfqq = rb_entry(parent, struct cfq_queue, p_node); 1299 1300 /* 1301 * Sort strictly based on sector. Smallest to the left, 1302 * largest to the right. 1303 */ 1304 if (sector > blk_rq_pos(cfqq->next_rq)) 1305 n = &(*p)->rb_right; 1306 else if (sector < blk_rq_pos(cfqq->next_rq)) 1307 n = &(*p)->rb_left; 1308 else 1309 break; 1310 p = n; 1311 cfqq = NULL; 1312 } 1313 1314 *ret_parent = parent; 1315 if (rb_link) 1316 *rb_link = p; 1317 return cfqq; 1318} 1319 1320static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1321{ 1322 struct rb_node **p, *parent; 1323 struct cfq_queue *__cfqq; 1324 1325 if (cfqq->p_root) { 1326 rb_erase(&cfqq->p_node, cfqq->p_root); 1327 cfqq->p_root = NULL; 1328 } 1329 1330 if (cfq_class_idle(cfqq)) 1331 return; 1332 if (!cfqq->next_rq) 1333 return; 1334 1335 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 1336 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 1337 blk_rq_pos(cfqq->next_rq), &parent, &p); 1338 if (!__cfqq) { 1339 rb_link_node(&cfqq->p_node, parent, p); 1340 rb_insert_color(&cfqq->p_node, cfqq->p_root); 1341 } else 1342 cfqq->p_root = NULL; 1343} 1344 1345/* 1346 * Update cfqq's position in the service tree. 1347 */ 1348static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1349{ 1350 /* 1351 * Resorting requires the cfqq to be on the RR list already. 1352 */ 1353 if (cfq_cfqq_on_rr(cfqq)) { 1354 cfq_service_tree_add(cfqd, cfqq, 0); 1355 cfq_prio_tree_add(cfqd, cfqq); 1356 } 1357} 1358 1359/* 1360 * add to busy list of queues for service, trying to be fair in ordering 1361 * the pending list according to last request service 1362 */ 1363static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1364{ 1365 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 1366 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1367 cfq_mark_cfqq_on_rr(cfqq); 1368 cfqd->busy_queues++; 1369 1370 cfq_resort_rr_list(cfqd, cfqq); 1371} 1372 1373/* 1374 * Called when the cfqq no longer has requests pending, remove it from 1375 * the service tree. 1376 */ 1377static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1378{ 1379 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 1380 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1381 cfq_clear_cfqq_on_rr(cfqq); 1382 1383 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1384 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1385 cfqq->service_tree = NULL; 1386 } 1387 if (cfqq->p_root) { 1388 rb_erase(&cfqq->p_node, cfqq->p_root); 1389 cfqq->p_root = NULL; 1390 } 1391 1392 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1393 BUG_ON(!cfqd->busy_queues); 1394 cfqd->busy_queues--; 1395} 1396 1397/* 1398 * rb tree support functions 1399 */ 1400static void cfq_del_rq_rb(struct request *rq) 1401{ 1402 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1403 const int sync = rq_is_sync(rq); 1404 1405 BUG_ON(!cfqq->queued[sync]); 1406 cfqq->queued[sync]--; 1407 1408 elv_rb_del(&cfqq->sort_list, rq); 1409 1410 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 1411 /* 1412 * Queue will be deleted from service tree when we actually 1413 * expire it later. Right now just remove it from prio tree 1414 * as it is empty. 1415 */ 1416 if (cfqq->p_root) { 1417 rb_erase(&cfqq->p_node, cfqq->p_root); 1418 cfqq->p_root = NULL; 1419 } 1420 } 1421} 1422 1423static void cfq_add_rq_rb(struct request *rq) 1424{ 1425 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1426 struct cfq_data *cfqd = cfqq->cfqd; 1427 struct request *__alias, *prev; 1428 1429 cfqq->queued[rq_is_sync(rq)]++; 1430 1431 /* 1432 * looks a little odd, but the first insert might return an alias. 1433 * if that happens, put the alias on the dispatch list 1434 */ 1435 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 1436 cfq_dispatch_insert(cfqd->queue, __alias); 1437 1438 if (!cfq_cfqq_on_rr(cfqq)) 1439 cfq_add_cfqq_rr(cfqd, cfqq); 1440 1441 /* 1442 * check if this request is a better next-serve candidate 1443 */ 1444 prev = cfqq->next_rq; 1445 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 1446 1447 /* 1448 * adjust priority tree position, if ->next_rq changes 1449 */ 1450 if (prev != cfqq->next_rq) 1451 cfq_prio_tree_add(cfqd, cfqq); 1452 1453 BUG_ON(!cfqq->next_rq); 1454} 1455 1456static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 1457{ 1458 elv_rb_del(&cfqq->sort_list, rq); 1459 cfqq->queued[rq_is_sync(rq)]--; 1460 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1461 rq_data_dir(rq), rq_is_sync(rq)); 1462 cfq_add_rq_rb(rq); 1463 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 1464 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), 1465 rq_is_sync(rq)); 1466} 1467 1468static struct request * 1469cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1470{ 1471 struct task_struct *tsk = current; 1472 struct cfq_io_context *cic; 1473 struct cfq_queue *cfqq; 1474 1475 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1476 if (!cic) 1477 return NULL; 1478 1479 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1480 if (cfqq) { 1481 sector_t sector = bio->bi_sector + bio_sectors(bio); 1482 1483 return elv_rb_find(&cfqq->sort_list, sector); 1484 } 1485 1486 return NULL; 1487} 1488 1489static void cfq_activate_request(struct request_queue *q, struct request *rq) 1490{ 1491 struct cfq_data *cfqd = q->elevator->elevator_data; 1492 1493 cfqd->rq_in_driver++; 1494 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 1495 cfqd->rq_in_driver); 1496 1497 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 1498} 1499 1500static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 1501{ 1502 struct cfq_data *cfqd = q->elevator->elevator_data; 1503 1504 WARN_ON(!cfqd->rq_in_driver); 1505 cfqd->rq_in_driver--; 1506 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 1507 cfqd->rq_in_driver); 1508} 1509 1510static void cfq_remove_request(struct request *rq) 1511{ 1512 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1513 1514 if (cfqq->next_rq == rq) 1515 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 1516 1517 list_del_init(&rq->queuelist); 1518 cfq_del_rq_rb(rq); 1519 1520 cfqq->cfqd->rq_queued--; 1521 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1522 rq_data_dir(rq), rq_is_sync(rq)); 1523 if (rq->cmd_flags & REQ_META) { 1524 WARN_ON(!cfqq->meta_pending); 1525 cfqq->meta_pending--; 1526 } 1527} 1528 1529static int cfq_merge(struct request_queue *q, struct request **req, 1530 struct bio *bio) 1531{ 1532 struct cfq_data *cfqd = q->elevator->elevator_data; 1533 struct request *__rq; 1534 1535 __rq = cfq_find_rq_fmerge(cfqd, bio); 1536 if (__rq && elv_rq_merge_ok(__rq, bio)) { 1537 *req = __rq; 1538 return ELEVATOR_FRONT_MERGE; 1539 } 1540 1541 return ELEVATOR_NO_MERGE; 1542} 1543 1544static void cfq_merged_request(struct request_queue *q, struct request *req, 1545 int type) 1546{ 1547 if (type == ELEVATOR_FRONT_MERGE) { 1548 struct cfq_queue *cfqq = RQ_CFQQ(req); 1549 1550 cfq_reposition_rq_rb(cfqq, req); 1551 } 1552} 1553 1554static void cfq_bio_merged(struct request_queue *q, struct request *req, 1555 struct bio *bio) 1556{ 1557 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, 1558 bio_data_dir(bio), cfq_bio_sync(bio)); 1559} 1560 1561static void 1562cfq_merged_requests(struct request_queue *q, struct request *rq, 1563 struct request *next) 1564{ 1565 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1566 /* 1567 * reposition in fifo if next is older than rq 1568 */ 1569 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1570 time_before(rq_fifo_time(next), rq_fifo_time(rq))) { 1571 list_move(&rq->queuelist, &next->queuelist); 1572 rq_set_fifo_time(rq, rq_fifo_time(next)); 1573 } 1574 1575 if (cfqq->next_rq == next) 1576 cfqq->next_rq = rq; 1577 cfq_remove_request(next); 1578 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, 1579 rq_data_dir(next), rq_is_sync(next)); 1580} 1581 1582static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1583 struct bio *bio) 1584{ 1585 struct cfq_data *cfqd = q->elevator->elevator_data; 1586 struct cfq_io_context *cic; 1587 struct cfq_queue *cfqq; 1588 1589 /* 1590 * Disallow merge of a sync bio into an async request. 1591 */ 1592 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 1593 return false; 1594 1595 /* 1596 * Lookup the cfqq that this bio will be queued with. Allow 1597 * merge only if rq is queued there. 1598 */ 1599 cic = cfq_cic_lookup(cfqd, current->io_context); 1600 if (!cic) 1601 return false; 1602 1603 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1604 return cfqq == RQ_CFQQ(rq); 1605} 1606 1607static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1608{ 1609 del_timer(&cfqd->idle_slice_timer); 1610 cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); 1611} 1612 1613static void __cfq_set_active_queue(struct cfq_data *cfqd, 1614 struct cfq_queue *cfqq) 1615{ 1616 if (cfqq) { 1617 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1618 cfqd->serving_prio, cfqd->serving_type); 1619 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); 1620 cfqq->slice_start = 0; 1621 cfqq->dispatch_start = jiffies; 1622 cfqq->allocated_slice = 0; 1623 cfqq->slice_end = 0; 1624 cfqq->slice_dispatch = 0; 1625 cfqq->nr_sectors = 0; 1626 1627 cfq_clear_cfqq_wait_request(cfqq); 1628 cfq_clear_cfqq_must_dispatch(cfqq); 1629 cfq_clear_cfqq_must_alloc_slice(cfqq); 1630 cfq_clear_cfqq_fifo_expire(cfqq); 1631 cfq_mark_cfqq_slice_new(cfqq); 1632 1633 cfq_del_timer(cfqd, cfqq); 1634 } 1635 1636 cfqd->active_queue = cfqq; 1637} 1638 1639/* 1640 * current cfqq expired its slice (or was too idle), select new one 1641 */ 1642static void 1643__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1644 bool timed_out) 1645{ 1646 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1647 1648 if (cfq_cfqq_wait_request(cfqq)) 1649 cfq_del_timer(cfqd, cfqq); 1650 1651 cfq_clear_cfqq_wait_request(cfqq); 1652 cfq_clear_cfqq_wait_busy(cfqq); 1653 1654 /* 1655 * If this cfqq is shared between multiple processes, check to 1656 * make sure that those processes are still issuing I/Os within 1657 * the mean seek distance. If not, it may be time to break the 1658 * queues apart again. 1659 */ 1660 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) 1661 cfq_mark_cfqq_split_coop(cfqq); 1662 1663 /* 1664 * store what was left of this slice, if the queue idled/timed out 1665 */ 1666 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1667 cfqq->slice_resid = cfqq->slice_end - jiffies; 1668 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1669 } 1670 1671 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 1672 1673 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1674 cfq_del_cfqq_rr(cfqd, cfqq); 1675 1676 cfq_resort_rr_list(cfqd, cfqq); 1677 1678 if (cfqq == cfqd->active_queue) 1679 cfqd->active_queue = NULL; 1680 1681 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) 1682 cfqd->grp_service_tree.active = NULL; 1683 1684 if (cfqd->active_cic) { 1685 put_io_context(cfqd->active_cic->ioc); 1686 cfqd->active_cic = NULL; 1687 } 1688} 1689 1690static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 1691{ 1692 struct cfq_queue *cfqq = cfqd->active_queue; 1693 1694 if (cfqq) 1695 __cfq_slice_expired(cfqd, cfqq, timed_out); 1696} 1697 1698/* 1699 * Get next queue for service. Unless we have a queue preemption, 1700 * we'll simply select the first cfqq in the service tree. 1701 */ 1702static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1703{ 1704 struct cfq_rb_root *service_tree = 1705 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 1706 cfqd->serving_type); 1707 1708 if (!cfqd->rq_queued) 1709 return NULL; 1710 1711 /* There is nothing to dispatch */ 1712 if (!service_tree) 1713 return NULL; 1714 if (RB_EMPTY_ROOT(&service_tree->rb)) 1715 return NULL; 1716 return cfq_rb_first(service_tree); 1717} 1718 1719static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 1720{ 1721 struct cfq_group *cfqg; 1722 struct cfq_queue *cfqq; 1723 int i, j; 1724 struct cfq_rb_root *st; 1725 1726 if (!cfqd->rq_queued) 1727 return NULL; 1728 1729 cfqg = cfq_get_next_cfqg(cfqd); 1730 if (!cfqg) 1731 return NULL; 1732 1733 for_each_cfqg_st(cfqg, i, j, st) 1734 if ((cfqq = cfq_rb_first(st)) != NULL) 1735 return cfqq; 1736 return NULL; 1737} 1738 1739/* 1740 * Get and set a new active queue for service. 1741 */ 1742static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1743 struct cfq_queue *cfqq) 1744{ 1745 if (!cfqq) 1746 cfqq = cfq_get_next_queue(cfqd); 1747 1748 __cfq_set_active_queue(cfqd, cfqq); 1749 return cfqq; 1750} 1751 1752static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 1753 struct request *rq) 1754{ 1755 if (blk_rq_pos(rq) >= cfqd->last_position) 1756 return blk_rq_pos(rq) - cfqd->last_position; 1757 else 1758 return cfqd->last_position - blk_rq_pos(rq); 1759} 1760 1761static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1762 struct request *rq) 1763{ 1764 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; 1765} 1766 1767static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 1768 struct cfq_queue *cur_cfqq) 1769{ 1770 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 1771 struct rb_node *parent, *node; 1772 struct cfq_queue *__cfqq; 1773 sector_t sector = cfqd->last_position; 1774 1775 if (RB_EMPTY_ROOT(root)) 1776 return NULL; 1777 1778 /* 1779 * First, if we find a request starting at the end of the last 1780 * request, choose it. 1781 */ 1782 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 1783 if (__cfqq) 1784 return __cfqq; 1785 1786 /* 1787 * If the exact sector wasn't found, the parent of the NULL leaf 1788 * will contain the closest sector. 1789 */ 1790 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1791 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1792 return __cfqq; 1793 1794 if (blk_rq_pos(__cfqq->next_rq) < sector) 1795 node = rb_next(&__cfqq->p_node); 1796 else 1797 node = rb_prev(&__cfqq->p_node); 1798 if (!node) 1799 return NULL; 1800 1801 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1802 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1803 return __cfqq; 1804 1805 return NULL; 1806} 1807 1808/* 1809 * cfqd - obvious 1810 * cur_cfqq - passed in so that we don't decide that the current queue is 1811 * closely cooperating with itself. 1812 * 1813 * So, basically we're assuming that that cur_cfqq has dispatched at least 1814 * one request, and that cfqd->last_position reflects a position on the disk 1815 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 1816 * assumption. 1817 */ 1818static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1819 struct cfq_queue *cur_cfqq) 1820{ 1821 struct cfq_queue *cfqq; 1822 1823 if (cfq_class_idle(cur_cfqq)) 1824 return NULL; 1825 if (!cfq_cfqq_sync(cur_cfqq)) 1826 return NULL; 1827 if (CFQQ_SEEKY(cur_cfqq)) 1828 return NULL; 1829 1830 /* 1831 * Don't search priority tree if it's the only queue in the group. 1832 */ 1833 if (cur_cfqq->cfqg->nr_cfqq == 1) 1834 return NULL; 1835 1836 /* 1837 * We should notice if some of the queues are cooperating, eg 1838 * working closely on the same area of the disk. In that case, 1839 * we can group them together and don't waste time idling. 1840 */ 1841 cfqq = cfqq_close(cfqd, cur_cfqq); 1842 if (!cfqq) 1843 return NULL; 1844 1845 /* If new queue belongs to different cfq_group, don't choose it */ 1846 if (cur_cfqq->cfqg != cfqq->cfqg) 1847 return NULL; 1848 1849 /* 1850 * It only makes sense to merge sync queues. 1851 */ 1852 if (!cfq_cfqq_sync(cfqq)) 1853 return NULL; 1854 if (CFQQ_SEEKY(cfqq)) 1855 return NULL; 1856 1857 /* 1858 * Do not merge queues of different priority classes 1859 */ 1860 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 1861 return NULL; 1862 1863 return cfqq; 1864} 1865 1866/* 1867 * Determine whether we should enforce idle window for this queue. 1868 */ 1869 1870static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1871{ 1872 enum wl_prio_t prio = cfqq_prio(cfqq); 1873 struct cfq_rb_root *service_tree = cfqq->service_tree; 1874 1875 BUG_ON(!service_tree); 1876 BUG_ON(!service_tree->count); 1877 1878 if (!cfqd->cfq_slice_idle) 1879 return false; 1880 1881 /* We never do for idle class queues. */ 1882 if (prio == IDLE_WORKLOAD) 1883 return false; 1884 1885 /* We do for queues that were marked with idle window flag. */ 1886 if (cfq_cfqq_idle_window(cfqq) && 1887 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 1888 return true; 1889 1890 /* 1891 * Otherwise, we do only if they are the last ones 1892 * in their service tree. 1893 */ 1894 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1895 return true; 1896 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1897 service_tree->count); 1898 return false; 1899} 1900 1901static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1902{ 1903 struct cfq_queue *cfqq = cfqd->active_queue; 1904 struct cfq_io_context *cic; 1905 unsigned long sl, group_idle = 0; 1906 1907 /* 1908 * SSD device without seek penalty, disable idling. But only do so 1909 * for devices that support queuing, otherwise we still have a problem 1910 * with sync vs async workloads. 1911 */ 1912 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1913 return; 1914 1915 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1916 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1917 1918 /* 1919 * idle is disabled, either manually or by past process history 1920 */ 1921 if (!cfq_should_idle(cfqd, cfqq)) { 1922 /* no queue idling. Check for group idling */ 1923 if (cfqd->cfq_group_idle) 1924 group_idle = cfqd->cfq_group_idle; 1925 else 1926 return; 1927 } 1928 1929 /* 1930 * still active requests from this queue, don't idle 1931 */ 1932 if (cfqq->dispatched) 1933 return; 1934 1935 /* 1936 * task has exited, don't wait 1937 */ 1938 cic = cfqd->active_cic; 1939 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1940 return; 1941 1942 /* 1943 * If our average think time is larger than the remaining time 1944 * slice, then don't idle. This avoids overrunning the allotted 1945 * time slice. 1946 */ 1947 if (sample_valid(cic->ttime_samples) && 1948 (cfqq->slice_end - jiffies < cic->ttime_mean)) { 1949 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", 1950 cic->ttime_mean); 1951 return; 1952 } 1953 1954 /* There are other queues in the group, don't do group idle */ 1955 if (group_idle && cfqq->cfqg->nr_cfqq > 1) 1956 return; 1957 1958 cfq_mark_cfqq_wait_request(cfqq); 1959 1960 if (group_idle) 1961 sl = cfqd->cfq_group_idle; 1962 else 1963 sl = cfqd->cfq_slice_idle; 1964 1965 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1966 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); 1967 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 1968 group_idle ? 1 : 0); 1969} 1970 1971/* 1972 * Move request from internal lists to the request queue dispatch list. 1973 */ 1974static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1975{ 1976 struct cfq_data *cfqd = q->elevator->elevator_data; 1977 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1978 1979 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1980 1981 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1982 cfq_remove_request(rq); 1983 cfqq->dispatched++; 1984 (RQ_CFQG(rq))->dispatched++; 1985 elv_dispatch_sort(q, rq); 1986 1987 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1988 cfqq->nr_sectors += blk_rq_sectors(rq); 1989 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 1990 rq_data_dir(rq), rq_is_sync(rq)); 1991} 1992 1993/* 1994 * return expired entry, or NULL to just start from scratch in rbtree 1995 */ 1996static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1997{ 1998 struct request *rq = NULL; 1999 2000 if (cfq_cfqq_fifo_expire(cfqq)) 2001 return NULL; 2002 2003 cfq_mark_cfqq_fifo_expire(cfqq); 2004 2005 if (list_empty(&cfqq->fifo)) 2006 return NULL; 2007 2008 rq = rq_entry_fifo(cfqq->fifo.next); 2009 if (time_before(jiffies, rq_fifo_time(rq))) 2010 rq = NULL; 2011 2012 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 2013 return rq; 2014} 2015 2016static inline int 2017cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2018{ 2019 const int base_rq = cfqd->cfq_slice_async_rq; 2020 2021 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2022 2023 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2024} 2025 2026/* 2027 * Must be called with the queue_lock held. 2028 */ 2029static int cfqq_process_refs(struct cfq_queue *cfqq) 2030{ 2031 int process_refs, io_refs; 2032 2033 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2034 process_refs = atomic_read(&cfqq->ref) - io_refs; 2035 BUG_ON(process_refs < 0); 2036 return process_refs; 2037} 2038 2039static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 2040{ 2041 int process_refs, new_process_refs; 2042 struct cfq_queue *__cfqq; 2043 2044 /* 2045 * If there are no process references on the new_cfqq, then it is 2046 * unsafe to follow the ->new_cfqq chain as other cfqq's in the 2047 * chain may have dropped their last reference (not just their 2048 * last process reference). 2049 */ 2050 if (!cfqq_process_refs(new_cfqq)) 2051 return; 2052 2053 /* Avoid a circular list and skip interim queue merges */ 2054 while ((__cfqq = new_cfqq->new_cfqq)) { 2055 if (__cfqq == cfqq) 2056 return; 2057 new_cfqq = __cfqq; 2058 } 2059 2060 process_refs = cfqq_process_refs(cfqq); 2061 new_process_refs = cfqq_process_refs(new_cfqq); 2062 /* 2063 * If the process for the cfqq has gone away, there is no 2064 * sense in merging the queues. 2065 */ 2066 if (process_refs == 0 || new_process_refs == 0) 2067 return; 2068 2069 /* 2070 * Merge in the direction of the lesser amount of work. 2071 */ 2072 if (new_process_refs >= process_refs) { 2073 cfqq->new_cfqq = new_cfqq; 2074 atomic_add(process_refs, &new_cfqq->ref); 2075 } else { 2076 new_cfqq->new_cfqq = cfqq; 2077 atomic_add(new_process_refs, &cfqq->ref); 2078 } 2079} 2080 2081static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 2082 struct cfq_group *cfqg, enum wl_prio_t prio) 2083{ 2084 struct cfq_queue *queue; 2085 int i; 2086 bool key_valid = false; 2087 unsigned long lowest_key = 0; 2088 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 2089 2090 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 2091 /* select the one with lowest rb_key */ 2092 queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); 2093 if (queue && 2094 (!key_valid || time_before(queue->rb_key, lowest_key))) { 2095 lowest_key = queue->rb_key; 2096 cur_best = i; 2097 key_valid = true; 2098 } 2099 } 2100 2101 return cur_best; 2102} 2103 2104static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2105{ 2106 unsigned slice; 2107 unsigned count; 2108 struct cfq_rb_root *st; 2109 unsigned group_slice; 2110 2111 if (!cfqg) { 2112 cfqd->serving_prio = IDLE_WORKLOAD; 2113 cfqd->workload_expires = jiffies + 1; 2114 return; 2115 } 2116 2117 /* Choose next priority. RT > BE > IDLE */ 2118 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2119 cfqd->serving_prio = RT_WORKLOAD; 2120 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2121 cfqd->serving_prio = BE_WORKLOAD; 2122 else { 2123 cfqd->serving_prio = IDLE_WORKLOAD; 2124 cfqd->workload_expires = jiffies + 1; 2125 return; 2126 } 2127 2128 /* 2129 * For RT and BE, we have to choose also the type 2130 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2131 * expiration time 2132 */ 2133 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2134 count = st->count; 2135 2136 /* 2137 * check workload expiration, and that we still have other queues ready 2138 */ 2139 if (count && !time_after(jiffies, cfqd->workload_expires)) 2140 return; 2141 2142 /* otherwise select new workload type */ 2143 cfqd->serving_type = 2144 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2145 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2146 count = st->count; 2147 2148 /* 2149 * the workload slice is computed as a fraction of target latency 2150 * proportional to the number of queues in that workload, over 2151 * all the queues in the same priority class 2152 */ 2153 group_slice = cfq_group_slice(cfqd, cfqg); 2154 2155 slice = group_slice * count / 2156 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2157 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2158 2159 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2160 unsigned int tmp; 2161 2162 /* 2163 * Async queues are currently system wide. Just taking 2164 * proportion of queues with-in same group will lead to higher 2165 * async ratio system wide as generally root group is going 2166 * to have higher weight. A more accurate thing would be to 2167 * calculate system wide asnc/sync ratio. 2168 */ 2169 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 2170 tmp = tmp/cfqd->busy_queues; 2171 slice = min_t(unsigned, slice, tmp); 2172 2173 /* async workload slice is scaled down according to 2174 * the sync/async slice ratio. */ 2175 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 2176 } else 2177 /* sync workload slice is at least 2 * cfq_slice_idle */ 2178 slice = max(slice, 2 * cfqd->cfq_slice_idle); 2179 2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2181 cfq_log(cfqd, "workload slice:%d", slice); 2182 cfqd->workload_expires = jiffies + slice; 2183 cfqd->noidle_tree_requires_idle = false; 2184} 2185 2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2187{ 2188 struct cfq_rb_root *st = &cfqd->grp_service_tree; 2189 struct cfq_group *cfqg; 2190 2191 if (RB_EMPTY_ROOT(&st->rb)) 2192 return NULL; 2193 cfqg = cfq_rb_first_group(st); 2194 st->active = &cfqg->rb_node; 2195 update_min_vdisktime(st); 2196 return cfqg; 2197} 2198 2199static void cfq_choose_cfqg(struct cfq_data *cfqd) 2200{ 2201 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 2202 2203 cfqd->serving_group = cfqg; 2204 2205 /* Restore the workload type data */ 2206 if (cfqg->saved_workload_slice) { 2207 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 2208 cfqd->serving_type = cfqg->saved_workload; 2209 cfqd->serving_prio = cfqg->saved_serving_prio; 2210 } else 2211 cfqd->workload_expires = jiffies - 1; 2212 2213 choose_service_tree(cfqd, cfqg); 2214} 2215 2216/* 2217 * Select a queue for service. If we have a current active queue, 2218 * check whether to continue servicing it, or retrieve and set a new one. 2219 */ 2220static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 2221{ 2222 struct cfq_queue *cfqq, *new_cfqq = NULL; 2223 2224 cfqq = cfqd->active_queue; 2225 if (!cfqq) 2226 goto new_queue; 2227 2228 if (!cfqd->rq_queued) 2229 return NULL; 2230 2231 /* 2232 * We were waiting for group to get backlogged. Expire the queue 2233 */ 2234 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) 2235 goto expire; 2236 2237 /* 2238 * The active queue has run out of time, expire it and select new. 2239 */ 2240 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { 2241 /* 2242 * If slice had not expired at the completion of last request 2243 * we might not have turned on wait_busy flag. Don't expire 2244 * the queue yet. Allow the group to get backlogged. 2245 * 2246 * The very fact that we have used the slice, that means we 2247 * have been idling all along on this queue and it should be 2248 * ok to wait for this request to complete. 2249 */ 2250 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) 2251 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2252 cfqq = NULL; 2253 goto keep_queue; 2254 } else 2255 goto check_group_idle; 2256 } 2257 2258 /* 2259 * The active queue has requests and isn't expired, allow it to 2260 * dispatch. 2261 */ 2262 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 2263 goto keep_queue; 2264 2265 /* 2266 * If another queue has a request waiting within our mean seek 2267 * distance, let it run. The expire code will check for close 2268 * cooperators and put the close queue at the front of the service 2269 * tree. If possible, merge the expiring queue with the new cfqq. 2270 */ 2271 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 2272 if (new_cfqq) { 2273 if (!cfqq->new_cfqq) 2274 cfq_setup_merge(cfqq, new_cfqq); 2275 goto expire; 2276 } 2277 2278 /* 2279 * No requests pending. If the active queue still has requests in 2280 * flight or is idling for a new request, allow either of these 2281 * conditions to happen (or time out) before selecting a new queue. 2282 */ 2283 if (timer_pending(&cfqd->idle_slice_timer)) { 2284 cfqq = NULL; 2285 goto keep_queue; 2286 } 2287 2288 /* 2289 * This is a deep seek queue, but the device is much faster than 2290 * the queue can deliver, don't idle 2291 **/ 2292 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && 2293 (cfq_cfqq_slice_new(cfqq) || 2294 (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { 2295 cfq_clear_cfqq_deep(cfqq); 2296 cfq_clear_cfqq_idle_window(cfqq); 2297 } 2298 2299 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2300 cfqq = NULL; 2301 goto keep_queue; 2302 } 2303 2304 /* 2305 * If group idle is enabled and there are requests dispatched from 2306 * this group, wait for requests to complete. 2307 */ 2308check_group_idle: 2309 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 2310 && cfqq->cfqg->dispatched) { 2311 cfqq = NULL; 2312 goto keep_queue; 2313 } 2314 2315expire: 2316 cfq_slice_expired(cfqd, 0); 2317new_queue: 2318 /* 2319 * Current queue expired. Check if we have to switch to a new 2320 * service tree 2321 */ 2322 if (!new_cfqq) 2323 cfq_choose_cfqg(cfqd); 2324 2325 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2326keep_queue: 2327 return cfqq; 2328} 2329 2330static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 2331{ 2332 int dispatched = 0; 2333 2334 while (cfqq->next_rq) { 2335 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 2336 dispatched++; 2337 } 2338 2339 BUG_ON(!list_empty(&cfqq->fifo)); 2340 2341 /* By default cfqq is not expired if it is empty. Do it explicitly */ 2342 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2343 return dispatched; 2344} 2345 2346/* 2347 * Drain our current requests. Used for barriers and when switching 2348 * io schedulers on-the-fly. 2349 */ 2350static int cfq_forced_dispatch(struct cfq_data *cfqd) 2351{ 2352 struct cfq_queue *cfqq; 2353 int dispatched = 0; 2354 2355 /* Expire the timeslice of the current active queue first */ 2356 cfq_slice_expired(cfqd, 0); 2357 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { 2358 __cfq_set_active_queue(cfqd, cfqq); 2359 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2360 } 2361 2362 BUG_ON(cfqd->busy_queues); 2363 2364 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2365 return dispatched; 2366} 2367 2368static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, 2369 struct cfq_queue *cfqq) 2370{ 2371 /* the queue hasn't finished any request, can't estimate */ 2372 if (cfq_cfqq_slice_new(cfqq)) 2373 return true; 2374 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2375 cfqq->slice_end)) 2376 return true; 2377 2378 return false; 2379} 2380 2381static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2382{ 2383 unsigned int max_dispatch; 2384 2385 /* 2386 * Drain async requests before we start sync IO 2387 */ 2388 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) 2389 return false; 2390 2391 /* 2392 * If this is an async queue and we have sync IO in flight, let it wait 2393 */ 2394 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) 2395 return false; 2396 2397 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); 2398 if (cfq_class_idle(cfqq)) 2399 max_dispatch = 1; 2400 2401 /* 2402 * Does this cfqq already have too much IO in flight? 2403 */ 2404 if (cfqq->dispatched >= max_dispatch) { 2405 /* 2406 * idle queue must always only have a single IO in flight 2407 */ 2408 if (cfq_class_idle(cfqq)) 2409 return false; 2410 2411 /* 2412 * We have other queues, don't allow more IO from this one 2413 */ 2414 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2415 return false; 2416 2417 /* 2418 * Sole queue user, no limit 2419 */ 2420 if (cfqd->busy_queues == 1) 2421 max_dispatch = -1; 2422 else 2423 /* 2424 * Normally we start throttling cfqq when cfq_quantum/2 2425 * requests have been dispatched. But we can drive 2426 * deeper queue depths at the beginning of slice 2427 * subjected to upper limit of cfq_quantum. 2428 * */ 2429 max_dispatch = cfqd->cfq_quantum; 2430 } 2431 2432 /* 2433 * Async queues must wait a bit before being allowed dispatch. 2434 * We also ramp up the dispatch depth gradually for async IO, 2435 * based on the last sync IO we serviced 2436 */ 2437 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2438 unsigned long last_sync = jiffies - cfqd->last_delayed_sync; 2439 unsigned int depth; 2440 2441 depth = last_sync / cfqd->cfq_slice[1]; 2442 if (!depth && !cfqq->dispatched) 2443 depth = 1; 2444 if (depth < max_dispatch) 2445 max_dispatch = depth; 2446 } 2447 2448 /* 2449 * If we're below the current max, allow a dispatch 2450 */ 2451 return cfqq->dispatched < max_dispatch; 2452} 2453 2454/* 2455 * Dispatch a request from cfqq, moving them to the request queue 2456 * dispatch list. 2457 */ 2458static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2459{ 2460 struct request *rq; 2461 2462 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 2463 2464 if (!cfq_may_dispatch(cfqd, cfqq)) 2465 return false; 2466 2467 /* 2468 * follow expired path, else get first next available 2469 */ 2470 rq = cfq_check_fifo(cfqq); 2471 if (!rq) 2472 rq = cfqq->next_rq; 2473 2474 /* 2475 * insert request into driver dispatch list 2476 */ 2477 cfq_dispatch_insert(cfqd->queue, rq); 2478 2479 if (!cfqd->active_cic) { 2480 struct cfq_io_context *cic = RQ_CIC(rq); 2481 2482 atomic_long_inc(&cic->ioc->refcount); 2483 cfqd->active_cic = cic; 2484 } 2485 2486 return true; 2487} 2488 2489/* 2490 * Find the cfqq that we need to service and move a request from that to the 2491 * dispatch list 2492 */ 2493static int cfq_dispatch_requests(struct request_queue *q, int force) 2494{ 2495 struct cfq_data *cfqd = q->elevator->elevator_data; 2496 struct cfq_queue *cfqq; 2497 2498 if (!cfqd->busy_queues) 2499 return 0; 2500 2501 if (unlikely(force)) 2502 return cfq_forced_dispatch(cfqd); 2503 2504 cfqq = cfq_select_queue(cfqd); 2505 if (!cfqq) 2506 return 0; 2507 2508 /* 2509 * Dispatch a request from this cfqq, if it is allowed 2510 */ 2511 if (!cfq_dispatch_request(cfqd, cfqq)) 2512 return 0; 2513 2514 cfqq->slice_dispatch++; 2515 cfq_clear_cfqq_must_dispatch(cfqq); 2516 2517 /* 2518 * expire an async queue immediately if it has used up its slice. idle 2519 * queue always expire after 1 dispatch round. 2520 */ 2521 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 2522 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 2523 cfq_class_idle(cfqq))) { 2524 cfqq->slice_end = jiffies + 1; 2525 cfq_slice_expired(cfqd, 0); 2526 } 2527 2528 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 2529 return 1; 2530} 2531 2532/* 2533 * task holds one reference to the queue, dropped when task exits. each rq 2534 * in-flight on this queue also holds a reference, dropped when rq is freed. 2535 * 2536 * Each cfq queue took a reference on the parent group. Drop it now. 2537 * queue lock must be held here. 2538 */ 2539static void cfq_put_queue(struct cfq_queue *cfqq) 2540{ 2541 struct cfq_data *cfqd = cfqq->cfqd; 2542 struct cfq_group *cfqg, *orig_cfqg; 2543 2544 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2545 2546 if (!atomic_dec_and_test(&cfqq->ref)) 2547 return; 2548 2549 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2550 BUG_ON(rb_first(&cfqq->sort_list)); 2551 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2552 cfqg = cfqq->cfqg; 2553 orig_cfqg = cfqq->orig_cfqg; 2554 2555 if (unlikely(cfqd->active_queue == cfqq)) { 2556 __cfq_slice_expired(cfqd, cfqq, 0); 2557 cfq_schedule_dispatch(cfqd); 2558 } 2559 2560 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2561 kmem_cache_free(cfq_pool, cfqq); 2562 cfq_put_cfqg(cfqg); 2563 if (orig_cfqg) 2564 cfq_put_cfqg(orig_cfqg); 2565} 2566 2567/* 2568 * Must always be called with the rcu_read_lock() held 2569 */ 2570static void 2571__call_for_each_cic(struct io_context *ioc, 2572 void (*func)(struct io_context *, struct cfq_io_context *)) 2573{ 2574 struct cfq_io_context *cic; 2575 struct hlist_node *n; 2576 2577 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2578 func(ioc, cic); 2579} 2580 2581/* 2582 * Call func for each cic attached to this ioc. 2583 */ 2584static void 2585call_for_each_cic(struct io_context *ioc, 2586 void (*func)(struct io_context *, struct cfq_io_context *)) 2587{ 2588 rcu_read_lock(); 2589 __call_for_each_cic(ioc, func); 2590 rcu_read_unlock(); 2591} 2592 2593static void cfq_cic_free_rcu(struct rcu_head *head) 2594{ 2595 struct cfq_io_context *cic; 2596 2597 cic = container_of(head, struct cfq_io_context, rcu_head); 2598 2599 kmem_cache_free(cfq_ioc_pool, cic); 2600 elv_ioc_count_dec(cfq_ioc_count); 2601 2602 if (ioc_gone) { 2603 /* 2604 * CFQ scheduler is exiting, grab exit lock and check 2605 * the pending io context count. If it hits zero, 2606 * complete ioc_gone and set it back to NULL 2607 */ 2608 spin_lock(&ioc_gone_lock); 2609 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2610 complete(ioc_gone); 2611 ioc_gone = NULL; 2612 } 2613 spin_unlock(&ioc_gone_lock); 2614 } 2615} 2616 2617static void cfq_cic_free(struct cfq_io_context *cic) 2618{ 2619 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2620} 2621 2622static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2623{ 2624 unsigned long flags; 2625 unsigned long dead_key = (unsigned long) cic->key; 2626 2627 BUG_ON(!(dead_key & CIC_DEAD_KEY)); 2628 2629 spin_lock_irqsave(&ioc->lock, flags); 2630 radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); 2631 hlist_del_rcu(&cic->cic_list); 2632 spin_unlock_irqrestore(&ioc->lock, flags); 2633 2634 cfq_cic_free(cic); 2635} 2636 2637/* 2638 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 2639 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2640 * and ->trim() which is called with the task lock held 2641 */ 2642static void cfq_free_io_context(struct io_context *ioc) 2643{ 2644 /* 2645 * ioc->refcount is zero here, or we are called from elv_unregister(), 2646 * so no more cic's are allowed to be linked into this ioc. So it 2647 * should be ok to iterate over the known list, we will see all cic's 2648 * since no new ones are added. 2649 */ 2650 __call_for_each_cic(ioc, cic_free_func); 2651} 2652 2653static void cfq_put_cooperator(struct cfq_queue *cfqq) 2654{ 2655 struct cfq_queue *__cfqq, *next; 2656 2657 /* 2658 * If this queue was scheduled to merge with another queue, be 2659 * sure to drop the reference taken on that queue (and others in 2660 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 2661 */ 2662 __cfqq = cfqq->new_cfqq; 2663 while (__cfqq) { 2664 if (__cfqq == cfqq) { 2665 WARN(1, "cfqq->new_cfqq loop detected\n"); 2666 break; 2667 } 2668 next = __cfqq->new_cfqq; 2669 cfq_put_queue(__cfqq); 2670 __cfqq = next; 2671 } 2672} 2673 2674static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2675{ 2676 if (unlikely(cfqq == cfqd->active_queue)) { 2677 __cfq_slice_expired(cfqd, cfqq, 0); 2678 cfq_schedule_dispatch(cfqd); 2679 } 2680 2681 cfq_put_cooperator(cfqq); 2682 2683 cfq_put_queue(cfqq); 2684} 2685 2686static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2687 struct cfq_io_context *cic) 2688{ 2689 struct io_context *ioc = cic->ioc; 2690 2691 list_del_init(&cic->queue_list); 2692 2693 /* 2694 * Make sure dead mark is seen for dead queues 2695 */ 2696 smp_wmb(); 2697 cic->key = cfqd_dead_key(cfqd); 2698 2699 if (ioc->ioc_data == cic) 2700 rcu_assign_pointer(ioc->ioc_data, NULL); 2701 2702 if (cic->cfqq[BLK_RW_ASYNC]) { 2703 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2704 cic->cfqq[BLK_RW_ASYNC] = NULL; 2705 } 2706 2707 if (cic->cfqq[BLK_RW_SYNC]) { 2708 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2709 cic->cfqq[BLK_RW_SYNC] = NULL; 2710 } 2711} 2712 2713static void cfq_exit_single_io_context(struct io_context *ioc, 2714 struct cfq_io_context *cic) 2715{ 2716 struct cfq_data *cfqd = cic_to_cfqd(cic); 2717 2718 if (cfqd) { 2719 struct request_queue *q = cfqd->queue; 2720 unsigned long flags; 2721 2722 spin_lock_irqsave(q->queue_lock, flags); 2723 2724 /* 2725 * Ensure we get a fresh copy of the ->key to prevent 2726 * race between exiting task and queue 2727 */ 2728 smp_read_barrier_depends(); 2729 if (cic->key == cfqd) 2730 __cfq_exit_single_io_context(cfqd, cic); 2731 2732 spin_unlock_irqrestore(q->queue_lock, flags); 2733 } 2734} 2735 2736/* 2737 * The process that ioc belongs to has exited, we need to clean up 2738 * and put the internal structures we have that belongs to that process. 2739 */ 2740static void cfq_exit_io_context(struct io_context *ioc) 2741{ 2742 call_for_each_cic(ioc, cfq_exit_single_io_context); 2743} 2744 2745static struct cfq_io_context * 2746cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2747{ 2748 struct cfq_io_context *cic; 2749 2750 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2751 cfqd->queue->node); 2752 if (cic) { 2753 cic->last_end_request = jiffies; 2754 INIT_LIST_HEAD(&cic->queue_list); 2755 INIT_HLIST_NODE(&cic->cic_list); 2756 cic->dtor = cfq_free_io_context; 2757 cic->exit = cfq_exit_io_context; 2758 elv_ioc_count_inc(cfq_ioc_count); 2759 } 2760 2761 return cic; 2762} 2763 2764static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2765{ 2766 struct task_struct *tsk = current; 2767 int ioprio_class; 2768 2769 if (!cfq_cfqq_prio_changed(cfqq)) 2770 return; 2771 2772 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 2773 switch (ioprio_class) { 2774 default: 2775 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2776 case IOPRIO_CLASS_NONE: 2777 /* 2778 * no prio set, inherit CPU scheduling settings 2779 */ 2780 cfqq->ioprio = task_nice_ioprio(tsk); 2781 cfqq->ioprio_class = task_nice_ioclass(tsk); 2782 break; 2783 case IOPRIO_CLASS_RT: 2784 cfqq->ioprio = task_ioprio(ioc); 2785 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2786 break; 2787 case IOPRIO_CLASS_BE: 2788 cfqq->ioprio = task_ioprio(ioc); 2789 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2790 break; 2791 case IOPRIO_CLASS_IDLE: 2792 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 2793 cfqq->ioprio = 7; 2794 cfq_clear_cfqq_idle_window(cfqq); 2795 break; 2796 } 2797 2798 /* 2799 * keep track of original prio settings in case we have to temporarily 2800 * elevate the priority of this queue 2801 */ 2802 cfqq->org_ioprio = cfqq->ioprio; 2803 cfqq->org_ioprio_class = cfqq->ioprio_class; 2804 cfq_clear_cfqq_prio_changed(cfqq); 2805} 2806 2807static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2808{ 2809 struct cfq_data *cfqd = cic_to_cfqd(cic); 2810 struct cfq_queue *cfqq; 2811 unsigned long flags; 2812 2813 if (unlikely(!cfqd)) 2814 return; 2815 2816 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2817 2818 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2819 if (cfqq) { 2820 struct cfq_queue *new_cfqq; 2821 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2822 GFP_ATOMIC); 2823 if (new_cfqq) { 2824 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2825 cfq_put_queue(cfqq); 2826 } 2827 } 2828 2829 cfqq = cic->cfqq[BLK_RW_SYNC]; 2830 if (cfqq) 2831 cfq_mark_cfqq_prio_changed(cfqq); 2832 2833 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2834} 2835 2836static void cfq_ioc_set_ioprio(struct io_context *ioc) 2837{ 2838 call_for_each_cic(ioc, changed_ioprio); 2839 ioc->ioprio_changed = 0; 2840} 2841 2842static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2843 pid_t pid, bool is_sync) 2844{ 2845 RB_CLEAR_NODE(&cfqq->rb_node); 2846 RB_CLEAR_NODE(&cfqq->p_node); 2847 INIT_LIST_HEAD(&cfqq->fifo); 2848 2849 atomic_set(&cfqq->ref, 0); 2850 cfqq->cfqd = cfqd; 2851 2852 cfq_mark_cfqq_prio_changed(cfqq); 2853 2854 if (is_sync) { 2855 if (!cfq_class_idle(cfqq)) 2856 cfq_mark_cfqq_idle_window(cfqq); 2857 cfq_mark_cfqq_sync(cfqq); 2858 } 2859 cfqq->pid = pid; 2860} 2861 2862#ifdef CONFIG_CFQ_GROUP_IOSCHED 2863static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2864{ 2865 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2866 struct cfq_data *cfqd = cic_to_cfqd(cic); 2867 unsigned long flags; 2868 struct request_queue *q; 2869 2870 if (unlikely(!cfqd)) 2871 return; 2872 2873 q = cfqd->queue; 2874 2875 spin_lock_irqsave(q->queue_lock, flags); 2876 2877 if (sync_cfqq) { 2878 /* 2879 * Drop reference to sync queue. A new sync queue will be 2880 * assigned in new group upon arrival of a fresh request. 2881 */ 2882 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 2883 cic_set_cfqq(cic, NULL, 1); 2884 cfq_put_queue(sync_cfqq); 2885 } 2886 2887 spin_unlock_irqrestore(q->queue_lock, flags); 2888} 2889 2890static void cfq_ioc_set_cgroup(struct io_context *ioc) 2891{ 2892 call_for_each_cic(ioc, changed_cgroup); 2893 ioc->cgroup_changed = 0; 2894} 2895#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2896 2897static struct cfq_queue * 2898cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2899 struct io_context *ioc, gfp_t gfp_mask) 2900{ 2901 struct cfq_queue *cfqq, *new_cfqq = NULL; 2902 struct cfq_io_context *cic; 2903 struct cfq_group *cfqg; 2904 2905retry: 2906 cfqg = cfq_get_cfqg(cfqd, 1); 2907 cic = cfq_cic_lookup(cfqd, ioc); 2908 /* cic always exists here */ 2909 cfqq = cic_to_cfqq(cic, is_sync); 2910 2911 /* 2912 * Always try a new alloc if we fell back to the OOM cfqq 2913 * originally, since it should just be a temporary situation. 2914 */ 2915 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2916 cfqq = NULL; 2917 if (new_cfqq) { 2918 cfqq = new_cfqq; 2919 new_cfqq = NULL; 2920 } else if (gfp_mask & __GFP_WAIT) { 2921 spin_unlock_irq(cfqd->queue->queue_lock); 2922 new_cfqq = kmem_cache_alloc_node(cfq_pool, 2923 gfp_mask | __GFP_ZERO, 2924 cfqd->queue->node); 2925 spin_lock_irq(cfqd->queue->queue_lock); 2926 if (new_cfqq) 2927 goto retry; 2928 } else { 2929 cfqq = kmem_cache_alloc_node(cfq_pool, 2930 gfp_mask | __GFP_ZERO, 2931 cfqd->queue->node); 2932 } 2933 2934 if (cfqq) { 2935 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2936 cfq_init_prio_data(cfqq, ioc); 2937 cfq_link_cfqq_cfqg(cfqq, cfqg); 2938 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2939 } else 2940 cfqq = &cfqd->oom_cfqq; 2941 } 2942 2943 if (new_cfqq) 2944 kmem_cache_free(cfq_pool, new_cfqq); 2945 2946 return cfqq; 2947} 2948 2949static struct cfq_queue ** 2950cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 2951{ 2952 switch (ioprio_class) { 2953 case IOPRIO_CLASS_RT: 2954 return &cfqd->async_cfqq[0][ioprio]; 2955 case IOPRIO_CLASS_BE: 2956 return &cfqd->async_cfqq[1][ioprio]; 2957 case IOPRIO_CLASS_IDLE: 2958 return &cfqd->async_idle_cfqq; 2959 default: 2960 BUG(); 2961 } 2962} 2963 2964static struct cfq_queue * 2965cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 2966 gfp_t gfp_mask) 2967{ 2968 const int ioprio = task_ioprio(ioc); 2969 const int ioprio_class = task_ioprio_class(ioc); 2970 struct cfq_queue **async_cfqq = NULL; 2971 struct cfq_queue *cfqq = NULL; 2972 2973 if (!is_sync) { 2974 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 2975 cfqq = *async_cfqq; 2976 } 2977 2978 if (!cfqq) 2979 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 2980 2981 /* 2982 * pin the queue now that it's allocated, scheduler exit will prune it 2983 */ 2984 if (!is_sync && !(*async_cfqq)) { 2985 atomic_inc(&cfqq->ref); 2986 *async_cfqq = cfqq; 2987 } 2988 2989 atomic_inc(&cfqq->ref); 2990 return cfqq; 2991} 2992 2993/* 2994 * We drop cfq io contexts lazily, so we may find a dead one. 2995 */ 2996static void 2997cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2998 struct cfq_io_context *cic) 2999{ 3000 unsigned long flags; 3001 3002 WARN_ON(!list_empty(&cic->queue_list)); 3003 BUG_ON(cic->key != cfqd_dead_key(cfqd)); 3004 3005 spin_lock_irqsave(&ioc->lock, flags); 3006 3007 BUG_ON(ioc->ioc_data == cic); 3008 3009 radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 3010 hlist_del_rcu(&cic->cic_list); 3011 spin_unlock_irqrestore(&ioc->lock, flags); 3012 3013 cfq_cic_free(cic); 3014} 3015 3016static struct cfq_io_context * 3017cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 3018{ 3019 struct cfq_io_context *cic; 3020 unsigned long flags; 3021 3022 if (unlikely(!ioc)) 3023 return NULL; 3024 3025 rcu_read_lock(); 3026 3027 /* 3028 * we maintain a last-hit cache, to avoid browsing over the tree 3029 */ 3030 cic = rcu_dereference(ioc->ioc_data); 3031 if (cic && cic->key == cfqd) { 3032 rcu_read_unlock(); 3033 return cic; 3034 } 3035 3036 do { 3037 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); 3038 rcu_read_unlock(); 3039 if (!cic) 3040 break; 3041 if (unlikely(cic->key != cfqd)) { 3042 cfq_drop_dead_cic(cfqd, ioc, cic); 3043 rcu_read_lock(); 3044 continue; 3045 } 3046 3047 spin_lock_irqsave(&ioc->lock, flags); 3048 rcu_assign_pointer(ioc->ioc_data, cic); 3049 spin_unlock_irqrestore(&ioc->lock, flags); 3050 break; 3051 } while (1); 3052 3053 return cic; 3054} 3055 3056/* 3057 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 3058 * the process specific cfq io context when entered from the block layer. 3059 * Also adds the cic to a per-cfqd list, used when this queue is removed. 3060 */ 3061static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 3062 struct cfq_io_context *cic, gfp_t gfp_mask) 3063{ 3064 unsigned long flags; 3065 int ret; 3066 3067 ret = radix_tree_preload(gfp_mask); 3068 if (!ret) { 3069 cic->ioc = ioc; 3070 cic->key = cfqd; 3071 3072 spin_lock_irqsave(&ioc->lock, flags); 3073 ret = radix_tree_insert(&ioc->radix_root, 3074 cfqd->cic_index, cic); 3075 if (!ret) 3076 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 3077 spin_unlock_irqrestore(&ioc->lock, flags); 3078 3079 radix_tree_preload_end(); 3080 3081 if (!ret) { 3082 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3083 list_add(&cic->queue_list, &cfqd->cic_list); 3084 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3085 } 3086 } 3087 3088 if (ret) 3089 printk(KERN_ERR "cfq: cic link failed!\n"); 3090 3091 return ret; 3092} 3093 3094/* 3095 * Setup general io context and cfq io context. There can be several cfq 3096 * io contexts per general io context, if this process is doing io to more 3097 * than one device managed by cfq. 3098 */ 3099static struct cfq_io_context * 3100cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 3101{ 3102 struct io_context *ioc = NULL; 3103 struct cfq_io_context *cic; 3104 3105 might_sleep_if(gfp_mask & __GFP_WAIT); 3106 3107 ioc = get_io_context(gfp_mask, cfqd->queue->node); 3108 if (!ioc) 3109 return NULL; 3110 3111 cic = cfq_cic_lookup(cfqd, ioc); 3112 if (cic) 3113 goto out; 3114 3115 cic = cfq_alloc_io_context(cfqd, gfp_mask); 3116 if (cic == NULL) 3117 goto err; 3118 3119 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 3120 goto err_free; 3121 3122out: 3123 smp_read_barrier_depends(); 3124 if (unlikely(ioc->ioprio_changed)) 3125 cfq_ioc_set_ioprio(ioc); 3126 3127#ifdef CONFIG_CFQ_GROUP_IOSCHED 3128 if (unlikely(ioc->cgroup_changed)) 3129 cfq_ioc_set_cgroup(ioc); 3130#endif 3131 return cic; 3132err_free: 3133 cfq_cic_free(cic); 3134err: 3135 put_io_context(ioc); 3136 return NULL; 3137} 3138 3139static void 3140cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3141{ 3142 unsigned long elapsed = jiffies - cic->last_end_request; 3143 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3144 3145 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3146 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3147 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3148} 3149 3150static void 3151cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3152 struct request *rq) 3153{ 3154 sector_t sdist = 0; 3155 sector_t n_sec = blk_rq_sectors(rq); 3156 if (cfqq->last_request_pos) { 3157 if (cfqq->last_request_pos < blk_rq_pos(rq)) 3158 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 3159 else 3160 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 3161 } 3162 3163 cfqq->seek_history <<= 1; 3164 if (blk_queue_nonrot(cfqd->queue)) 3165 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); 3166 else 3167 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); 3168} 3169 3170/* 3171 * Disable idle window if the process thinks too long or seeks so much that 3172 * it doesn't matter 3173 */ 3174static void 3175cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3176 struct cfq_io_context *cic) 3177{ 3178 int old_idle, enable_idle; 3179 3180 /* 3181 * Don't idle for async or idle io prio class 3182 */ 3183 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3184 return; 3185 3186 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3187 3188 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3189 cfq_mark_cfqq_deep(cfqq); 3190 3191 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3192 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3193 enable_idle = 0; 3194 else if (sample_valid(cic->ttime_samples)) { 3195 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3196 enable_idle = 0; 3197 else 3198 enable_idle = 1; 3199 } 3200 3201 if (old_idle != enable_idle) { 3202 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3203 if (enable_idle) 3204 cfq_mark_cfqq_idle_window(cfqq); 3205 else 3206 cfq_clear_cfqq_idle_window(cfqq); 3207 } 3208} 3209 3210/* 3211 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3212 * no or if we aren't sure, a 1 will cause a preempt. 3213 */ 3214static bool 3215cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3216 struct request *rq) 3217{ 3218 struct cfq_queue *cfqq; 3219 3220 cfqq = cfqd->active_queue; 3221 if (!cfqq) 3222 return false; 3223 3224 if (cfq_class_idle(new_cfqq)) 3225 return false; 3226 3227 if (cfq_class_idle(cfqq)) 3228 return true; 3229 3230 /* 3231 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. 3232 */ 3233 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) 3234 return false; 3235 3236 /* 3237 * if the new request is sync, but the currently running queue is 3238 * not, let the sync request have priority. 3239 */ 3240 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3241 return true; 3242 3243 if (new_cfqq->cfqg != cfqq->cfqg) 3244 return false; 3245 3246 if (cfq_slice_used(cfqq)) 3247 return true; 3248 3249 /* Allow preemption only if we are idling on sync-noidle tree */ 3250 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3251 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3252 new_cfqq->service_tree->count == 2 && 3253 RB_EMPTY_ROOT(&cfqq->sort_list)) 3254 return true; 3255 3256 /* 3257 * So both queues are sync. Let the new request get disk time if 3258 * it's a metadata request and the current queue is doing regular IO. 3259 */ 3260 if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) 3261 return true; 3262 3263 /* 3264 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3265 */ 3266 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3267 return true; 3268 3269 /* An idle queue should not be idle now for some reason */ 3270 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) 3271 return true; 3272 3273 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3274 return false; 3275 3276 /* 3277 * if this request is as-good as one we would expect from the 3278 * current cfqq, let it preempt 3279 */ 3280 if (cfq_rq_close(cfqd, cfqq, rq)) 3281 return true; 3282 3283 return false; 3284} 3285 3286/* 3287 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3288 * let it have half of its nominal slice. 3289 */ 3290static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3291{ 3292 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3293 cfq_slice_expired(cfqd, 1); 3294 3295 /* 3296 * Put the new queue at the front of the of the current list, 3297 * so we know that it will be selected next. 3298 */ 3299 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3300 3301 cfq_service_tree_add(cfqd, cfqq, 1); 3302 3303 cfqq->slice_end = 0; 3304 cfq_mark_cfqq_slice_new(cfqq); 3305} 3306 3307/* 3308 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3309 * something we should do about it 3310 */ 3311static void 3312cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3313 struct request *rq) 3314{ 3315 struct cfq_io_context *cic = RQ_CIC(rq); 3316 3317 cfqd->rq_queued++; 3318 if (rq->cmd_flags & REQ_META) 3319 cfqq->meta_pending++; 3320 3321 cfq_update_io_thinktime(cfqd, cic); 3322 cfq_update_io_seektime(cfqd, cfqq, rq); 3323 cfq_update_idle_window(cfqd, cfqq, cic); 3324 3325 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3326 3327 if (cfqq == cfqd->active_queue) { 3328 /* 3329 * Remember that we saw a request from this process, but 3330 * don't start queuing just yet. Otherwise we risk seeing lots 3331 * of tiny requests, because we disrupt the normal plugging 3332 * and merging. If the request is already larger than a single 3333 * page, let it rip immediately. For that case we assume that 3334 * merging is already done. Ditto for a busy system that 3335 * has other work pending, don't risk delaying until the 3336 * idle timer unplug to continue working. 3337 */ 3338 if (cfq_cfqq_wait_request(cfqq)) { 3339 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3340 cfqd->busy_queues > 1) { 3341 cfq_del_timer(cfqd, cfqq); 3342 cfq_clear_cfqq_wait_request(cfqq); 3343 __blk_run_queue(cfqd->queue); 3344 } else { 3345 cfq_blkiocg_update_idle_time_stats( 3346 &cfqq->cfqg->blkg); 3347 cfq_mark_cfqq_must_dispatch(cfqq); 3348 } 3349 } 3350 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3351 /* 3352 * not the active queue - expire current slice if it is 3353 * idle and has expired it's mean thinktime or this new queue 3354 * has some old slice time left and is of higher priority or 3355 * this new queue is RT and the current one is BE 3356 */ 3357 cfq_preempt_queue(cfqd, cfqq); 3358 __blk_run_queue(cfqd->queue); 3359 } 3360} 3361 3362static void cfq_insert_request(struct request_queue *q, struct request *rq) 3363{ 3364 struct cfq_data *cfqd = q->elevator->elevator_data; 3365 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3366 3367 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3368 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3369 3370 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3371 list_add_tail(&rq->queuelist, &cfqq->fifo); 3372 cfq_add_rq_rb(rq); 3373 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 3374 &cfqd->serving_group->blkg, rq_data_dir(rq), 3375 rq_is_sync(rq)); 3376 cfq_rq_enqueued(cfqd, cfqq, rq); 3377} 3378 3379/* 3380 * Update hw_tag based on peak queue depth over 50 samples under 3381 * sufficient load. 3382 */ 3383static void cfq_update_hw_tag(struct cfq_data *cfqd) 3384{ 3385 struct cfq_queue *cfqq = cfqd->active_queue; 3386 3387 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) 3388 cfqd->hw_tag_est_depth = cfqd->rq_in_driver; 3389 3390 if (cfqd->hw_tag == 1) 3391 return; 3392 3393 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3394 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 3395 return; 3396 3397 /* 3398 * If active queue hasn't enough requests and can idle, cfq might not 3399 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 3400 * case 3401 */ 3402 if (cfqq && cfq_cfqq_idle_window(cfqq) && 3403 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 3404 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) 3405 return; 3406 3407 if (cfqd->hw_tag_samples++ < 50) 3408 return; 3409 3410 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3411 cfqd->hw_tag = 1; 3412 else 3413 cfqd->hw_tag = 0; 3414} 3415 3416static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3417{ 3418 struct cfq_io_context *cic = cfqd->active_cic; 3419 3420 /* If there are other queues in the group, don't wait */ 3421 if (cfqq->cfqg->nr_cfqq > 1) 3422 return false; 3423 3424 if (cfq_slice_used(cfqq)) 3425 return true; 3426 3427 /* if slice left is less than think time, wait busy */ 3428 if (cic && sample_valid(cic->ttime_samples) 3429 && (cfqq->slice_end - jiffies < cic->ttime_mean)) 3430 return true; 3431 3432 /* 3433 * If think times is less than a jiffy than ttime_mean=0 and above 3434 * will not be true. It might happen that slice has not expired yet 3435 * but will expire soon (4-5 ns) during select_queue(). To cover the 3436 * case where think time is less than a jiffy, mark the queue wait 3437 * busy if only 1 jiffy is left in the slice. 3438 */ 3439 if (cfqq->slice_end - jiffies == 1) 3440 return true; 3441 3442 return false; 3443} 3444 3445static void cfq_completed_request(struct request_queue *q, struct request *rq) 3446{ 3447 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3448 struct cfq_data *cfqd = cfqq->cfqd; 3449 const int sync = rq_is_sync(rq); 3450 unsigned long now; 3451 3452 now = jiffies; 3453 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", 3454 !!(rq->cmd_flags & REQ_NOIDLE)); 3455 3456 cfq_update_hw_tag(cfqd); 3457 3458 WARN_ON(!cfqd->rq_in_driver); 3459 WARN_ON(!cfqq->dispatched); 3460 cfqd->rq_in_driver--; 3461 cfqq->dispatched--; 3462 (RQ_CFQG(rq))->dispatched--; 3463 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3464 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3465 rq_data_dir(rq), rq_is_sync(rq)); 3466 3467 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3468 3469 if (sync) { 3470 RQ_CIC(rq)->last_end_request = now; 3471 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3472 cfqd->last_delayed_sync = now; 3473 } 3474 3475 /* 3476 * If this is the active queue, check if it needs to be expired, 3477 * or if we want to idle in case it has no pending requests. 3478 */ 3479 if (cfqd->active_queue == cfqq) { 3480 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3481 3482 if (cfq_cfqq_slice_new(cfqq)) { 3483 cfq_set_prio_slice(cfqd, cfqq); 3484 cfq_clear_cfqq_slice_new(cfqq); 3485 } 3486 3487 /* 3488 * Should we wait for next request to come in before we expire 3489 * the queue. 3490 */ 3491 if (cfq_should_wait_busy(cfqd, cfqq)) { 3492 unsigned long extend_sl = cfqd->cfq_slice_idle; 3493 if (!cfqd->cfq_slice_idle) 3494 extend_sl = cfqd->cfq_group_idle; 3495 cfqq->slice_end = jiffies + extend_sl; 3496 cfq_mark_cfqq_wait_busy(cfqq); 3497 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3498 } 3499 3500 /* 3501 * Idling is not enabled on: 3502 * - expired queues 3503 * - idle-priority queues 3504 * - async queues 3505 * - queues with still some requests queued 3506 * - when there is a close cooperator 3507 */ 3508 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3509 cfq_slice_expired(cfqd, 1); 3510 else if (sync && cfqq_empty && 3511 !cfq_close_cooperator(cfqd, cfqq)) { 3512 cfqd->noidle_tree_requires_idle |= 3513 !(rq->cmd_flags & REQ_NOIDLE); 3514 /* 3515 * Idling is enabled for SYNC_WORKLOAD. 3516 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3517 * only if we processed at least one !REQ_NOIDLE request 3518 */ 3519 if (cfqd->serving_type == SYNC_WORKLOAD 3520 || cfqd->noidle_tree_requires_idle 3521 || cfqq->cfqg->nr_cfqq == 1) 3522 cfq_arm_slice_timer(cfqd); 3523 } 3524 } 3525 3526 if (!cfqd->rq_in_driver) { 3527 cfq_schedule_dispatch(cfqd); 3528 return; 3529 } 3530 /* 3531 * A queue is idle at cfq_dispatch_requests(), but it gets noidle 3532 * later. We schedule a dispatch if the queue has no requests, 3533 * otherwise the disk is actually in idle till all requests 3534 * are finished even cfq_arm_slice_timer doesn't make the queue idle 3535 * */ 3536 cfqq = cfqd->active_queue; 3537 if (!cfqq) 3538 return; 3539 3540 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq) && 3541 (!cfqd->cfq_group_idle || cfqq->cfqg->nr_cfqq > 1)) { 3542 cfq_del_timer(cfqd, cfqq); 3543 cfq_schedule_dispatch(cfqd); 3544 } 3545} 3546 3547/* 3548 * we temporarily boost lower priority queues if they are holding fs exclusive 3549 * resources. they are boosted to normal prio (CLASS_BE/4) 3550 */ 3551static void cfq_prio_boost(struct cfq_queue *cfqq) 3552{ 3553 if (has_fs_excl()) { 3554 /* 3555 * boost idle prio on transactions that would lock out other 3556 * users of the filesystem 3557 */ 3558 if (cfq_class_idle(cfqq)) 3559 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3560 if (cfqq->ioprio > IOPRIO_NORM) 3561 cfqq->ioprio = IOPRIO_NORM; 3562 } else { 3563 /* 3564 * unboost the queue (if needed) 3565 */ 3566 cfqq->ioprio_class = cfqq->org_ioprio_class; 3567 cfqq->ioprio = cfqq->org_ioprio; 3568 } 3569} 3570 3571static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3572{ 3573 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3574 cfq_mark_cfqq_must_alloc_slice(cfqq); 3575 return ELV_MQUEUE_MUST; 3576 } 3577 3578 return ELV_MQUEUE_MAY; 3579} 3580 3581static int cfq_may_queue(struct request_queue *q, int rw) 3582{ 3583 struct cfq_data *cfqd = q->elevator->elevator_data; 3584 struct task_struct *tsk = current; 3585 struct cfq_io_context *cic; 3586 struct cfq_queue *cfqq; 3587 3588 /* 3589 * don't force setup of a queue from here, as a call to may_queue 3590 * does not necessarily imply that a request actually will be queued. 3591 * so just lookup a possibly existing queue, or return 'may queue' 3592 * if that fails 3593 */ 3594 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3595 if (!cic) 3596 return ELV_MQUEUE_MAY; 3597 3598 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3599 if (cfqq) { 3600 cfq_init_prio_data(cfqq, cic->ioc); 3601 cfq_prio_boost(cfqq); 3602 3603 return __cfq_may_queue(cfqq); 3604 } 3605 3606 return ELV_MQUEUE_MAY; 3607} 3608 3609/* 3610 * queue lock held here 3611 */ 3612static void cfq_put_request(struct request *rq) 3613{ 3614 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3615 3616 if (cfqq) { 3617 const int rw = rq_data_dir(rq); 3618 3619 BUG_ON(!cfqq->allocated[rw]); 3620 cfqq->allocated[rw]--; 3621 3622 put_io_context(RQ_CIC(rq)->ioc); 3623 3624 rq->elevator_private = NULL; 3625 rq->elevator_private2 = NULL; 3626 3627 /* Put down rq reference on cfqg */ 3628 cfq_put_cfqg(RQ_CFQG(rq)); 3629 rq->elevator_private3 = NULL; 3630 3631 cfq_put_queue(cfqq); 3632 } 3633} 3634 3635static struct cfq_queue * 3636cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3637 struct cfq_queue *cfqq) 3638{ 3639 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3640 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3641 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3642 cfq_put_queue(cfqq); 3643 return cic_to_cfqq(cic, 1); 3644} 3645 3646/* 3647 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3648 * was the last process referring to said cfqq. 3649 */ 3650static struct cfq_queue * 3651split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3652{ 3653 if (cfqq_process_refs(cfqq) == 1) { 3654 cfqq->pid = current->pid; 3655 cfq_clear_cfqq_coop(cfqq); 3656 cfq_clear_cfqq_split_coop(cfqq); 3657 return cfqq; 3658 } 3659 3660 cic_set_cfqq(cic, NULL, 1); 3661 3662 cfq_put_cooperator(cfqq); 3663 3664 cfq_put_queue(cfqq); 3665 return NULL; 3666} 3667/* 3668 * Allocate cfq data structures associated with this request. 3669 */ 3670static int 3671cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3672{ 3673 struct cfq_data *cfqd = q->elevator->elevator_data; 3674 struct cfq_io_context *cic; 3675 const int rw = rq_data_dir(rq); 3676 const bool is_sync = rq_is_sync(rq); 3677 struct cfq_queue *cfqq; 3678 unsigned long flags; 3679 3680 might_sleep_if(gfp_mask & __GFP_WAIT); 3681 3682 cic = cfq_get_io_context(cfqd, gfp_mask); 3683 3684 spin_lock_irqsave(q->queue_lock, flags); 3685 3686 if (!cic) 3687 goto queue_fail; 3688 3689new_queue: 3690 cfqq = cic_to_cfqq(cic, is_sync); 3691 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3692 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3693 cic_set_cfqq(cic, cfqq, is_sync); 3694 } else { 3695 /* 3696 * If the queue was seeky for too long, break it apart. 3697 */ 3698 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 3699 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3700 cfqq = split_cfqq(cic, cfqq); 3701 if (!cfqq) 3702 goto new_queue; 3703 } 3704 3705 /* 3706 * Check to see if this queue is scheduled to merge with 3707 * another, closely cooperating queue. The merging of 3708 * queues happens here as it must be done in process context. 3709 * The reference on new_cfqq was taken in merge_cfqqs. 3710 */ 3711 if (cfqq->new_cfqq) 3712 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3713 } 3714 3715 cfqq->allocated[rw]++; 3716 atomic_inc(&cfqq->ref); 3717 3718 spin_unlock_irqrestore(q->queue_lock, flags); 3719 3720 rq->elevator_private = cic; 3721 rq->elevator_private2 = cfqq; 3722 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); 3723 return 0; 3724 3725queue_fail: 3726 if (cic) 3727 put_io_context(cic->ioc); 3728 3729 cfq_schedule_dispatch(cfqd); 3730 spin_unlock_irqrestore(q->queue_lock, flags); 3731 cfq_log(cfqd, "set_request fail"); 3732 return 1; 3733} 3734 3735static void cfq_kick_queue(struct work_struct *work) 3736{ 3737 struct cfq_data *cfqd = 3738 container_of(work, struct cfq_data, unplug_work); 3739 struct request_queue *q = cfqd->queue; 3740 3741 spin_lock_irq(q->queue_lock); 3742 __blk_run_queue(cfqd->queue); 3743 spin_unlock_irq(q->queue_lock); 3744} 3745 3746/* 3747 * Timer running if the active_queue is currently idling inside its time slice 3748 */ 3749static void cfq_idle_slice_timer(unsigned long data) 3750{ 3751 struct cfq_data *cfqd = (struct cfq_data *) data; 3752 struct cfq_queue *cfqq; 3753 unsigned long flags; 3754 int timed_out = 1; 3755 3756 cfq_log(cfqd, "idle timer fired"); 3757 3758 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3759 3760 cfqq = cfqd->active_queue; 3761 if (cfqq) { 3762 timed_out = 0; 3763 3764 /* 3765 * We saw a request before the queue expired, let it through 3766 */ 3767 if (cfq_cfqq_must_dispatch(cfqq)) 3768 goto out_kick; 3769 3770 /* 3771 * expired 3772 */ 3773 if (cfq_slice_used(cfqq)) 3774 goto expire; 3775 3776 /* 3777 * only expire and reinvoke request handler, if there are 3778 * other queues with pending requests 3779 */ 3780 if (!cfqd->busy_queues) 3781 goto out_cont; 3782 3783 /* 3784 * not expired and it has a request pending, let it dispatch 3785 */ 3786 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3787 goto out_kick; 3788 3789 /* 3790 * Queue depth flag is reset only when the idle didn't succeed 3791 */ 3792 cfq_clear_cfqq_deep(cfqq); 3793 } 3794expire: 3795 cfq_slice_expired(cfqd, timed_out); 3796out_kick: 3797 cfq_schedule_dispatch(cfqd); 3798out_cont: 3799 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3800} 3801 3802static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 3803{ 3804 del_timer_sync(&cfqd->idle_slice_timer); 3805 cancel_work_sync(&cfqd->unplug_work); 3806} 3807 3808static void cfq_put_async_queues(struct cfq_data *cfqd) 3809{ 3810 int i; 3811 3812 for (i = 0; i < IOPRIO_BE_NR; i++) { 3813 if (cfqd->async_cfqq[0][i]) 3814 cfq_put_queue(cfqd->async_cfqq[0][i]); 3815 if (cfqd->async_cfqq[1][i]) 3816 cfq_put_queue(cfqd->async_cfqq[1][i]); 3817 } 3818 3819 if (cfqd->async_idle_cfqq) 3820 cfq_put_queue(cfqd->async_idle_cfqq); 3821} 3822 3823static void cfq_cfqd_free(struct rcu_head *head) 3824{ 3825 kfree(container_of(head, struct cfq_data, rcu)); 3826} 3827 3828static void cfq_exit_queue(struct elevator_queue *e) 3829{ 3830 struct cfq_data *cfqd = e->elevator_data; 3831 struct request_queue *q = cfqd->queue; 3832 3833 cfq_shutdown_timer_wq(cfqd); 3834 3835 spin_lock_irq(q->queue_lock); 3836 3837 if (cfqd->active_queue) 3838 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3839 3840 while (!list_empty(&cfqd->cic_list)) { 3841 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3842 struct cfq_io_context, 3843 queue_list); 3844 3845 __cfq_exit_single_io_context(cfqd, cic); 3846 } 3847 3848 cfq_put_async_queues(cfqd); 3849 cfq_release_cfq_groups(cfqd); 3850 cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3851 3852 spin_unlock_irq(q->queue_lock); 3853 3854 cfq_shutdown_timer_wq(cfqd); 3855 3856 spin_lock(&cic_index_lock); 3857 ida_remove(&cic_index_ida, cfqd->cic_index); 3858 spin_unlock(&cic_index_lock); 3859 3860 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3861 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3862} 3863 3864static int cfq_alloc_cic_index(void) 3865{ 3866 int index, error; 3867 3868 do { 3869 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3870 return -ENOMEM; 3871 3872 spin_lock(&cic_index_lock); 3873 error = ida_get_new(&cic_index_ida, &index); 3874 spin_unlock(&cic_index_lock); 3875 if (error && error != -EAGAIN) 3876 return error; 3877 } while (error); 3878 3879 return index; 3880} 3881 3882static void *cfq_init_queue(struct request_queue *q) 3883{ 3884 struct cfq_data *cfqd; 3885 int i, j; 3886 struct cfq_group *cfqg; 3887 struct cfq_rb_root *st; 3888 3889 i = cfq_alloc_cic_index(); 3890 if (i < 0) 3891 return NULL; 3892 3893 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3894 if (!cfqd) 3895 return NULL; 3896 3897 cfqd->cic_index = i; 3898 3899 /* Init root service tree */ 3900 cfqd->grp_service_tree = CFQ_RB_ROOT; 3901 3902 /* Init root group */ 3903 cfqg = &cfqd->root_group; 3904 for_each_cfqg_st(cfqg, i, j, st) 3905 *st = CFQ_RB_ROOT; 3906 RB_CLEAR_NODE(&cfqg->rb_node); 3907 3908 /* Give preference to root group over other groups */ 3909 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3910 3911#ifdef CONFIG_CFQ_GROUP_IOSCHED 3912 /* 3913 * Take a reference to root group which we never drop. This is just 3914 * to make sure that cfq_put_cfqg() does not try to kfree root group 3915 */ 3916 atomic_set(&cfqg->ref, 1); 3917 rcu_read_lock(); 3918 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 3919 (void *)cfqd, 0); 3920 rcu_read_unlock(); 3921#endif 3922 /* 3923 * Not strictly needed (since RB_ROOT just clears the node and we 3924 * zeroed cfqd on alloc), but better be safe in case someone decides 3925 * to add magic to the rb code 3926 */ 3927 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3928 cfqd->prio_trees[i] = RB_ROOT; 3929 3930 /* 3931 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 3932 * Grab a permanent reference to it, so that the normal code flow 3933 * will not attempt to free it. 3934 */ 3935 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3936 atomic_inc(&cfqd->oom_cfqq.ref); 3937 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3938 3939 INIT_LIST_HEAD(&cfqd->cic_list); 3940 3941 cfqd->queue = q; 3942 3943 init_timer(&cfqd->idle_slice_timer); 3944 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3945 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3946 3947 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3948 3949 cfqd->cfq_quantum = cfq_quantum; 3950 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3951 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3952 cfqd->cfq_back_max = cfq_back_max; 3953 cfqd->cfq_back_penalty = cfq_back_penalty; 3954 cfqd->cfq_slice[0] = cfq_slice_async; 3955 cfqd->cfq_slice[1] = cfq_slice_sync; 3956 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3957 cfqd->cfq_slice_idle = cfq_slice_idle; 3958 cfqd->cfq_group_idle = cfq_group_idle; 3959 cfqd->cfq_latency = 1; 3960 cfqd->cfq_group_isolation = 0; 3961 cfqd->hw_tag = -1; 3962 /* 3963 * we optimistically start assuming sync ops weren't delayed in last 3964 * second, in order to have larger depth for async operations. 3965 */ 3966 cfqd->last_delayed_sync = jiffies - HZ; 3967 return cfqd; 3968} 3969 3970static void cfq_slab_kill(void) 3971{ 3972 /* 3973 * Caller already ensured that pending RCU callbacks are completed, 3974 * so we should have no busy allocations at this point. 3975 */ 3976 if (cfq_pool) 3977 kmem_cache_destroy(cfq_pool); 3978 if (cfq_ioc_pool) 3979 kmem_cache_destroy(cfq_ioc_pool); 3980} 3981 3982static int __init cfq_slab_setup(void) 3983{ 3984 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3985 if (!cfq_pool) 3986 goto fail; 3987 3988 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3989 if (!cfq_ioc_pool) 3990 goto fail; 3991 3992 return 0; 3993fail: 3994 cfq_slab_kill(); 3995 return -ENOMEM; 3996} 3997 3998/* 3999 * sysfs parts below --> 4000 */ 4001static ssize_t 4002cfq_var_show(unsigned int var, char *page) 4003{ 4004 return sprintf(page, "%d\n", var); 4005} 4006 4007static ssize_t 4008cfq_var_store(unsigned int *var, const char *page, size_t count) 4009{ 4010 char *p = (char *) page; 4011 4012 *var = simple_strtoul(p, &p, 10); 4013 return count; 4014} 4015 4016#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 4017static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 4018{ \ 4019 struct cfq_data *cfqd = e->elevator_data; \ 4020 unsigned int __data = __VAR; \ 4021 if (__CONV) \ 4022 __data = jiffies_to_msecs(__data); \ 4023 return cfq_var_show(__data, (page)); \ 4024} 4025SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 4026SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 4027SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 4028SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 4029SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 4030SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 4031SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); 4032SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 4033SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4034SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4035SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4036SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 4037#undef SHOW_FUNCTION 4038 4039#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4040static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 4041{ \ 4042 struct cfq_data *cfqd = e->elevator_data; \ 4043 unsigned int __data; \ 4044 int ret = cfq_var_store(&__data, (page), count); \ 4045 if (__data < (MIN)) \ 4046 __data = (MIN); \ 4047 else if (__data > (MAX)) \ 4048 __data = (MAX); \ 4049 if (__CONV) \ 4050 *(__PTR) = msecs_to_jiffies(__data); \ 4051 else \ 4052 *(__PTR) = __data; \ 4053 return ret; \ 4054} 4055STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 4056STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 4057 UINT_MAX, 1); 4058STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 4059 UINT_MAX, 1); 4060STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 4061STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 4062 UINT_MAX, 0); 4063STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 4064STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); 4065STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 4066STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 4067STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4068 UINT_MAX, 0); 4069STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4070STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); 4071#undef STORE_FUNCTION 4072 4073#define CFQ_ATTR(name) \ 4074 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 4075 4076static struct elv_fs_entry cfq_attrs[] = { 4077 CFQ_ATTR(quantum), 4078 CFQ_ATTR(fifo_expire_sync), 4079 CFQ_ATTR(fifo_expire_async), 4080 CFQ_ATTR(back_seek_max), 4081 CFQ_ATTR(back_seek_penalty), 4082 CFQ_ATTR(slice_sync), 4083 CFQ_ATTR(slice_async), 4084 CFQ_ATTR(slice_async_rq), 4085 CFQ_ATTR(slice_idle), 4086 CFQ_ATTR(group_idle), 4087 CFQ_ATTR(low_latency), 4088 CFQ_ATTR(group_isolation), 4089 __ATTR_NULL 4090}; 4091 4092static struct elevator_type iosched_cfq = { 4093 .ops = { 4094 .elevator_merge_fn = cfq_merge, 4095 .elevator_merged_fn = cfq_merged_request, 4096 .elevator_merge_req_fn = cfq_merged_requests, 4097 .elevator_allow_merge_fn = cfq_allow_merge, 4098 .elevator_bio_merged_fn = cfq_bio_merged, 4099 .elevator_dispatch_fn = cfq_dispatch_requests, 4100 .elevator_add_req_fn = cfq_insert_request, 4101 .elevator_activate_req_fn = cfq_activate_request, 4102 .elevator_deactivate_req_fn = cfq_deactivate_request, 4103 .elevator_queue_empty_fn = cfq_queue_empty, 4104 .elevator_completed_req_fn = cfq_completed_request, 4105 .elevator_former_req_fn = elv_rb_former_request, 4106 .elevator_latter_req_fn = elv_rb_latter_request, 4107 .elevator_set_req_fn = cfq_set_request, 4108 .elevator_put_req_fn = cfq_put_request, 4109 .elevator_may_queue_fn = cfq_may_queue, 4110 .elevator_init_fn = cfq_init_queue, 4111 .elevator_exit_fn = cfq_exit_queue, 4112 .trim = cfq_free_io_context, 4113 }, 4114 .elevator_attrs = cfq_attrs, 4115 .elevator_name = "cfq", 4116 .elevator_owner = THIS_MODULE, 4117}; 4118 4119#ifdef CONFIG_CFQ_GROUP_IOSCHED 4120static struct blkio_policy_type blkio_policy_cfq = { 4121 .ops = { 4122 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4123 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4124 }, 4125}; 4126#else 4127static struct blkio_policy_type blkio_policy_cfq; 4128#endif 4129 4130static int __init cfq_init(void) 4131{ 4132 /* 4133 * could be 0 on HZ < 1000 setups 4134 */ 4135 if (!cfq_slice_async) 4136 cfq_slice_async = 1; 4137 if (!cfq_slice_idle) 4138 cfq_slice_idle = 1; 4139 4140#ifdef CONFIG_CFQ_GROUP_IOSCHED 4141 if (!cfq_group_idle) 4142 cfq_group_idle = 1; 4143#else 4144 cfq_group_idle = 0; 4145#endif 4146 if (cfq_slab_setup()) 4147 return -ENOMEM; 4148 4149 elv_register(&iosched_cfq); 4150 blkio_policy_register(&blkio_policy_cfq); 4151 4152 return 0; 4153} 4154 4155static void __exit cfq_exit(void) 4156{ 4157 DECLARE_COMPLETION_ONSTACK(all_gone); 4158 blkio_policy_unregister(&blkio_policy_cfq); 4159 elv_unregister(&iosched_cfq); 4160 ioc_gone = &all_gone; 4161 /* ioc_gone's update must be visible before reading ioc_count */ 4162 smp_wmb(); 4163 4164 /* 4165 * this also protects us from entering cfq_slab_kill() with 4166 * pending RCU callbacks 4167 */ 4168 if (elv_ioc_count_read(cfq_ioc_count)) 4169 wait_for_completion(&all_gone); 4170 ida_destroy(&cic_index_ida); 4171 cfq_slab_kill(); 4172} 4173 4174module_init(cfq_init); 4175module_exit(cfq_exit); 4176 4177MODULE_AUTHOR("Jens Axboe"); 4178MODULE_LICENSE("GPL"); 4179MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 4180