cfq-iosched.c revision 9a0785b0da561e1e9c6617df85e93ae107a42f18
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/jiffies.h>
13#include <linux/rbtree.h>
14#include <linux/ioprio.h>
15#include <linux/blktrace_api.h>
16#include "blk-cgroup.h"
17
18/*
19 * tunables
20 */
21/* max queue in one round of service */
22static const int cfq_quantum = 8;
23static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
24/* maximum backwards seek, in KiB */
25static const int cfq_back_max = 16 * 1024;
26/* penalty of a backwards seek */
27static const int cfq_back_penalty = 2;
28static const int cfq_slice_sync = HZ / 10;
29static int cfq_slice_async = HZ / 25;
30static const int cfq_slice_async_rq = 2;
31static int cfq_slice_idle = HZ / 125;
32static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
33static const int cfq_hist_divisor = 4;
34
35/*
36 * offset from end of service tree
37 */
38#define CFQ_IDLE_DELAY		(HZ / 5)
39
40/*
41 * below this threshold, we consider thinktime immediate
42 */
43#define CFQ_MIN_TT		(2)
44
45#define CFQ_SLICE_SCALE		(5)
46#define CFQ_HW_QUEUE_MIN	(5)
47#define CFQ_SERVICE_SHIFT       12
48
49#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
50#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
51#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
52#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
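/*
 * seek_history is a rolling 32-sample bitmap: a set bit records a request
 * that landed more than CFQQ_SEEK_THR sectors away from the previous one.
 * A queue is classed as seeky by CFQQ_SEEKY() once more than 32/8 = 4 of
 * those samples are set.
 */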
53
54#define RQ_CIC(rq)		\
55	((struct cfq_io_context *) (rq)->elevator_private)
56#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
57
58static struct kmem_cache *cfq_pool;
59static struct kmem_cache *cfq_ioc_pool;
60
61static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
62static struct completion *ioc_gone;
63static DEFINE_SPINLOCK(ioc_gone_lock);
64
65#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
66#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
67#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
68
69#define sample_valid(samples)	((samples) > 80)
70#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
71
72/*
73 * Most of our rbtree usage is for sorting with min extraction, so
74 * if we cache the leftmost node we don't have to walk down the tree
75 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
76 * move this into the elevator for the rq sorting as well.
77 */
78struct cfq_rb_root {
79	struct rb_root rb;
80	struct rb_node *left;
81	unsigned count;
82	unsigned total_weight;
83	u64 min_vdisktime;
84	struct rb_node *active;
85};
86#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
87			.count = 0, .min_vdisktime = 0, }
88
89/*
90 * Per process-grouping structure
91 */
92struct cfq_queue {
93	/* reference count */
94	atomic_t ref;
95	/* various state flags, see below */
96	unsigned int flags;
97	/* parent cfq_data */
98	struct cfq_data *cfqd;
99	/* service_tree member */
100	struct rb_node rb_node;
101	/* service_tree key */
102	unsigned long rb_key;
103	/* prio tree member */
104	struct rb_node p_node;
105	/* prio tree root we belong to, if any */
106	struct rb_root *p_root;
107	/* sorted list of pending requests */
108	struct rb_root sort_list;
109	/* if fifo isn't expired, next request to serve */
110	struct request *next_rq;
111	/* requests queued in sort_list */
112	int queued[2];
113	/* currently allocated requests */
114	int allocated[2];
115	/* fifo list of requests in sort_list */
116	struct list_head fifo;
117
118	/* time when queue got scheduled in to dispatch first request. */
119	unsigned long dispatch_start;
120	unsigned int allocated_slice;
121	unsigned int slice_dispatch;
122	/* time when first request from queue completed and slice started. */
123	unsigned long slice_start;
124	unsigned long slice_end;
125	long slice_resid;
126
127	/* pending metadata requests */
128	int meta_pending;
129	/* number of requests that are on the dispatch list or inside driver */
130	int dispatched;
131
132	/* io prio of this group */
133	unsigned short ioprio, org_ioprio;
134	unsigned short ioprio_class, org_ioprio_class;
135
136	pid_t pid;
137
138	u32 seek_history;
139	sector_t last_request_pos;
140
141	struct cfq_rb_root *service_tree;
142	struct cfq_queue *new_cfqq;
143	struct cfq_group *cfqg;
144	struct cfq_group *orig_cfqg;
145};
146
147/*
148 * First index in the service_trees.
149 * IDLE is handled separately, as it uses its own service tree
150 */
151enum wl_prio_t {
152	BE_WORKLOAD = 0,
153	RT_WORKLOAD = 1,
154	IDLE_WORKLOAD = 2,
155};
156
157/*
158 * Second index in the service_trees.
159 */
160enum wl_type_t {
161	ASYNC_WORKLOAD = 0,
162	SYNC_NOIDLE_WORKLOAD = 1,
163	SYNC_WORKLOAD = 2
164};
165
166/* This is per cgroup per device grouping structure */
167struct cfq_group {
168	/* group service_tree member */
169	struct rb_node rb_node;
170
171	/* group service_tree key */
172	u64 vdisktime;
173	unsigned int weight;
174	bool on_st;
175
176	/* number of cfqq currently on this group */
177	int nr_cfqq;
178
179	/* Per group busy queues average. Useful for workload slice calc. */
180	unsigned int busy_queues_avg[2];
181	/*
182	 * rr lists of queues with requests, one rr for each priority class.
183	 * Counts are embedded in the cfq_rb_root
184	 */
185	struct cfq_rb_root service_trees[2][3];
186	struct cfq_rb_root service_tree_idle;
187
188	unsigned long saved_workload_slice;
189	enum wl_type_t saved_workload;
190	enum wl_prio_t saved_serving_prio;
191	struct blkio_group blkg;
192#ifdef CONFIG_CFQ_GROUP_IOSCHED
193	struct hlist_node cfqd_node;
194	atomic_t ref;
195#endif
196};
197
198/*
199 * Per block device queue structure
200 */
201struct cfq_data {
202	struct request_queue *queue;
203	/* Root service tree for cfq_groups */
204	struct cfq_rb_root grp_service_tree;
205	struct cfq_group root_group;
206
207	/*
208	 * The priority currently being served
209	 */
210	enum wl_prio_t serving_prio;
211	enum wl_type_t serving_type;
212	unsigned long workload_expires;
213	struct cfq_group *serving_group;
214	bool noidle_tree_requires_idle;
215
216	/*
217	 * Each priority tree is sorted by next_request position.  These
218	 * trees are used when determining if two or more queues are
219	 * interleaving requests (see cfq_close_cooperator).
220	 */
221	struct rb_root prio_trees[CFQ_PRIO_LISTS];
222
223	unsigned int busy_queues;
224
225	int rq_in_driver;
226	int rq_in_flight[2];
227
228	/*
229	 * queue-depth detection
230	 */
231	int rq_queued;
232	int hw_tag;
233	/*
234	 * hw_tag can be
235	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
236	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
237	 *  0 => no NCQ
238	 */
239	int hw_tag_est_depth;
240	unsigned int hw_tag_samples;
241
242	/*
243	 * idle window management
244	 */
245	struct timer_list idle_slice_timer;
246	struct work_struct unplug_work;
247
248	struct cfq_queue *active_queue;
249	struct cfq_io_context *active_cic;
250
251	/*
252	 * async queue for each priority case
253	 */
254	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
255	struct cfq_queue *async_idle_cfqq;
256
257	sector_t last_position;
258
259	/*
260	 * tunables, see top of file
261	 */
262	unsigned int cfq_quantum;
263	unsigned int cfq_fifo_expire[2];
264	unsigned int cfq_back_penalty;
265	unsigned int cfq_back_max;
266	unsigned int cfq_slice[2];
267	unsigned int cfq_slice_async_rq;
268	unsigned int cfq_slice_idle;
269	unsigned int cfq_latency;
270	unsigned int cfq_group_isolation;
271
272	struct list_head cic_list;
273
274	/*
275	 * Fallback dummy cfqq for extreme OOM conditions
276	 */
277	struct cfq_queue oom_cfqq;
278
279	unsigned long last_delayed_sync;
280
281	/* List of cfq groups being managed on this device */
282	struct hlist_head cfqg_list;
283	struct rcu_head rcu;
284};
285
286static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
287
288static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
289					    enum wl_prio_t prio,
290					    enum wl_type_t type)
291{
292	if (!cfqg)
293		return NULL;
294
295	if (prio == IDLE_WORKLOAD)
296		return &cfqg->service_tree_idle;
297
298	return &cfqg->service_trees[prio][type];
299}
300
301enum cfqq_state_flags {
302	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
303	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
304	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
305	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
306	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
307	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
308	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
309	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
310	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
311	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
312	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
313	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
314	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
315};
316
317#define CFQ_CFQQ_FNS(name)						\
318static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
319{									\
320	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
321}									\
322static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
323{									\
324	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
325}									\
326static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
327{									\
328	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
329}
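/*
 * Each CFQ_CFQQ_FNS(name) invocation below generates three helpers for the
 * corresponding flag, e.g. CFQ_CFQQ_FNS(on_rr) expands to
 * cfq_mark_cfqq_on_rr(), cfq_clear_cfqq_on_rr() and cfq_cfqq_on_rr().
 */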
330
331CFQ_CFQQ_FNS(on_rr);
332CFQ_CFQQ_FNS(wait_request);
333CFQ_CFQQ_FNS(must_dispatch);
334CFQ_CFQQ_FNS(must_alloc_slice);
335CFQ_CFQQ_FNS(fifo_expire);
336CFQ_CFQQ_FNS(idle_window);
337CFQ_CFQQ_FNS(prio_changed);
338CFQ_CFQQ_FNS(slice_new);
339CFQ_CFQQ_FNS(sync);
340CFQ_CFQQ_FNS(coop);
341CFQ_CFQQ_FNS(split_coop);
342CFQ_CFQQ_FNS(deep);
343CFQ_CFQQ_FNS(wait_busy);
344#undef CFQ_CFQQ_FNS
345
346#ifdef CONFIG_DEBUG_CFQ_IOSCHED
347#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
348	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
349			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
350			blkg_path(&(cfqq)->cfqg->blkg), ##args);
351
352#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
353	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
354				blkg_path(&(cfqg)->blkg), ##args);      \
355
356#else
357#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
358	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
359#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
360#endif
361#define cfq_log(cfqd, fmt, args...)	\
362	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
363
364/* Traverses through cfq group service trees */
365#define for_each_cfqg_st(cfqg, i, j, st) \
366	for (i = 0; i <= IDLE_WORKLOAD; i++) \
367		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
368			: &cfqg->service_tree_idle; \
369			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
370			(i == IDLE_WORKLOAD && j == 0); \
371			j++, st = i < IDLE_WORKLOAD ? \
372			&cfqg->service_trees[i][j]: NULL) \
373
374
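/*
 * for_each_cfqg_st() visits all seven service trees of a group: the
 * [BE,RT] x [ASYNC,SYNC_NOIDLE,SYNC] matrix plus service_tree_idle.
 * Typical usage, as in cfq_find_alloc_cfqg():
 *
 *	for_each_cfqg_st(cfqg, i, j, st)
 *		*st = CFQ_RB_ROOT;
 */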
375static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
376{
377	if (cfq_class_idle(cfqq))
378		return IDLE_WORKLOAD;
379	if (cfq_class_rt(cfqq))
380		return RT_WORKLOAD;
381	return BE_WORKLOAD;
382}
383
384
385static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
386{
387	if (!cfq_cfqq_sync(cfqq))
388		return ASYNC_WORKLOAD;
389	if (!cfq_cfqq_idle_window(cfqq))
390		return SYNC_NOIDLE_WORKLOAD;
391	return SYNC_WORKLOAD;
392}
393
394static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
395					struct cfq_data *cfqd,
396					struct cfq_group *cfqg)
397{
398	if (wl == IDLE_WORKLOAD)
399		return cfqg->service_tree_idle.count;
400
401	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
402		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
403		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
404}
405
406static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
407					struct cfq_group *cfqg)
408{
409	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
410		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
411}
412
413static void cfq_dispatch_insert(struct request_queue *, struct request *);
414static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
415				       struct io_context *, gfp_t);
416static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
417						struct io_context *);
418
419static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
420					    bool is_sync)
421{
422	return cic->cfqq[is_sync];
423}
424
425static inline void cic_set_cfqq(struct cfq_io_context *cic,
426				struct cfq_queue *cfqq, bool is_sync)
427{
428	cic->cfqq[is_sync] = cfqq;
429}
430
431/*
432 * We regard a request as SYNC if it's either a read or has the SYNC bit
433 * set (in which case it could also be a direct WRITE).
434 */
435static inline bool cfq_bio_sync(struct bio *bio)
436{
437	return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
438}
439
440/*
441 * scheduler run of queue, if there are requests pending and no one in the
442 * driver that will restart queueing
443 */
444static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
445{
446	if (cfqd->busy_queues) {
447		cfq_log(cfqd, "schedule dispatch");
448		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
449	}
450}
451
452static int cfq_queue_empty(struct request_queue *q)
453{
454	struct cfq_data *cfqd = q->elevator->elevator_data;
455
456	return !cfqd->rq_queued;
457}
458
459/*
460 * Scale schedule slice based on io priority. Use the sync time slice only
461 * if a queue is marked sync and has sync io queued. A sync queue with only
462 * async io should not get the full sync slice length.
463 */
464static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
465				 unsigned short prio)
466{
467	const int base_slice = cfqd->cfq_slice[sync];
468
469	WARN_ON(prio >= IOPRIO_BE_NR);
470
471	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
472}
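/*
 * Worked example for the scaling above, assuming HZ=1000 (so the sync
 * base_slice is cfq_slice_sync = 100ms): each priority step is worth
 * base_slice/CFQ_SLICE_SCALE = 20ms around the default prio 4, giving a
 * sync queue 180ms at prio 0, 100ms at prio 4 and 40ms at prio 7.
 */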
473
474static inline int
475cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
476{
477	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
478}
479
480static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
481{
482	u64 d = delta << CFQ_SERVICE_SHIFT;
483
484	d = d * BLKIO_WEIGHT_DEFAULT;
485	do_div(d, cfqg->weight);
486	return d;
487}
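/*
 * cfq_scale_slice() charges vdisktime in inverse proportion to the group
 * weight (in CFQ_SERVICE_SHIFT fixed point): a group with twice the
 * default weight accrues vdisktime half as fast for the same slice and
 * therefore ends up receiving proportionally more service.
 */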
488
489static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
490{
491	s64 delta = (s64)(vdisktime - min_vdisktime);
492	if (delta > 0)
493		min_vdisktime = vdisktime;
494
495	return min_vdisktime;
496}
497
498static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
499{
500	s64 delta = (s64)(vdisktime - min_vdisktime);
501	if (delta < 0)
502		min_vdisktime = vdisktime;
503
504	return min_vdisktime;
505}
506
507static void update_min_vdisktime(struct cfq_rb_root *st)
508{
509	u64 vdisktime = st->min_vdisktime;
510	struct cfq_group *cfqg;
511
512	if (st->active) {
513		cfqg = rb_entry_cfqg(st->active);
514		vdisktime = cfqg->vdisktime;
515	}
516
517	if (st->left) {
518		cfqg = rb_entry_cfqg(st->left);
519		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
520	}
521
522	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
523}
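/*
 * update_min_vdisktime() only ever moves min_vdisktime forward (note the
 * max_vdisktime() at the end), tracking the smaller of the active and
 * leftmost groups' vdisktime. It serves as a monotonic baseline for newly
 * added groups, much like min_vruntime in the CFS scheduler.
 */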
524
525/*
526 * get averaged number of queues of RT/BE priority.
527 * the average is updated with a formula that gives more weight to higher
528 * numbers, so that it follows sudden increases quickly and decreases slowly
529 */
530
531static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
532					struct cfq_group *cfqg, bool rt)
533{
534	unsigned min_q, max_q;
535	unsigned mult  = cfq_hist_divisor - 1;
536	unsigned round = cfq_hist_divisor / 2;
537	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
538
539	min_q = min(cfqg->busy_queues_avg[rt], busy);
540	max_q = max(cfqg->busy_queues_avg[rt], busy);
541	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
542		cfq_hist_divisor;
543	return cfqg->busy_queues_avg[rt];
544}
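/*
 * With cfq_hist_divisor = 4 the update above is (3*max + min + 2) / 4:
 * when the busy count jumps above the stored average, the average moves
 * roughly 3/4 of the way up immediately, while a drop only decays it by
 * about 1/4 per update, matching the "follow increases quickly, decrease
 * slowly" intent described above.
 */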
545
546static inline unsigned
547cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
548{
549	struct cfq_rb_root *st = &cfqd->grp_service_tree;
550
551	return cfq_target_latency * cfqg->weight / st->total_weight;
552}
553
554static inline void
555cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
556{
557	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
558	if (cfqd->cfq_latency) {
559		/*
560		 * interested queues (we consider only the ones with the same
561		 * priority class in the cfq group)
562		 */
563		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
564						cfq_class_rt(cfqq));
565		unsigned sync_slice = cfqd->cfq_slice[1];
566		unsigned expect_latency = sync_slice * iq;
567		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
568
569		if (expect_latency > group_slice) {
570			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
571			/* scale low_slice according to IO priority
572			 * and sync vs async */
573			unsigned low_slice =
574				min(slice, base_low_slice * slice / sync_slice);
575			/* the adapted slice value is scaled to fit all iqs
576			 * into the target latency */
577			slice = max(slice * group_slice / expect_latency,
578				    low_slice);
579		}
580	}
581	cfqq->slice_start = jiffies;
582	cfqq->slice_end = jiffies + slice;
583	cfqq->allocated_slice = slice;
584	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
585}
586
587/*
588 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
589 * isn't valid until the first request from the dispatch is activated
590 * and the slice time set.
591 */
592static inline bool cfq_slice_used(struct cfq_queue *cfqq)
593{
594	if (cfq_cfqq_slice_new(cfqq))
595		return 0;
596	if (time_before(jiffies, cfqq->slice_end))
597		return 0;
598
599	return 1;
600}
601
602/*
603 * Lifted from AS - choose which of rq1 and rq2 is best served now.
604 * We choose the request that is closest to the head right now. Distance
605 * behind the head is penalized and only allowed to a certain extent.
606 */
607static struct request *
608cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
609{
610	sector_t s1, s2, d1 = 0, d2 = 0;
611	unsigned long back_max;
612#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
613#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
614	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
615
616	if (rq1 == NULL || rq1 == rq2)
617		return rq2;
618	if (rq2 == NULL)
619		return rq1;
620
621	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
622		return rq1;
623	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
624		return rq2;
625	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
626		return rq1;
627	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
628		return rq2;
629
630	s1 = blk_rq_pos(rq1);
631	s2 = blk_rq_pos(rq2);
632
633	/*
634	 * by definition, 1KiB is 2 sectors
635	 */
636	back_max = cfqd->cfq_back_max * 2;
637
638	/*
639	 * Strict one way elevator _except_ in the case where we allow
640	 * short backward seeks which are biased as twice the cost of a
641	 * similar forward seek.
642	 */
643	if (s1 >= last)
644		d1 = s1 - last;
645	else if (s1 + back_max >= last)
646		d1 = (last - s1) * cfqd->cfq_back_penalty;
647	else
648		wrap |= CFQ_RQ1_WRAP;
649
650	if (s2 >= last)
651		d2 = s2 - last;
652	else if (s2 + back_max >= last)
653		d2 = (last - s2) * cfqd->cfq_back_penalty;
654	else
655		wrap |= CFQ_RQ2_WRAP;
656
657	/* Found required data */
658
659	/*
660	 * By doing switch() on the bit mask "wrap" we avoid having to
661	 * check two variables for all permutations: --> faster!
662	 */
663	switch (wrap) {
664	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
665		if (d1 < d2)
666			return rq1;
667		else if (d2 < d1)
668			return rq2;
669		else {
670			if (s1 >= s2)
671				return rq1;
672			else
673				return rq2;
674		}
675
676	case CFQ_RQ2_WRAP:
677		return rq1;
678	case CFQ_RQ1_WRAP:
679		return rq2;
680	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
681	default:
682		/*
683		 * Since both rqs are wrapped,
684		 * start with the one that's further behind head
685		 * (--> only *one* back seek required),
686		 * since back seek takes more time than forward.
687		 */
688		if (s1 <= s2)
689			return rq1;
690		else
691			return rq2;
692	}
693}
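/*
 * With the default tunables above (cfq_back_max = 16 MiB, cfq_back_penalty
 * = 2), a request 1 MiB behind the head is treated as if it were 2 MiB
 * ahead, while anything more than 16 MiB behind counts as wrapping and
 * loses to any non-wrapping candidate.
 */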
694
695/*
696 * Below is the leftmost-cache rbtree addon
697 */
698static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
699{
700	/* Service tree is empty */
701	if (!root->count)
702		return NULL;
703
704	if (!root->left)
705		root->left = rb_first(&root->rb);
706
707	if (root->left)
708		return rb_entry(root->left, struct cfq_queue, rb_node);
709
710	return NULL;
711}
712
713static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
714{
715	if (!root->left)
716		root->left = rb_first(&root->rb);
717
718	if (root->left)
719		return rb_entry_cfqg(root->left);
720
721	return NULL;
722}
723
724static void rb_erase_init(struct rb_node *n, struct rb_root *root)
725{
726	rb_erase(n, root);
727	RB_CLEAR_NODE(n);
728}
729
730static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
731{
732	if (root->left == n)
733		root->left = NULL;
734	rb_erase_init(n, &root->rb);
735	--root->count;
736}
737
738/*
739 * would be nice to take fifo expire time into account as well
740 */
741static struct request *
742cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
743		  struct request *last)
744{
745	struct rb_node *rbnext = rb_next(&last->rb_node);
746	struct rb_node *rbprev = rb_prev(&last->rb_node);
747	struct request *next = NULL, *prev = NULL;
748
749	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
750
751	if (rbprev)
752		prev = rb_entry_rq(rbprev);
753
754	if (rbnext)
755		next = rb_entry_rq(rbnext);
756	else {
757		rbnext = rb_first(&cfqq->sort_list);
758		if (rbnext && rbnext != &last->rb_node)
759			next = rb_entry_rq(rbnext);
760	}
761
762	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
763}
764
765static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
766				      struct cfq_queue *cfqq)
767{
768	/*
769	 * just an approximation, should be ok.
770	 */
771	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
772		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
773}
774
775static inline s64
776cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
777{
778	return cfqg->vdisktime - st->min_vdisktime;
779}
780
781static void
782__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
783{
784	struct rb_node **node = &st->rb.rb_node;
785	struct rb_node *parent = NULL;
786	struct cfq_group *__cfqg;
787	s64 key = cfqg_key(st, cfqg);
788	int left = 1;
789
790	while (*node != NULL) {
791		parent = *node;
792		__cfqg = rb_entry_cfqg(parent);
793
794		if (key < cfqg_key(st, __cfqg))
795			node = &parent->rb_left;
796		else {
797			node = &parent->rb_right;
798			left = 0;
799		}
800	}
801
802	if (left)
803		st->left = &cfqg->rb_node;
804
805	rb_link_node(&cfqg->rb_node, parent, node);
806	rb_insert_color(&cfqg->rb_node, &st->rb);
807}
808
809static void
810cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
811{
812	struct cfq_rb_root *st = &cfqd->grp_service_tree;
813	struct cfq_group *__cfqg;
814	struct rb_node *n;
815
816	cfqg->nr_cfqq++;
817	if (cfqg->on_st)
818		return;
819
820	/*
821	 * Currently we put the group at the end. Later, implement something
822	 * so that groups get lower vtime based on their weights, so that a
823	 * group does not lose everything if it was not continuously backlogged.
824	 */
825	n = rb_last(&st->rb);
826	if (n) {
827		__cfqg = rb_entry_cfqg(n);
828		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
829	} else
830		cfqg->vdisktime = st->min_vdisktime;
831
832	__cfq_group_service_tree_add(st, cfqg);
833	cfqg->on_st = true;
834	st->total_weight += cfqg->weight;
835}
836
837static void
838cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
839{
840	struct cfq_rb_root *st = &cfqd->grp_service_tree;
841
842	if (st->active == &cfqg->rb_node)
843		st->active = NULL;
844
845	BUG_ON(cfqg->nr_cfqq < 1);
846	cfqg->nr_cfqq--;
847
848	/* If there are other cfq queues under this group, don't delete it */
849	if (cfqg->nr_cfqq)
850		return;
851
852	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
853	cfqg->on_st = false;
854	st->total_weight -= cfqg->weight;
855	if (!RB_EMPTY_NODE(&cfqg->rb_node))
856		cfq_rb_erase(&cfqg->rb_node, st);
857	cfqg->saved_workload_slice = 0;
858	blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
859}
860
861static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
862{
863	unsigned int slice_used;
864
865	/*
866	 * Queue got expired before even a single request completed or
867	 * got expired immediately after first request completion.
868	 */
869	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
870		/*
871		 * Also charge the seek time incurred to the group, otherwise
872		 * if there are multiple queues in the group, each can dispatch
873		 * a single request on seeky media and cause lots of seek time
874		 * and the group will never know it.
875		 */
876		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
877					1);
878	} else {
879		slice_used = jiffies - cfqq->slice_start;
880		if (slice_used > cfqq->allocated_slice)
881			slice_used = cfqq->allocated_slice;
882	}
883
884	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
885	return slice_used;
886}
887
888static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
889				struct cfq_queue *cfqq)
890{
891	struct cfq_rb_root *st = &cfqd->grp_service_tree;
892	unsigned int used_sl, charge_sl;
893	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
894			- cfqg->service_tree_idle.count;
895
896	BUG_ON(nr_sync < 0);
897	used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
898
899	if (!cfq_cfqq_sync(cfqq) && !nr_sync)
900		charge_sl = cfqq->allocated_slice;
901
902	/* Can't update vdisktime while group is on service tree */
903	cfq_rb_erase(&cfqg->rb_node, st);
904	cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
905	__cfq_group_service_tree_add(st, cfqg);
906
907	/* This group is being expired. Save the context */
908	if (time_after(cfqd->workload_expires, jiffies)) {
909		cfqg->saved_workload_slice = cfqd->workload_expires
910						- jiffies;
911		cfqg->saved_workload = cfqd->serving_type;
912		cfqg->saved_serving_prio = cfqd->serving_prio;
913	} else
914		cfqg->saved_workload_slice = 0;
915
916	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
917					st->min_vdisktime);
918	blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl);
919}
920
921#ifdef CONFIG_CFQ_GROUP_IOSCHED
922static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
923{
924	if (blkg)
925		return container_of(blkg, struct cfq_group, blkg);
926	return NULL;
927}
928
929void
930cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
931{
932	cfqg_of_blkg(blkg)->weight = weight;
933}
934
935static struct cfq_group *
936cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
937{
938	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
939	struct cfq_group *cfqg = NULL;
940	void *key = cfqd;
941	int i, j;
942	struct cfq_rb_root *st;
943	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
944	unsigned int major, minor;
945
946	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
947	if (cfqg || !create)
948		goto done;
949
950	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
951	if (!cfqg)
952		goto done;
953
954	cfqg->weight = blkcg->weight;
955	for_each_cfqg_st(cfqg, i, j, st)
956		*st = CFQ_RB_ROOT;
957	RB_CLEAR_NODE(&cfqg->rb_node);
958
959	/*
960	 * Take the initial reference that will be released on destroy.
961	 * This can be thought of as a joint reference by cgroup and
962	 * elevator which will be dropped by either elevator exit
963	 * or cgroup deletion path depending on who is exiting first.
964	 */
965	atomic_set(&cfqg->ref, 1);
966
967	/* Add group onto cgroup list */
968	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
969	blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
970					MKDEV(major, minor));
971
972	/* Add group on cfqd list */
973	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
974
975done:
976	return cfqg;
977}
978
979/*
980 * Search for the cfq group current task belongs to. If create = 1, then also
981 * create the cfq group if it does not exist. request_queue lock must be held.
982 */
983static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
984{
985	struct cgroup *cgroup;
986	struct cfq_group *cfqg = NULL;
987
988	rcu_read_lock();
989	cgroup = task_cgroup(current, blkio_subsys_id);
990	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
991	if (!cfqg && create)
992		cfqg = &cfqd->root_group;
993	rcu_read_unlock();
994	return cfqg;
995}
996
997static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
998{
999	/* Currently, all async queues are mapped to root group */
1000	if (!cfq_cfqq_sync(cfqq))
1001		cfqg = &cfqq->cfqd->root_group;
1002
1003	cfqq->cfqg = cfqg;
1004	/* cfqq reference on cfqg */
1005	atomic_inc(&cfqq->cfqg->ref);
1006}
1007
1008static void cfq_put_cfqg(struct cfq_group *cfqg)
1009{
1010	struct cfq_rb_root *st;
1011	int i, j;
1012
1013	BUG_ON(atomic_read(&cfqg->ref) <= 0);
1014	if (!atomic_dec_and_test(&cfqg->ref))
1015		return;
1016	for_each_cfqg_st(cfqg, i, j, st)
1017		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1018	kfree(cfqg);
1019}
1020
1021static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1022{
1023	/* Something wrong if we are trying to remove same group twice */
1024	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1025
1026	hlist_del_init(&cfqg->cfqd_node);
1027
1028	/*
1029	 * Put the reference taken at the time of creation so that when all
1030	 * queues are gone, group can be destroyed.
1031	 */
1032	cfq_put_cfqg(cfqg);
1033}
1034
1035static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1036{
1037	struct hlist_node *pos, *n;
1038	struct cfq_group *cfqg;
1039
1040	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1041		/*
1042		 * If cgroup removal path got to blk_group first and removed
1043		 * it from cgroup list, then it will take care of destroying
1044		 * cfqg also.
1045		 */
1046		if (!blkiocg_del_blkio_group(&cfqg->blkg))
1047			cfq_destroy_cfqg(cfqd, cfqg);
1048	}
1049}
1050
1051/*
1052 * Blk cgroup controller notification saying that blkio_group object is being
1053 * delinked as associated cgroup object is going away. That also means that
1054 * no new IO will come in this group. So get rid of this group as soon as
1055 * any pending IO in the group is finished.
1056 *
1057 * This function is called under rcu_read_lock(). key is the rcu protected
1058 * pointer. That means "key" is a valid cfq_data pointer as long as we hold
1059 * the rcu read lock.
1060 *
1061 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1062 * it should not be NULL as even if the elevator was exiting, the cgroup deletion
1063 * path got to it first.
1064 */
1065void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1066{
1067	unsigned long  flags;
1068	struct cfq_data *cfqd = key;
1069
1070	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1071	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1072	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1073}
1074
1075#else /* GROUP_IOSCHED */
1076static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1077{
1078	return &cfqd->root_group;
1079}
1080static inline void
1081cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1082	cfqq->cfqg = cfqg;
1083}
1084
1085static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1086static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1087
1088#endif /* GROUP_IOSCHED */
1089
1090/*
1091 * Each service tree holds the pending cfq_queues that have
1092 * requests waiting to be processed. It is sorted in the order that
1093 * we will service the queues.
1094 */
1095static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1096				 bool add_front)
1097{
1098	struct rb_node **p, *parent;
1099	struct cfq_queue *__cfqq;
1100	unsigned long rb_key;
1101	struct cfq_rb_root *service_tree;
1102	int left;
1103	int new_cfqq = 1;
1104	int group_changed = 0;
1105
1106#ifdef CONFIG_CFQ_GROUP_IOSCHED
1107	if (!cfqd->cfq_group_isolation
1108	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1109	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1110		/* Move this cfq to root group */
1111		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1112		if (!RB_EMPTY_NODE(&cfqq->rb_node))
1113			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1114		cfqq->orig_cfqg = cfqq->cfqg;
1115		cfqq->cfqg = &cfqd->root_group;
1116		atomic_inc(&cfqd->root_group.ref);
1117		group_changed = 1;
1118	} else if (!cfqd->cfq_group_isolation
1119		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1120		/* cfqq is sequential now and needs to go back to its original group */
1121		BUG_ON(cfqq->cfqg != &cfqd->root_group);
1122		if (!RB_EMPTY_NODE(&cfqq->rb_node))
1123			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1124		cfq_put_cfqg(cfqq->cfqg);
1125		cfqq->cfqg = cfqq->orig_cfqg;
1126		cfqq->orig_cfqg = NULL;
1127		group_changed = 1;
1128		cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1129	}
1130#endif
1131
1132	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1133						cfqq_type(cfqq));
1134	if (cfq_class_idle(cfqq)) {
1135		rb_key = CFQ_IDLE_DELAY;
1136		parent = rb_last(&service_tree->rb);
1137		if (parent && parent != &cfqq->rb_node) {
1138			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1139			rb_key += __cfqq->rb_key;
1140		} else
1141			rb_key += jiffies;
1142	} else if (!add_front) {
1143		/*
1144		 * Get our rb key offset. Subtract any residual slice
1145		 * value carried from last service. A negative resid
1146		 * count indicates slice overrun, and this should position
1147		 * the next service time further away in the tree.
1148		 */
1149		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1150		rb_key -= cfqq->slice_resid;
1151		cfqq->slice_resid = 0;
1152	} else {
1153		rb_key = -HZ;
1154		__cfqq = cfq_rb_first(service_tree);
1155		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1156	}
1157
1158	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1159		new_cfqq = 0;
1160		/*
1161		 * same position, nothing more to do
1162		 */
1163		if (rb_key == cfqq->rb_key &&
1164		    cfqq->service_tree == service_tree)
1165			return;
1166
1167		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1168		cfqq->service_tree = NULL;
1169	}
1170
1171	left = 1;
1172	parent = NULL;
1173	cfqq->service_tree = service_tree;
1174	p = &service_tree->rb.rb_node;
1175	while (*p) {
1176		struct rb_node **n;
1177
1178		parent = *p;
1179		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1180
1181		/*
1182		 * sort by key, that represents service time.
1183		 * sort by key, which represents service time.
1184		if (time_before(rb_key, __cfqq->rb_key))
1185			n = &(*p)->rb_left;
1186		else {
1187			n = &(*p)->rb_right;
1188			left = 0;
1189		}
1190
1191		p = n;
1192	}
1193
1194	if (left)
1195		service_tree->left = &cfqq->rb_node;
1196
1197	cfqq->rb_key = rb_key;
1198	rb_link_node(&cfqq->rb_node, parent, p);
1199	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1200	service_tree->count++;
1201	if ((add_front || !new_cfqq) && !group_changed)
1202		return;
1203	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
1204}
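/*
 * Summary of the rb_key choices above: idle-class queues are keyed
 * CFQ_IDLE_DELAY past the current tail so they always sort last, a normal
 * add is keyed off jiffies plus a priority-based offset minus any unused
 * slice residual, and add_front uses a key just below the current leftmost
 * entry so the queue is serviced next.
 */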
1205
1206static struct cfq_queue *
1207cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1208		     sector_t sector, struct rb_node **ret_parent,
1209		     struct rb_node ***rb_link)
1210{
1211	struct rb_node **p, *parent;
1212	struct cfq_queue *cfqq = NULL;
1213
1214	parent = NULL;
1215	p = &root->rb_node;
1216	while (*p) {
1217		struct rb_node **n;
1218
1219		parent = *p;
1220		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1221
1222		/*
1223		 * Sort strictly based on sector.  Smallest to the left,
1224		 * largest to the right.
1225		 */
1226		if (sector > blk_rq_pos(cfqq->next_rq))
1227			n = &(*p)->rb_right;
1228		else if (sector < blk_rq_pos(cfqq->next_rq))
1229			n = &(*p)->rb_left;
1230		else
1231			break;
1232		p = n;
1233		cfqq = NULL;
1234	}
1235
1236	*ret_parent = parent;
1237	if (rb_link)
1238		*rb_link = p;
1239	return cfqq;
1240}
1241
1242static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1243{
1244	struct rb_node **p, *parent;
1245	struct cfq_queue *__cfqq;
1246
1247	if (cfqq->p_root) {
1248		rb_erase(&cfqq->p_node, cfqq->p_root);
1249		cfqq->p_root = NULL;
1250	}
1251
1252	if (cfq_class_idle(cfqq))
1253		return;
1254	if (!cfqq->next_rq)
1255		return;
1256
1257	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1258	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1259				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1260	if (!__cfqq) {
1261		rb_link_node(&cfqq->p_node, parent, p);
1262		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1263	} else
1264		cfqq->p_root = NULL;
1265}
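/*
 * Note that a prio tree holds at most one queue per next_rq position: if
 * the lookup above finds an existing queue at the same sector, this cfqq
 * is simply left out of the tree (p_root stays NULL) until its next_rq
 * changes again.
 */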
1266
1267/*
1268 * Update cfqq's position in the service tree.
1269 */
1270static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1271{
1272	/*
1273	 * Resorting requires the cfqq to be on the RR list already.
1274	 */
1275	if (cfq_cfqq_on_rr(cfqq)) {
1276		cfq_service_tree_add(cfqd, cfqq, 0);
1277		cfq_prio_tree_add(cfqd, cfqq);
1278	}
1279}
1280
1281/*
1282 * add to busy list of queues for service, trying to be fair in ordering
1283 * the pending list according to last request service
1284 */
1285static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1286{
1287	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1288	BUG_ON(cfq_cfqq_on_rr(cfqq));
1289	cfq_mark_cfqq_on_rr(cfqq);
1290	cfqd->busy_queues++;
1291
1292	cfq_resort_rr_list(cfqd, cfqq);
1293}
1294
1295/*
1296 * Called when the cfqq no longer has requests pending, remove it from
1297 * the service tree.
1298 */
1299static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1300{
1301	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1302	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1303	cfq_clear_cfqq_on_rr(cfqq);
1304
1305	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1306		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1307		cfqq->service_tree = NULL;
1308	}
1309	if (cfqq->p_root) {
1310		rb_erase(&cfqq->p_node, cfqq->p_root);
1311		cfqq->p_root = NULL;
1312	}
1313
1314	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1315	BUG_ON(!cfqd->busy_queues);
1316	cfqd->busy_queues--;
1317}
1318
1319/*
1320 * rb tree support functions
1321 */
1322static void cfq_del_rq_rb(struct request *rq)
1323{
1324	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1325	const int sync = rq_is_sync(rq);
1326
1327	BUG_ON(!cfqq->queued[sync]);
1328	cfqq->queued[sync]--;
1329
1330	elv_rb_del(&cfqq->sort_list, rq);
1331
1332	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1333		/*
1334		 * Queue will be deleted from service tree when we actually
1335		 * expire it later. Right now just remove it from prio tree
1336		 * as it is empty.
1337		 */
1338		if (cfqq->p_root) {
1339			rb_erase(&cfqq->p_node, cfqq->p_root);
1340			cfqq->p_root = NULL;
1341		}
1342	}
1343}
1344
1345static void cfq_add_rq_rb(struct request *rq)
1346{
1347	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1348	struct cfq_data *cfqd = cfqq->cfqd;
1349	struct request *__alias, *prev;
1350
1351	cfqq->queued[rq_is_sync(rq)]++;
1352
1353	/*
1354	 * looks a little odd, but the first insert might return an alias.
1355	 * if that happens, put the alias on the dispatch list
1356	 */
1357	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1358		cfq_dispatch_insert(cfqd->queue, __alias);
1359
1360	if (!cfq_cfqq_on_rr(cfqq))
1361		cfq_add_cfqq_rr(cfqd, cfqq);
1362
1363	/*
1364	 * check if this request is a better next-serve candidate
1365	 */
1366	prev = cfqq->next_rq;
1367	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1368
1369	/*
1370	 * adjust priority tree position, if ->next_rq changes
1371	 */
1372	if (prev != cfqq->next_rq)
1373		cfq_prio_tree_add(cfqd, cfqq);
1374
1375	BUG_ON(!cfqq->next_rq);
1376}
1377
1378static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1379{
1380	elv_rb_del(&cfqq->sort_list, rq);
1381	cfqq->queued[rq_is_sync(rq)]--;
1382	cfq_add_rq_rb(rq);
1383}
1384
1385static struct request *
1386cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1387{
1388	struct task_struct *tsk = current;
1389	struct cfq_io_context *cic;
1390	struct cfq_queue *cfqq;
1391
1392	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1393	if (!cic)
1394		return NULL;
1395
1396	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1397	if (cfqq) {
1398		sector_t sector = bio->bi_sector + bio_sectors(bio);
1399
1400		return elv_rb_find(&cfqq->sort_list, sector);
1401	}
1402
1403	return NULL;
1404}
1405
1406static void cfq_activate_request(struct request_queue *q, struct request *rq)
1407{
1408	struct cfq_data *cfqd = q->elevator->elevator_data;
1409
1410	cfqd->rq_in_driver++;
1411	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1412						cfqd->rq_in_driver);
1413
1414	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1415}
1416
1417static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1418{
1419	struct cfq_data *cfqd = q->elevator->elevator_data;
1420
1421	WARN_ON(!cfqd->rq_in_driver);
1422	cfqd->rq_in_driver--;
1423	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1424						cfqd->rq_in_driver);
1425}
1426
1427static void cfq_remove_request(struct request *rq)
1428{
1429	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1430
1431	if (cfqq->next_rq == rq)
1432		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1433
1434	list_del_init(&rq->queuelist);
1435	cfq_del_rq_rb(rq);
1436
1437	cfqq->cfqd->rq_queued--;
1438	if (rq_is_meta(rq)) {
1439		WARN_ON(!cfqq->meta_pending);
1440		cfqq->meta_pending--;
1441	}
1442}
1443
1444static int cfq_merge(struct request_queue *q, struct request **req,
1445		     struct bio *bio)
1446{
1447	struct cfq_data *cfqd = q->elevator->elevator_data;
1448	struct request *__rq;
1449
1450	__rq = cfq_find_rq_fmerge(cfqd, bio);
1451	if (__rq && elv_rq_merge_ok(__rq, bio)) {
1452		*req = __rq;
1453		return ELEVATOR_FRONT_MERGE;
1454	}
1455
1456	return ELEVATOR_NO_MERGE;
1457}
1458
1459static void cfq_merged_request(struct request_queue *q, struct request *req,
1460			       int type)
1461{
1462	if (type == ELEVATOR_FRONT_MERGE) {
1463		struct cfq_queue *cfqq = RQ_CFQQ(req);
1464
1465		cfq_reposition_rq_rb(cfqq, req);
1466	}
1467}
1468
1469static void
1470cfq_merged_requests(struct request_queue *q, struct request *rq,
1471		    struct request *next)
1472{
1473	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1474	/*
1475	 * reposition in fifo if next is older than rq
1476	 */
1477	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1478	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1479		list_move(&rq->queuelist, &next->queuelist);
1480		rq_set_fifo_time(rq, rq_fifo_time(next));
1481	}
1482
1483	if (cfqq->next_rq == next)
1484		cfqq->next_rq = rq;
1485	cfq_remove_request(next);
1486}
1487
1488static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1489			   struct bio *bio)
1490{
1491	struct cfq_data *cfqd = q->elevator->elevator_data;
1492	struct cfq_io_context *cic;
1493	struct cfq_queue *cfqq;
1494
1495	/*
1496	 * Disallow merge of a sync bio into an async request.
1497	 */
1498	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1499		return false;
1500
1501	/*
1502	 * Lookup the cfqq that this bio will be queued with. Allow
1503	 * merge only if rq is queued there.
1504	 */
1505	cic = cfq_cic_lookup(cfqd, current->io_context);
1506	if (!cic)
1507		return false;
1508
1509	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1510	return cfqq == RQ_CFQQ(rq);
1511}
1512
1513static void __cfq_set_active_queue(struct cfq_data *cfqd,
1514				   struct cfq_queue *cfqq)
1515{
1516	if (cfqq) {
1517		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1518				cfqd->serving_prio, cfqd->serving_type);
1519		cfqq->slice_start = 0;
1520		cfqq->dispatch_start = jiffies;
1521		cfqq->allocated_slice = 0;
1522		cfqq->slice_end = 0;
1523		cfqq->slice_dispatch = 0;
1524
1525		cfq_clear_cfqq_wait_request(cfqq);
1526		cfq_clear_cfqq_must_dispatch(cfqq);
1527		cfq_clear_cfqq_must_alloc_slice(cfqq);
1528		cfq_clear_cfqq_fifo_expire(cfqq);
1529		cfq_mark_cfqq_slice_new(cfqq);
1530
1531		del_timer(&cfqd->idle_slice_timer);
1532	}
1533
1534	cfqd->active_queue = cfqq;
1535}
1536
1537/*
1538 * current cfqq expired its slice (or was too idle), select new one
1539 */
1540static void
1541__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1542		    bool timed_out)
1543{
1544	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1545
1546	if (cfq_cfqq_wait_request(cfqq))
1547		del_timer(&cfqd->idle_slice_timer);
1548
1549	cfq_clear_cfqq_wait_request(cfqq);
1550	cfq_clear_cfqq_wait_busy(cfqq);
1551
1552	/*
1553	 * If this cfqq is shared between multiple processes, check to
1554	 * make sure that those processes are still issuing I/Os within
1555	 * the mean seek distance.  If not, it may be time to break the
1556	 * queues apart again.
1557	 */
1558	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1559		cfq_mark_cfqq_split_coop(cfqq);
1560
1561	/*
1562	 * store what was left of this slice, if the queue idled/timed out
1563	 */
1564	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
1565		cfqq->slice_resid = cfqq->slice_end - jiffies;
1566		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1567	}
1568
1569	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1570
1571	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1572		cfq_del_cfqq_rr(cfqd, cfqq);
1573
1574	cfq_resort_rr_list(cfqd, cfqq);
1575
1576	if (cfqq == cfqd->active_queue)
1577		cfqd->active_queue = NULL;
1578
1579	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1580		cfqd->grp_service_tree.active = NULL;
1581
1582	if (cfqd->active_cic) {
1583		put_io_context(cfqd->active_cic->ioc);
1584		cfqd->active_cic = NULL;
1585	}
1586}
1587
1588static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1589{
1590	struct cfq_queue *cfqq = cfqd->active_queue;
1591
1592	if (cfqq)
1593		__cfq_slice_expired(cfqd, cfqq, timed_out);
1594}
1595
1596/*
1597 * Get next queue for service. Unless we have a queue preemption,
1598 * we'll simply select the first cfqq in the service tree.
1599 */
1600static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1601{
1602	struct cfq_rb_root *service_tree =
1603		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1604					cfqd->serving_type);
1605
1606	if (!cfqd->rq_queued)
1607		return NULL;
1608
1609	/* There is nothing to dispatch */
1610	if (!service_tree)
1611		return NULL;
1612	if (RB_EMPTY_ROOT(&service_tree->rb))
1613		return NULL;
1614	return cfq_rb_first(service_tree);
1615}
1616
1617static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1618{
1619	struct cfq_group *cfqg;
1620	struct cfq_queue *cfqq;
1621	int i, j;
1622	struct cfq_rb_root *st;
1623
1624	if (!cfqd->rq_queued)
1625		return NULL;
1626
1627	cfqg = cfq_get_next_cfqg(cfqd);
1628	if (!cfqg)
1629		return NULL;
1630
1631	for_each_cfqg_st(cfqg, i, j, st)
1632		if ((cfqq = cfq_rb_first(st)) != NULL)
1633			return cfqq;
1634	return NULL;
1635}
1636
1637/*
1638 * Get and set a new active queue for service.
1639 */
1640static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
1641					      struct cfq_queue *cfqq)
1642{
1643	if (!cfqq)
1644		cfqq = cfq_get_next_queue(cfqd);
1645
1646	__cfq_set_active_queue(cfqd, cfqq);
1647	return cfqq;
1648}
1649
1650static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1651					  struct request *rq)
1652{
1653	if (blk_rq_pos(rq) >= cfqd->last_position)
1654		return blk_rq_pos(rq) - cfqd->last_position;
1655	else
1656		return cfqd->last_position - blk_rq_pos(rq);
1657}
1658
1659static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1660			       struct request *rq)
1661{
1662	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1663}
1664
1665static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1666				    struct cfq_queue *cur_cfqq)
1667{
1668	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1669	struct rb_node *parent, *node;
1670	struct cfq_queue *__cfqq;
1671	sector_t sector = cfqd->last_position;
1672
1673	if (RB_EMPTY_ROOT(root))
1674		return NULL;
1675
1676	/*
1677	 * First, if we find a request starting at the end of the last
1678	 * request, choose it.
1679	 */
1680	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1681	if (__cfqq)
1682		return __cfqq;
1683
1684	/*
1685	 * If the exact sector wasn't found, the parent of the NULL leaf
1686	 * will contain the closest sector.
1687	 */
1688	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
1689	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1690		return __cfqq;
1691
1692	if (blk_rq_pos(__cfqq->next_rq) < sector)
1693		node = rb_next(&__cfqq->p_node);
1694	else
1695		node = rb_prev(&__cfqq->p_node);
1696	if (!node)
1697		return NULL;
1698
1699	__cfqq = rb_entry(node, struct cfq_queue, p_node);
1700	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1701		return __cfqq;
1702
1703	return NULL;
1704}
1705
1706/*
1707 * cfqd - obvious
1708 * cur_cfqq - passed in so that we don't decide that the current queue is
1709 * 	      closely cooperating with itself.
1710 *
1711 * So, basically we're assuming that cur_cfqq has dispatched at least
1712 * one request, and that cfqd->last_position reflects a position on the disk
1713 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1714 * assumption.
1715 */
1716static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1717					      struct cfq_queue *cur_cfqq)
1718{
1719	struct cfq_queue *cfqq;
1720
1721	if (cfq_class_idle(cur_cfqq))
1722		return NULL;
1723	if (!cfq_cfqq_sync(cur_cfqq))
1724		return NULL;
1725	if (CFQQ_SEEKY(cur_cfqq))
1726		return NULL;
1727
1728	/*
1729	 * Don't search priority tree if it's the only queue in the group.
1730	 */
1731	if (cur_cfqq->cfqg->nr_cfqq == 1)
1732		return NULL;
1733
1734	/*
1735	 * We should notice if some of the queues are cooperating, eg
1736	 * working closely on the same area of the disk. In that case,
1737	 * we can group them together and don't waste time idling.
1738	 */
1739	cfqq = cfqq_close(cfqd, cur_cfqq);
1740	if (!cfqq)
1741		return NULL;
1742
1743	/* If new queue belongs to different cfq_group, don't choose it */
1744	if (cur_cfqq->cfqg != cfqq->cfqg)
1745		return NULL;
1746
1747	/*
1748	 * It only makes sense to merge sync queues.
1749	 */
1750	if (!cfq_cfqq_sync(cfqq))
1751		return NULL;
1752	if (CFQQ_SEEKY(cfqq))
1753		return NULL;
1754
1755	/*
1756	 * Do not merge queues of different priority classes
1757	 */
1758	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1759		return NULL;
1760
1761	return cfqq;
1762}
1763
1764/*
1765 * Determine whether we should enforce idle window for this queue.
1766 */
1767
1768static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1769{
1770	enum wl_prio_t prio = cfqq_prio(cfqq);
1771	struct cfq_rb_root *service_tree = cfqq->service_tree;
1772
1773	BUG_ON(!service_tree);
1774	BUG_ON(!service_tree->count);
1775
1776	/* We never do for idle class queues. */
1777	if (prio == IDLE_WORKLOAD)
1778		return false;
1779
1780	/* We do for queues that were marked with idle window flag. */
1781	if (cfq_cfqq_idle_window(cfqq) &&
1782	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1783		return true;
1784
1785	/*
1786	 * Otherwise, we do only if they are the last ones
1787	 * in their service tree.
1788	 */
1789	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1790		return 1;
1791	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1792			service_tree->count);
1793	return 0;
1794}
1795
1796static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1797{
1798	struct cfq_queue *cfqq = cfqd->active_queue;
1799	struct cfq_io_context *cic;
1800	unsigned long sl;
1801
1802	/*
1803	 * SSD device without seek penalty, disable idling. But only do so
1804	 * for devices that support queuing, otherwise we still have a problem
1805	 * with sync vs async workloads.
1806	 */
1807	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1808		return;
1809
1810	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1811	WARN_ON(cfq_cfqq_slice_new(cfqq));
1812
1813	/*
1814	 * idle is disabled, either manually or by past process history
1815	 */
1816	if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
1817		return;
1818
1819	/*
1820	 * still active requests from this queue, don't idle
1821	 */
1822	if (cfqq->dispatched)
1823		return;
1824
1825	/*
1826	 * task has exited, don't wait
1827	 */
1828	cic = cfqd->active_cic;
1829	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1830		return;
1831
1832	/*
1833	 * If our average think time is larger than the remaining time
1834	 * slice, then don't idle. This avoids overrunning the allotted
1835	 * time slice.
1836	 */
1837	if (sample_valid(cic->ttime_samples) &&
1838	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1839		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
1840				cic->ttime_mean);
1841		return;
1842	}
1843
1844	cfq_mark_cfqq_wait_request(cfqq);
1845
1846	sl = cfqd->cfq_slice_idle;
1847
1848	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1849	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1850}
1851
1852/*
1853 * Move request from internal lists to the request queue dispatch list.
1854 */
1855static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1856{
1857	struct cfq_data *cfqd = q->elevator->elevator_data;
1858	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1859
1860	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1861
1862	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1863	cfq_remove_request(rq);
1864	cfqq->dispatched++;
1865	elv_dispatch_sort(q, rq);
1866
1867	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1868}
1869
1870/*
1871 * return expired entry, or NULL to just start from scratch in rbtree
1872 */
1873static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1874{
1875	struct request *rq = NULL;
1876
1877	if (cfq_cfqq_fifo_expire(cfqq))
1878		return NULL;
1879
1880	cfq_mark_cfqq_fifo_expire(cfqq);
1881
1882	if (list_empty(&cfqq->fifo))
1883		return NULL;
1884
1885	rq = rq_entry_fifo(cfqq->fifo.next);
1886	if (time_before(jiffies, rq_fifo_time(rq)))
1887		rq = NULL;
1888
1889	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1890	return rq;
1891}
1892
1893static inline int
1894cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1895{
1896	const int base_rq = cfqd->cfq_slice_async_rq;
1897
1898	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1899
1900	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1901}
1902
1903/*
1904 * Must be called with the queue_lock held.
1905 */
1906static int cfqq_process_refs(struct cfq_queue *cfqq)
1907{
1908	int process_refs, io_refs;
1909
1910	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1911	process_refs = atomic_read(&cfqq->ref) - io_refs;
1912	BUG_ON(process_refs < 0);
1913	return process_refs;
1914}
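/*
 * A cfqq holds one reference per allocated request, so subtracting
 * allocated[READ] + allocated[WRITE] approximates the references held on
 * behalf of processes; cfq_setup_merge() below uses this count to pick
 * the merge direction.
 */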
1915
1916static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1917{
1918	int process_refs, new_process_refs;
1919	struct cfq_queue *__cfqq;
1920
1921	/* Avoid a circular list and skip interim queue merges */
1922	while ((__cfqq = new_cfqq->new_cfqq)) {
1923		if (__cfqq == cfqq)
1924			return;
1925		new_cfqq = __cfqq;
1926	}
1927
1928	process_refs = cfqq_process_refs(cfqq);
1929	/*
1930	 * If the process for the cfqq has gone away, there is no
1931	 * sense in merging the queues.
1932	 */
1933	if (process_refs == 0)
1934		return;
1935
1936	/*
1937	 * Merge in the direction of the lesser amount of work.
1938	 */
1939	new_process_refs = cfqq_process_refs(new_cfqq);
1940	if (new_process_refs >= process_refs) {
1941		cfqq->new_cfqq = new_cfqq;
1942		atomic_add(process_refs, &new_cfqq->ref);
1943	} else {
1944		new_cfqq->new_cfqq = cfqq;
1945		atomic_add(new_process_refs, &cfqq->ref);
1946	}
1947}
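/*
 * Example for cfq_setup_merge() above: if cfqq has 2 process references
 * and new_cfqq has 5, cfqq is the cheaper side to redirect, so
 * cfqq->new_cfqq is pointed at new_cfqq and 2 extra references are added
 * to new_cfqq to pin it until the processes behind cfqq are switched over.
 */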
1948
1949static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
1950				struct cfq_group *cfqg, enum wl_prio_t prio)
1951{
1952	struct cfq_queue *queue;
1953	int i;
1954	bool key_valid = false;
1955	unsigned long lowest_key = 0;
1956	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
1957
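	/*
	 * Scan all workload types and pick the one whose first queued cfqq
	 * has the earliest rb_key. If no service tree has a queue, we fall
	 * back to SYNC_NOIDLE_WORKLOAD (the initial value of cur_best).
	 */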
1958	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
1959		/* select the one with lowest rb_key */
1960		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
1961		if (queue &&
1962		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
1963			lowest_key = queue->rb_key;
1964			cur_best = i;
1965			key_valid = true;
1966		}
1967	}
1968
1969	return cur_best;
1970}
1971
1972static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
1973{
1974	unsigned slice;
1975	unsigned count;
1976	struct cfq_rb_root *st;
1977	unsigned group_slice;
1978
1979	if (!cfqg) {
1980		cfqd->serving_prio = IDLE_WORKLOAD;
1981		cfqd->workload_expires = jiffies + 1;
1982		return;
1983	}
1984
1985	/* Choose next priority. RT > BE > IDLE */
1986	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
1987		cfqd->serving_prio = RT_WORKLOAD;
1988	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
1989		cfqd->serving_prio = BE_WORKLOAD;
1990	else {
1991		cfqd->serving_prio = IDLE_WORKLOAD;
1992		cfqd->workload_expires = jiffies + 1;
1993		return;
1994	}
1995
1996	/*
1997	 * For RT and BE, we also have to choose the type
1998	 * (SYNC, SYNC_NOIDLE, ASYNC) and compute a workload
1999	 * expiration time
2000	 */
2001	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2002	count = st->count;
2003
2004	/*
2005	 * check workload expiration, and that we still have other queues ready
2006	 */
2007	if (count && !time_after(jiffies, cfqd->workload_expires))
2008		return;
2009
2010	/* otherwise select new workload type */
2011	cfqd->serving_type =
2012		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2013	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2014	count = st->count;
2015
2016	/*
2017	 * the workload slice is computed as a fraction of target latency
2018	 * proportional to the number of queues in that workload, over
2019	 * all the queues in the same priority class
2020	 */
2021	group_slice = cfq_group_slice(cfqd, cfqg);
2022
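	/*
	 * For example, a 300ms group slice with 2 busy queues in the chosen
	 * workload out of 6 busy queues in this priority class gives a
	 * workload slice of roughly 300 * 2 / 6 = 100ms.
	 */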
2023	slice = group_slice * count /
2024		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2025		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2026
2027	if (cfqd->serving_type == ASYNC_WORKLOAD) {
2028		unsigned int tmp;
2029
2030		/*
2031		 * Async queues are currently system wide. Just taking the
2032		 * proportion of queues within the same group will lead to a
2033		 * higher async ratio system wide, as the root group generally
2034		 * has a higher weight. A more accurate approach would be to
2035		 * calculate a system wide async/sync ratio.
2036		 */
2037		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2038		tmp = tmp/cfqd->busy_queues;
2039		slice = min_t(unsigned, slice, tmp);
2040
2041		/* async workload slice is scaled down according to
2042		 * the sync/async slice ratio. */
2043		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2044	} else
2045		/* sync workload slice is at least 2 * cfq_slice_idle */
2046		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2047
2048	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2049	cfq_log(cfqd, "workload slice:%d", slice);
2050	cfqd->workload_expires = jiffies + slice;
2051	cfqd->noidle_tree_requires_idle = false;
2052}
2053
2054static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2055{
2056	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2057	struct cfq_group *cfqg;
2058
2059	if (RB_EMPTY_ROOT(&st->rb))
2060		return NULL;
2061	cfqg = cfq_rb_first_group(st);
2062	st->active = &cfqg->rb_node;
2063	update_min_vdisktime(st);
2064	return cfqg;
2065}
2066
2067static void cfq_choose_cfqg(struct cfq_data *cfqd)
2068{
2069	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2070
2071	cfqd->serving_group = cfqg;
2072
2073	/* Restore the workload type data */
2074	if (cfqg->saved_workload_slice) {
2075		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2076		cfqd->serving_type = cfqg->saved_workload;
2077		cfqd->serving_prio = cfqg->saved_serving_prio;
2078	} else
2079		cfqd->workload_expires = jiffies - 1;
2080
2081	choose_service_tree(cfqd, cfqg);
2082}
2083
2084/*
2085 * Select a queue for service. If we have a current active queue,
2086 * check whether to continue servicing it, or retrieve and set a new one.
2087 */
2088static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2089{
2090	struct cfq_queue *cfqq, *new_cfqq = NULL;
2091
2092	cfqq = cfqd->active_queue;
2093	if (!cfqq)
2094		goto new_queue;
2095
2096	if (!cfqd->rq_queued)
2097		return NULL;
2098
2099	/*
2100	 * We were waiting for group to get backlogged. Expire the queue
2101	 */
2102	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2103		goto expire;
2104
2105	/*
2106	 * The active queue has run out of time, expire it and select new.
2107	 */
2108	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2109		/*
2110		 * If the slice had not expired at the completion of the last
2111		 * request, we might not have turned on the wait_busy flag.
2112		 * Don't expire the queue yet. Allow the group to get backlogged.
2113		 *
2114		 * The very fact that we have used up the slice means we have
2115		 * been idling all along on this queue, so it should be ok to
2116		 * wait for this request to complete.
2117		 */
2118		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2119		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2120			cfqq = NULL;
2121			goto keep_queue;
2122		} else
2123			goto expire;
2124	}
2125
2126	/*
2127	 * The active queue has requests and isn't expired, allow it to
2128	 * dispatch.
2129	 */
2130	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2131		goto keep_queue;
2132
2133	/*
2134	 * If another queue has a request waiting within our mean seek
2135	 * distance, let it run.  The expire code will check for close
2136	 * cooperators and put the close queue at the front of the service
2137	 * tree.  If possible, merge the expiring queue with the new cfqq.
2138	 */
2139	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2140	if (new_cfqq) {
2141		if (!cfqq->new_cfqq)
2142			cfq_setup_merge(cfqq, new_cfqq);
2143		goto expire;
2144	}
2145
2146	/*
2147	 * No requests pending. If the active queue still has requests in
2148	 * flight or is idling for a new request, allow either of these
2149	 * conditions to happen (or time out) before selecting a new queue.
2150	 */
2151	if (timer_pending(&cfqd->idle_slice_timer) ||
2152	    (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
2153		cfqq = NULL;
2154		goto keep_queue;
2155	}
2156
2157expire:
2158	cfq_slice_expired(cfqd, 0);
2159new_queue:
2160	/*
2161	 * Current queue expired. Check if we have to switch to a new
2162	 * service tree
2163	 */
2164	if (!new_cfqq)
2165		cfq_choose_cfqg(cfqd);
2166
2167	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2168keep_queue:
2169	return cfqq;
2170}
2171
2172static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2173{
2174	int dispatched = 0;
2175
2176	while (cfqq->next_rq) {
2177		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2178		dispatched++;
2179	}
2180
2181	BUG_ON(!list_empty(&cfqq->fifo));
2182
2183	/* By default cfqq is not expired if it is empty. Do it explicitly */
2184	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2185	return dispatched;
2186}
2187
2188/*
2189 * Drain our current requests. Used for barriers and when switching
2190 * io schedulers on-the-fly.
2191 */
2192static int cfq_forced_dispatch(struct cfq_data *cfqd)
2193{
2194	struct cfq_queue *cfqq;
2195	int dispatched = 0;
2196
2197	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
2198		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2199
2200	cfq_slice_expired(cfqd, 0);
2201	BUG_ON(cfqd->busy_queues);
2202
2203	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2204	return dispatched;
2205}
2206
2207static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2208	struct cfq_queue *cfqq)
2209{
2210	/* the queue hasn't finished any request, can't estimate */
2211	if (cfq_cfqq_slice_new(cfqq))
2212		return 1;
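	/*
	 * Estimate completion of the requests already dispatched, assuming
	 * each takes roughly cfq_slice_idle; if that estimate runs past
	 * slice_end, consider the slice as good as used.
	 */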
2213	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2214		cfqq->slice_end))
2215		return 1;
2216
2217	return 0;
2218}
2219
2220static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2221{
2222	unsigned int max_dispatch;
2223
2224	/*
2225	 * Drain async requests before we start sync IO
2226	 */
2227	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2228		return false;
2229
2230	/*
2231	 * If this is an async queue and we have sync IO in flight, let it wait
2232	 */
2233	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2234		return false;
2235
2236	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2237	if (cfq_class_idle(cfqq))
2238		max_dispatch = 1;
2239
2240	/*
2241	 * Does this cfqq already have too much IO in flight?
2242	 */
2243	if (cfqq->dispatched >= max_dispatch) {
2244		/*
2245		 * idle queue must always only have a single IO in flight
2246		 */
2247		if (cfq_class_idle(cfqq))
2248			return false;
2249
2250		/*
2251		 * We have other queues, don't allow more IO from this one
2252		 */
2253		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
2254			return false;
2255
2256		/*
2257		 * Sole queue user, no limit
2258		 */
2259		if (cfqd->busy_queues == 1)
2260			max_dispatch = -1;
2261		else
2262			/*
2263			 * Normally we start throttling cfqq when cfq_quantum/2
2264			 * requests have been dispatched. But we can drive
2265			 * deeper queue depths at the beginning of the slice,
2266			 * subject to an upper limit of cfq_quantum.
2267			 */
2268			max_dispatch = cfqd->cfq_quantum;
2269	}
2270
2271	/*
2272	 * Async queues must wait a bit before being allowed dispatch.
2273	 * We also ramp up the dispatch depth gradually for async IO,
2274	 * based on the last sync IO we serviced
2275	 */
2276	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2277		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2278		unsigned int depth;
2279
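		/*
		 * Allow roughly one extra dispatch slot per sync slice length
		 * elapsed since the last delayed sync completion, so async
		 * depth only ramps up while sync IO is not being delayed.
		 */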
2280		depth = last_sync / cfqd->cfq_slice[1];
2281		if (!depth && !cfqq->dispatched)
2282			depth = 1;
2283		if (depth < max_dispatch)
2284			max_dispatch = depth;
2285	}
2286
2287	/*
2288	 * If we're below the current max, allow a dispatch
2289	 */
2290	return cfqq->dispatched < max_dispatch;
2291}
2292
2293/*
2294 * Dispatch a request from cfqq, moving it to the request queue
2295 * dispatch list.
2296 */
2297static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2298{
2299	struct request *rq;
2300
2301	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2302
2303	if (!cfq_may_dispatch(cfqd, cfqq))
2304		return false;
2305
2306	/*
2307	 * follow the expired path, else take the next available request
2308	 */
2309	rq = cfq_check_fifo(cfqq);
2310	if (!rq)
2311		rq = cfqq->next_rq;
2312
2313	/*
2314	 * insert request into driver dispatch list
2315	 */
2316	cfq_dispatch_insert(cfqd->queue, rq);
2317
2318	if (!cfqd->active_cic) {
2319		struct cfq_io_context *cic = RQ_CIC(rq);
2320
2321		atomic_long_inc(&cic->ioc->refcount);
2322		cfqd->active_cic = cic;
2323	}
2324
2325	return true;
2326}
2327
2328/*
2329 * Find the cfqq that we need to service and move a request from that to the
2330 * dispatch list
2331 */
2332static int cfq_dispatch_requests(struct request_queue *q, int force)
2333{
2334	struct cfq_data *cfqd = q->elevator->elevator_data;
2335	struct cfq_queue *cfqq;
2336
2337	if (!cfqd->busy_queues)
2338		return 0;
2339
2340	if (unlikely(force))
2341		return cfq_forced_dispatch(cfqd);
2342
2343	cfqq = cfq_select_queue(cfqd);
2344	if (!cfqq)
2345		return 0;
2346
2347	/*
2348	 * Dispatch a request from this cfqq, if it is allowed
2349	 */
2350	if (!cfq_dispatch_request(cfqd, cfqq))
2351		return 0;
2352
2353	cfqq->slice_dispatch++;
2354	cfq_clear_cfqq_must_dispatch(cfqq);
2355
2356	/*
2357	 * expire an async queue immediately if it has used up its slice. idle
2358	 * queues always expire after 1 dispatch round.
2359	 */
2360	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2361	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2362	    cfq_class_idle(cfqq))) {
2363		cfqq->slice_end = jiffies + 1;
2364		cfq_slice_expired(cfqd, 0);
2365	}
2366
2367	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2368	return 1;
2369}
2370
2371/*
2372 * task holds one reference to the queue, dropped when task exits. each rq
2373 * in-flight on this queue also holds a reference, dropped when rq is freed.
2374 *
2375 * Each cfq queue took a reference on the parent group. Drop it now.
2376 * queue lock must be held here.
2377 */
2378static void cfq_put_queue(struct cfq_queue *cfqq)
2379{
2380	struct cfq_data *cfqd = cfqq->cfqd;
2381	struct cfq_group *cfqg, *orig_cfqg;
2382
2383	BUG_ON(atomic_read(&cfqq->ref) <= 0);
2384
2385	if (!atomic_dec_and_test(&cfqq->ref))
2386		return;
2387
2388	cfq_log_cfqq(cfqd, cfqq, "put_queue");
2389	BUG_ON(rb_first(&cfqq->sort_list));
2390	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2391	cfqg = cfqq->cfqg;
2392	orig_cfqg = cfqq->orig_cfqg;
2393
2394	if (unlikely(cfqd->active_queue == cfqq)) {
2395		__cfq_slice_expired(cfqd, cfqq, 0);
2396		cfq_schedule_dispatch(cfqd);
2397	}
2398
2399	BUG_ON(cfq_cfqq_on_rr(cfqq));
2400	kmem_cache_free(cfq_pool, cfqq);
2401	cfq_put_cfqg(cfqg);
2402	if (orig_cfqg)
2403		cfq_put_cfqg(orig_cfqg);
2404}
2405
2406/*
2407 * Must always be called with the rcu_read_lock() held
2408 */
2409static void
2410__call_for_each_cic(struct io_context *ioc,
2411		    void (*func)(struct io_context *, struct cfq_io_context *))
2412{
2413	struct cfq_io_context *cic;
2414	struct hlist_node *n;
2415
2416	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2417		func(ioc, cic);
2418}
2419
2420/*
2421 * Call func for each cic attached to this ioc.
2422 */
2423static void
2424call_for_each_cic(struct io_context *ioc,
2425		  void (*func)(struct io_context *, struct cfq_io_context *))
2426{
2427	rcu_read_lock();
2428	__call_for_each_cic(ioc, func);
2429	rcu_read_unlock();
2430}
2431
2432static void cfq_cic_free_rcu(struct rcu_head *head)
2433{
2434	struct cfq_io_context *cic;
2435
2436	cic = container_of(head, struct cfq_io_context, rcu_head);
2437
2438	kmem_cache_free(cfq_ioc_pool, cic);
2439	elv_ioc_count_dec(cfq_ioc_count);
2440
2441	if (ioc_gone) {
2442		/*
2443		 * CFQ scheduler is exiting, grab exit lock and check
2444		 * the pending io context count. If it hits zero,
2445		 * complete ioc_gone and set it back to NULL
2446		 */
2447		spin_lock(&ioc_gone_lock);
2448		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2449			complete(ioc_gone);
2450			ioc_gone = NULL;
2451		}
2452		spin_unlock(&ioc_gone_lock);
2453	}
2454}
2455
2456static void cfq_cic_free(struct cfq_io_context *cic)
2457{
2458	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2459}
2460
2461static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2462{
2463	unsigned long flags;
2464
2465	BUG_ON(!cic->dead_key);
2466
2467	spin_lock_irqsave(&ioc->lock, flags);
2468	radix_tree_delete(&ioc->radix_root, cic->dead_key);
2469	hlist_del_rcu(&cic->cic_list);
2470	spin_unlock_irqrestore(&ioc->lock, flags);
2471
2472	cfq_cic_free(cic);
2473}
2474
2475/*
2476 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2477 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2478 * and ->trim() which is called with the task lock held
2479 */
2480static void cfq_free_io_context(struct io_context *ioc)
2481{
2482	/*
2483	 * ioc->refcount is zero here, or we are called from elv_unregister(),
2484	 * so no more cic's are allowed to be linked into this ioc.  So it
2485	 * should be ok to iterate over the known list, we will see all cic's
2486	 * since no new ones are added.
2487	 */
2488	__call_for_each_cic(ioc, cic_free_func);
2489}
2490
2491static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2492{
2493	struct cfq_queue *__cfqq, *next;
2494
2495	if (unlikely(cfqq == cfqd->active_queue)) {
2496		__cfq_slice_expired(cfqd, cfqq, 0);
2497		cfq_schedule_dispatch(cfqd);
2498	}
2499
2500	/*
2501	 * If this queue was scheduled to merge with another queue, be
2502	 * sure to drop the reference taken on that queue (and others in
2503	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2504	 */
2505	__cfqq = cfqq->new_cfqq;
2506	while (__cfqq) {
2507		if (__cfqq == cfqq) {
2508			WARN(1, "cfqq->new_cfqq loop detected\n");
2509			break;
2510		}
2511		next = __cfqq->new_cfqq;
2512		cfq_put_queue(__cfqq);
2513		__cfqq = next;
2514	}
2515
2516	cfq_put_queue(cfqq);
2517}
2518
2519static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2520					 struct cfq_io_context *cic)
2521{
2522	struct io_context *ioc = cic->ioc;
2523
2524	list_del_init(&cic->queue_list);
2525
2526	/*
2527	 * Make sure key == NULL is seen for dead queues
2528	 */
2529	smp_wmb();
2530	cic->dead_key = (unsigned long) cic->key;
2531	cic->key = NULL;
2532
2533	if (ioc->ioc_data == cic)
2534		rcu_assign_pointer(ioc->ioc_data, NULL);
2535
2536	if (cic->cfqq[BLK_RW_ASYNC]) {
2537		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2538		cic->cfqq[BLK_RW_ASYNC] = NULL;
2539	}
2540
2541	if (cic->cfqq[BLK_RW_SYNC]) {
2542		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2543		cic->cfqq[BLK_RW_SYNC] = NULL;
2544	}
2545}
2546
2547static void cfq_exit_single_io_context(struct io_context *ioc,
2548				       struct cfq_io_context *cic)
2549{
2550	struct cfq_data *cfqd = cic->key;
2551
2552	if (cfqd) {
2553		struct request_queue *q = cfqd->queue;
2554		unsigned long flags;
2555
2556		spin_lock_irqsave(q->queue_lock, flags);
2557
2558		/*
2559		 * Ensure we get a fresh copy of the ->key to prevent
2560		 * race between exiting task and queue
2561		 */
2562		smp_read_barrier_depends();
2563		if (cic->key)
2564			__cfq_exit_single_io_context(cfqd, cic);
2565
2566		spin_unlock_irqrestore(q->queue_lock, flags);
2567	}
2568}
2569
2570/*
2571 * The process that ioc belongs to has exited; we need to clean up
2572 * and put the internal structures we have that belong to that process.
2573 */
2574static void cfq_exit_io_context(struct io_context *ioc)
2575{
2576	call_for_each_cic(ioc, cfq_exit_single_io_context);
2577}
2578
2579static struct cfq_io_context *
2580cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2581{
2582	struct cfq_io_context *cic;
2583
2584	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2585							cfqd->queue->node);
2586	if (cic) {
2587		cic->last_end_request = jiffies;
2588		INIT_LIST_HEAD(&cic->queue_list);
2589		INIT_HLIST_NODE(&cic->cic_list);
2590		cic->dtor = cfq_free_io_context;
2591		cic->exit = cfq_exit_io_context;
2592		elv_ioc_count_inc(cfq_ioc_count);
2593	}
2594
2595	return cic;
2596}
2597
2598static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2599{
2600	struct task_struct *tsk = current;
2601	int ioprio_class;
2602
2603	if (!cfq_cfqq_prio_changed(cfqq))
2604		return;
2605
2606	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2607	switch (ioprio_class) {
2608	default:
2609		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2610	case IOPRIO_CLASS_NONE:
2611		/*
2612		 * no prio set, inherit CPU scheduling settings
2613		 */
2614		cfqq->ioprio = task_nice_ioprio(tsk);
2615		cfqq->ioprio_class = task_nice_ioclass(tsk);
2616		break;
2617	case IOPRIO_CLASS_RT:
2618		cfqq->ioprio = task_ioprio(ioc);
2619		cfqq->ioprio_class = IOPRIO_CLASS_RT;
2620		break;
2621	case IOPRIO_CLASS_BE:
2622		cfqq->ioprio = task_ioprio(ioc);
2623		cfqq->ioprio_class = IOPRIO_CLASS_BE;
2624		break;
2625	case IOPRIO_CLASS_IDLE:
2626		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2627		cfqq->ioprio = 7;
2628		cfq_clear_cfqq_idle_window(cfqq);
2629		break;
2630	}
2631
2632	/*
2633	 * keep track of original prio settings in case we have to temporarily
2634	 * elevate the priority of this queue
2635	 */
2636	cfqq->org_ioprio = cfqq->ioprio;
2637	cfqq->org_ioprio_class = cfqq->ioprio_class;
2638	cfq_clear_cfqq_prio_changed(cfqq);
2639}
2640
2641static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2642{
2643	struct cfq_data *cfqd = cic->key;
2644	struct cfq_queue *cfqq;
2645	unsigned long flags;
2646
2647	if (unlikely(!cfqd))
2648		return;
2649
2650	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2651
2652	cfqq = cic->cfqq[BLK_RW_ASYNC];
2653	if (cfqq) {
2654		struct cfq_queue *new_cfqq;
2655		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2656						GFP_ATOMIC);
2657		if (new_cfqq) {
2658			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2659			cfq_put_queue(cfqq);
2660		}
2661	}
2662
2663	cfqq = cic->cfqq[BLK_RW_SYNC];
2664	if (cfqq)
2665		cfq_mark_cfqq_prio_changed(cfqq);
2666
2667	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2668}
2669
2670static void cfq_ioc_set_ioprio(struct io_context *ioc)
2671{
2672	call_for_each_cic(ioc, changed_ioprio);
2673	ioc->ioprio_changed = 0;
2674}
2675
2676static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2677			  pid_t pid, bool is_sync)
2678{
2679	RB_CLEAR_NODE(&cfqq->rb_node);
2680	RB_CLEAR_NODE(&cfqq->p_node);
2681	INIT_LIST_HEAD(&cfqq->fifo);
2682
2683	atomic_set(&cfqq->ref, 0);
2684	cfqq->cfqd = cfqd;
2685
2686	cfq_mark_cfqq_prio_changed(cfqq);
2687
2688	if (is_sync) {
2689		if (!cfq_class_idle(cfqq))
2690			cfq_mark_cfqq_idle_window(cfqq);
2691		cfq_mark_cfqq_sync(cfqq);
2692	}
2693	cfqq->pid = pid;
2694}
2695
2696#ifdef CONFIG_CFQ_GROUP_IOSCHED
2697static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2698{
2699	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2700	struct cfq_data *cfqd = cic->key;
2701	unsigned long flags;
2702	struct request_queue *q;
2703
2704	if (unlikely(!cfqd))
2705		return;
2706
2707	q = cfqd->queue;
2708
2709	spin_lock_irqsave(q->queue_lock, flags);
2710
2711	if (sync_cfqq) {
2712		/*
2713		 * Drop reference to sync queue. A new sync queue will be
2714		 * assigned in new group upon arrival of a fresh request.
2715		 */
2716		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2717		cic_set_cfqq(cic, NULL, 1);
2718		cfq_put_queue(sync_cfqq);
2719	}
2720
2721	spin_unlock_irqrestore(q->queue_lock, flags);
2722}
2723
2724static void cfq_ioc_set_cgroup(struct io_context *ioc)
2725{
2726	call_for_each_cic(ioc, changed_cgroup);
2727	ioc->cgroup_changed = 0;
2728}
2729#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2730
2731static struct cfq_queue *
2732cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2733		     struct io_context *ioc, gfp_t gfp_mask)
2734{
2735	struct cfq_queue *cfqq, *new_cfqq = NULL;
2736	struct cfq_io_context *cic;
2737	struct cfq_group *cfqg;
2738
2739retry:
2740	cfqg = cfq_get_cfqg(cfqd, 1);
2741	cic = cfq_cic_lookup(cfqd, ioc);
2742	/* cic always exists here */
2743	cfqq = cic_to_cfqq(cic, is_sync);
2744
2745	/*
2746	 * Always try a new alloc if we fell back to the OOM cfqq
2747	 * originally, since it should just be a temporary situation.
2748	 */
2749	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2750		cfqq = NULL;
2751		if (new_cfqq) {
2752			cfqq = new_cfqq;
2753			new_cfqq = NULL;
2754		} else if (gfp_mask & __GFP_WAIT) {
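			/*
			 * The allocation may sleep, so drop the queue lock
			 * around it and then retry the lookup, since the
			 * cic/cfqq state may have changed while unlocked.
			 */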
2755			spin_unlock_irq(cfqd->queue->queue_lock);
2756			new_cfqq = kmem_cache_alloc_node(cfq_pool,
2757					gfp_mask | __GFP_ZERO,
2758					cfqd->queue->node);
2759			spin_lock_irq(cfqd->queue->queue_lock);
2760			if (new_cfqq)
2761				goto retry;
2762		} else {
2763			cfqq = kmem_cache_alloc_node(cfq_pool,
2764					gfp_mask | __GFP_ZERO,
2765					cfqd->queue->node);
2766		}
2767
2768		if (cfqq) {
2769			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2770			cfq_init_prio_data(cfqq, ioc);
2771			cfq_link_cfqq_cfqg(cfqq, cfqg);
2772			cfq_log_cfqq(cfqd, cfqq, "alloced");
2773		} else
2774			cfqq = &cfqd->oom_cfqq;
2775	}
2776
2777	if (new_cfqq)
2778		kmem_cache_free(cfq_pool, new_cfqq);
2779
2780	return cfqq;
2781}
2782
2783static struct cfq_queue **
2784cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2785{
2786	switch (ioprio_class) {
2787	case IOPRIO_CLASS_RT:
2788		return &cfqd->async_cfqq[0][ioprio];
2789	case IOPRIO_CLASS_BE:
2790		return &cfqd->async_cfqq[1][ioprio];
2791	case IOPRIO_CLASS_IDLE:
2792		return &cfqd->async_idle_cfqq;
2793	default:
2794		BUG();
2795	}
2796}
2797
2798static struct cfq_queue *
2799cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2800	      gfp_t gfp_mask)
2801{
2802	const int ioprio = task_ioprio(ioc);
2803	const int ioprio_class = task_ioprio_class(ioc);
2804	struct cfq_queue **async_cfqq = NULL;
2805	struct cfq_queue *cfqq = NULL;
2806
2807	if (!is_sync) {
2808		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2809		cfqq = *async_cfqq;
2810	}
2811
2812	if (!cfqq)
2813		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2814
2815	/*
2816	 * pin the queue now that it's allocated, scheduler exit will prune it
2817	 */
2818	if (!is_sync && !(*async_cfqq)) {
2819		atomic_inc(&cfqq->ref);
2820		*async_cfqq = cfqq;
2821	}
2822
2823	atomic_inc(&cfqq->ref);
2824	return cfqq;
2825}
2826
2827/*
2828 * We drop cfq io contexts lazily, so we may find a dead one.
2829 */
2830static void
2831cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2832		  struct cfq_io_context *cic)
2833{
2834	unsigned long flags;
2835
2836	WARN_ON(!list_empty(&cic->queue_list));
2837
2838	spin_lock_irqsave(&ioc->lock, flags);
2839
2840	BUG_ON(ioc->ioc_data == cic);
2841
2842	radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
2843	hlist_del_rcu(&cic->cic_list);
2844	spin_unlock_irqrestore(&ioc->lock, flags);
2845
2846	cfq_cic_free(cic);
2847}
2848
2849static struct cfq_io_context *
2850cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2851{
2852	struct cfq_io_context *cic;
2853	unsigned long flags;
2854	void *k;
2855
2856	if (unlikely(!ioc))
2857		return NULL;
2858
2859	rcu_read_lock();
2860
2861	/*
2862	 * we maintain a last-hit cache, to avoid browsing over the tree
2863	 */
2864	cic = rcu_dereference(ioc->ioc_data);
2865	if (cic && cic->key == cfqd) {
2866		rcu_read_unlock();
2867		return cic;
2868	}
2869
2870	do {
2871		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
2872		rcu_read_unlock();
2873		if (!cic)
2874			break;
2875		/* ->key must be copied to avoid race with cfq_exit_queue() */
2876		k = cic->key;
2877		if (unlikely(!k)) {
2878			cfq_drop_dead_cic(cfqd, ioc, cic);
2879			rcu_read_lock();
2880			continue;
2881		}
2882
2883		spin_lock_irqsave(&ioc->lock, flags);
2884		rcu_assign_pointer(ioc->ioc_data, cic);
2885		spin_unlock_irqrestore(&ioc->lock, flags);
2886		break;
2887	} while (1);
2888
2889	return cic;
2890}
2891
2892/*
2893 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
2894 * the process specific cfq io context when entered from the block layer.
2895 * Also adds the cic to a per-cfqd list, used when this queue is removed.
2896 */
2897static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
2898			struct cfq_io_context *cic, gfp_t gfp_mask)
2899{
2900	unsigned long flags;
2901	int ret;
2902
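	/*
	 * Preload the radix tree node allocator so the radix_tree_insert()
	 * done under ioc->lock below does not fail for lack of memory.
	 */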
2903	ret = radix_tree_preload(gfp_mask);
2904	if (!ret) {
2905		cic->ioc = ioc;
2906		cic->key = cfqd;
2907
2908		spin_lock_irqsave(&ioc->lock, flags);
2909		ret = radix_tree_insert(&ioc->radix_root,
2910						(unsigned long) cfqd, cic);
2911		if (!ret)
2912			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
2913		spin_unlock_irqrestore(&ioc->lock, flags);
2914
2915		radix_tree_preload_end();
2916
2917		if (!ret) {
2918			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2919			list_add(&cic->queue_list, &cfqd->cic_list);
2920			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2921		}
2922	}
2923
2924	if (ret)
2925		printk(KERN_ERR "cfq: cic link failed!\n");
2926
2927	return ret;
2928}
2929
2930/*
2931 * Setup general io context and cfq io context. There can be several cfq
2932 * io contexts per general io context, if this process is doing io to more
2933 * than one device managed by cfq.
2934 */
2935static struct cfq_io_context *
2936cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2937{
2938	struct io_context *ioc = NULL;
2939	struct cfq_io_context *cic;
2940
2941	might_sleep_if(gfp_mask & __GFP_WAIT);
2942
2943	ioc = get_io_context(gfp_mask, cfqd->queue->node);
2944	if (!ioc)
2945		return NULL;
2946
2947	cic = cfq_cic_lookup(cfqd, ioc);
2948	if (cic)
2949		goto out;
2950
2951	cic = cfq_alloc_io_context(cfqd, gfp_mask);
2952	if (cic == NULL)
2953		goto err;
2954
2955	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
2956		goto err_free;
2957
2958out:
2959	smp_read_barrier_depends();
2960	if (unlikely(ioc->ioprio_changed))
2961		cfq_ioc_set_ioprio(ioc);
2962
2963#ifdef CONFIG_CFQ_GROUP_IOSCHED
2964	if (unlikely(ioc->cgroup_changed))
2965		cfq_ioc_set_cgroup(ioc);
2966#endif
2967	return cic;
2968err_free:
2969	cfq_cic_free(cic);
2970err:
2971	put_io_context(ioc);
2972	return NULL;
2973}
2974
2975static void
2976cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
2977{
2978	unsigned long elapsed = jiffies - cic->last_end_request;
2979	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
2980
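	/*
	 * Decaying average of think time: each update keeps 7/8 of the old
	 * sums and adds 1/8 of the new sample, everything scaled by 256 to
	 * retain some fixed point precision in jiffies arithmetic.
	 */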
2981	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
2982	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
2983	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
2984}
2985
2986static void
2987cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2988		       struct request *rq)
2989{
2990	sector_t sdist = 0;
2991	sector_t n_sec = blk_rq_sectors(rq);
2992	if (cfqq->last_request_pos) {
2993		if (cfqq->last_request_pos < blk_rq_pos(rq))
2994			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
2995		else
2996			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
2997	}
2998
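	/*
	 * seek_history is a bitmap of recent requests: shift in one bit per
	 * request, set if the request looked seeky (small on non-rotational
	 * devices, a long seek distance otherwise).
	 */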
2999	cfqq->seek_history <<= 1;
3000	if (blk_queue_nonrot(cfqd->queue))
3001		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3002	else
3003		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3004}
3005
3006/*
3007 * Disable idle window if the process thinks too long or seeks so much that
3008 * it doesn't matter
3009 */
3010static void
3011cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3012		       struct cfq_io_context *cic)
3013{
3014	int old_idle, enable_idle;
3015
3016	/*
3017	 * Don't idle for async or idle io prio class
3018	 */
3019	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3020		return;
3021
3022	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3023
3024	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3025		cfq_mark_cfqq_deep(cfqq);
3026
3027	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3028	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3029		enable_idle = 0;
3030	else if (sample_valid(cic->ttime_samples)) {
3031		if (cic->ttime_mean > cfqd->cfq_slice_idle)
3032			enable_idle = 0;
3033		else
3034			enable_idle = 1;
3035	}
3036
3037	if (old_idle != enable_idle) {
3038		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3039		if (enable_idle)
3040			cfq_mark_cfqq_idle_window(cfqq);
3041		else
3042			cfq_clear_cfqq_idle_window(cfqq);
3043	}
3044}
3045
3046/*
3047 * Check if new_cfqq should preempt the currently active queue. Returns
3048 * false for no, or if we aren't sure; true will cause a preempt.
3049 */
3050static bool
3051cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3052		   struct request *rq)
3053{
3054	struct cfq_queue *cfqq;
3055
3056	cfqq = cfqd->active_queue;
3057	if (!cfqq)
3058		return false;
3059
3060	if (cfq_class_idle(new_cfqq))
3061		return false;
3062
3063	if (cfq_class_idle(cfqq))
3064		return true;
3065
3066	/*
3067	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3068	 */
3069	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3070		return false;
3071
3072	/*
3073	 * if the new request is sync, but the currently running queue is
3074	 * not, let the sync request have priority.
3075	 */
3076	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3077		return true;
3078
3079	if (new_cfqq->cfqg != cfqq->cfqg)
3080		return false;
3081
3082	if (cfq_slice_used(cfqq))
3083		return true;
3084
3085	/* Allow preemption only if we are idling on sync-noidle tree */
3086	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3087	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3088	    new_cfqq->service_tree->count == 2 &&
3089	    RB_EMPTY_ROOT(&cfqq->sort_list))
3090		return true;
3091
3092	/*
3093	 * So both queues are sync. Let the new request get disk time if
3094	 * it's a metadata request and the current queue is doing regular IO.
3095	 */
3096	if (rq_is_meta(rq) && !cfqq->meta_pending)
3097		return true;
3098
3099	/*
3100	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3101	 */
3102	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3103		return true;
3104
3105	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3106		return false;
3107
3108	/*
3109	 * if this request is as-good as one we would expect from the
3110	 * current cfqq, let it preempt
3111	 */
3112	if (cfq_rq_close(cfqd, cfqq, rq))
3113		return true;
3114
3115	return false;
3116}
3117
3118/*
3119 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3120 * let it have half of its nominal slice.
3121 */
3122static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3123{
3124	cfq_log_cfqq(cfqd, cfqq, "preempt");
3125	cfq_slice_expired(cfqd, 1);
3126
3127	/*
3128	 * Put the new queue at the front of the current list,
3129	 * so we know that it will be selected next.
3130	 */
3131	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3132
3133	cfq_service_tree_add(cfqd, cfqq, 1);
3134
3135	cfqq->slice_end = 0;
3136	cfq_mark_cfqq_slice_new(cfqq);
3137}
3138
3139/*
3140 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3141 * something we should do about it
3142 */
3143static void
3144cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3145		struct request *rq)
3146{
3147	struct cfq_io_context *cic = RQ_CIC(rq);
3148
3149	cfqd->rq_queued++;
3150	if (rq_is_meta(rq))
3151		cfqq->meta_pending++;
3152
3153	cfq_update_io_thinktime(cfqd, cic);
3154	cfq_update_io_seektime(cfqd, cfqq, rq);
3155	cfq_update_idle_window(cfqd, cfqq, cic);
3156
3157	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3158
3159	if (cfqq == cfqd->active_queue) {
3160		/*
3161		 * Remember that we saw a request from this process, but
3162		 * don't start queuing just yet. Otherwise we risk seeing lots
3163		 * of tiny requests, because we disrupt the normal plugging
3164		 * and merging. If the request is already larger than a single
3165		 * page, let it rip immediately. For that case we assume that
3166		 * merging is already done. Ditto for a busy system that
3167		 * has other work pending, don't risk delaying until the
3168		 * idle timer unplug to continue working.
3169		 */
3170		if (cfq_cfqq_wait_request(cfqq)) {
3171			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3172			    cfqd->busy_queues > 1) {
3173				del_timer(&cfqd->idle_slice_timer);
3174				cfq_clear_cfqq_wait_request(cfqq);
3175				__blk_run_queue(cfqd->queue);
3176			} else
3177				cfq_mark_cfqq_must_dispatch(cfqq);
3178		}
3179	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3180		/*
3181		 * not the active queue - expire current slice if it is
3182		 * idle and has expired its mean thinktime, or this new queue
3183		 * has some old slice time left and is of higher priority, or
3184		 * this new queue is RT and the current one is BE
3185		 */
3186		cfq_preempt_queue(cfqd, cfqq);
3187		__blk_run_queue(cfqd->queue);
3188	}
3189}
3190
3191static void cfq_insert_request(struct request_queue *q, struct request *rq)
3192{
3193	struct cfq_data *cfqd = q->elevator->elevator_data;
3194	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3195
3196	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3197	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3198
3199	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3200	list_add_tail(&rq->queuelist, &cfqq->fifo);
3201	cfq_add_rq_rb(rq);
3202
3203	cfq_rq_enqueued(cfqd, cfqq, rq);
3204}
3205
3206/*
3207 * Update hw_tag based on peak queue depth over 50 samples under
3208 * sufficient load.
3209 */
3210static void cfq_update_hw_tag(struct cfq_data *cfqd)
3211{
3212	struct cfq_queue *cfqq = cfqd->active_queue;
3213
3214	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3215		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3216
3217	if (cfqd->hw_tag == 1)
3218		return;
3219
3220	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3221	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3222		return;
3223
3224	/*
3225	 * If the active queue doesn't have enough requests and can idle, cfq
3226	 * might not dispatch sufficient requests to hardware. Don't zero
3227	 * hw_tag in this case
3228	 */
3229	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3230	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3231	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3232		return;
3233
3234	if (cfqd->hw_tag_samples++ < 50)
3235		return;
3236
3237	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3238		cfqd->hw_tag = 1;
3239	else
3240		cfqd->hw_tag = 0;
3241}
3242
3243static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3244{
3245	struct cfq_io_context *cic = cfqd->active_cic;
3246
3247	/* If there are other queues in the group, don't wait */
3248	if (cfqq->cfqg->nr_cfqq > 1)
3249		return false;
3250
3251	if (cfq_slice_used(cfqq))
3252		return true;
3253
3254	/* if slice left is less than think time, wait busy */
3255	if (cic && sample_valid(cic->ttime_samples)
3256	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
3257		return true;
3258
3259	/*
3260	 * If the think time is less than a jiffy, then ttime_mean=0 and the
3261	 * check above will not be true. It might happen that the slice has
3262	 * not expired yet but will expire soon (4-5 ns) during select_queue().
3263	 * To cover the case where think time is less than a jiffy, mark the
3264	 * queue wait busy if only 1 jiffy is left in the slice.
3265	 */
3266	if (cfqq->slice_end - jiffies == 1)
3267		return true;
3268
3269	return false;
3270}
3271
3272static void cfq_completed_request(struct request_queue *q, struct request *rq)
3273{
3274	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3275	struct cfq_data *cfqd = cfqq->cfqd;
3276	const int sync = rq_is_sync(rq);
3277	unsigned long now;
3278
3279	now = jiffies;
3280	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
3281
3282	cfq_update_hw_tag(cfqd);
3283
3284	WARN_ON(!cfqd->rq_in_driver);
3285	WARN_ON(!cfqq->dispatched);
3286	cfqd->rq_in_driver--;
3287	cfqq->dispatched--;
3288
3289	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3290
3291	if (sync) {
3292		RQ_CIC(rq)->last_end_request = now;
3293		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3294			cfqd->last_delayed_sync = now;
3295	}
3296
3297	/*
3298	 * If this is the active queue, check if it needs to be expired,
3299	 * or if we want to idle in case it has no pending requests.
3300	 */
3301	if (cfqd->active_queue == cfqq) {
3302		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3303
3304		if (cfq_cfqq_slice_new(cfqq)) {
3305			cfq_set_prio_slice(cfqd, cfqq);
3306			cfq_clear_cfqq_slice_new(cfqq);
3307		}
3308
3309		/*
3310		 * Should we wait for the next request to come in before we expire
3311		 * the queue?
3312		 */
3313		if (cfq_should_wait_busy(cfqd, cfqq)) {
3314			cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3315			cfq_mark_cfqq_wait_busy(cfqq);
3316			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3317		}
3318
3319		/*
3320		 * Idling is not enabled on:
3321		 * - expired queues
3322		 * - idle-priority queues
3323		 * - async queues
3324		 * - queues with still some requests queued
3325		 * - when there is a close cooperator
3326		 */
3327		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3328			cfq_slice_expired(cfqd, 1);
3329		else if (sync && cfqq_empty &&
3330			 !cfq_close_cooperator(cfqd, cfqq)) {
3331			cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3332			/*
3333			 * Idling is enabled for SYNC_WORKLOAD.
3334			 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3335			 * only if we processed at least one !rq_noidle request
3336			 */
3337			if (cfqd->serving_type == SYNC_WORKLOAD
3338			    || cfqd->noidle_tree_requires_idle
3339			    || cfqq->cfqg->nr_cfqq == 1)
3340				cfq_arm_slice_timer(cfqd);
3341		}
3342	}
3343
3344	if (!cfqd->rq_in_driver)
3345		cfq_schedule_dispatch(cfqd);
3346}
3347
3348/*
3349 * we temporarily boost lower priority queues if they are holding fs exclusive
3350 * resources. they are boosted to normal prio (CLASS_BE/4)
3351 */
3352static void cfq_prio_boost(struct cfq_queue *cfqq)
3353{
3354	if (has_fs_excl()) {
3355		/*
3356		 * boost idle prio on transactions that would lock out other
3357		 * users of the filesystem
3358		 */
3359		if (cfq_class_idle(cfqq))
3360			cfqq->ioprio_class = IOPRIO_CLASS_BE;
3361		if (cfqq->ioprio > IOPRIO_NORM)
3362			cfqq->ioprio = IOPRIO_NORM;
3363	} else {
3364		/*
3365		 * unboost the queue (if needed)
3366		 */
3367		cfqq->ioprio_class = cfqq->org_ioprio_class;
3368		cfqq->ioprio = cfqq->org_ioprio;
3369	}
3370}
3371
3372static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3373{
3374	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3375		cfq_mark_cfqq_must_alloc_slice(cfqq);
3376		return ELV_MQUEUE_MUST;
3377	}
3378
3379	return ELV_MQUEUE_MAY;
3380}
3381
3382static int cfq_may_queue(struct request_queue *q, int rw)
3383{
3384	struct cfq_data *cfqd = q->elevator->elevator_data;
3385	struct task_struct *tsk = current;
3386	struct cfq_io_context *cic;
3387	struct cfq_queue *cfqq;
3388
3389	/*
3390	 * don't force setup of a queue from here, as a call to may_queue
3391	 * does not necessarily imply that a request actually will be queued.
3392	 * so just lookup a possibly existing queue, or return 'may queue'
3393	 * if that fails
3394	 */
3395	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3396	if (!cic)
3397		return ELV_MQUEUE_MAY;
3398
3399	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3400	if (cfqq) {
3401		cfq_init_prio_data(cfqq, cic->ioc);
3402		cfq_prio_boost(cfqq);
3403
3404		return __cfq_may_queue(cfqq);
3405	}
3406
3407	return ELV_MQUEUE_MAY;
3408}
3409
3410/*
3411 * queue lock held here
3412 */
3413static void cfq_put_request(struct request *rq)
3414{
3415	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3416
3417	if (cfqq) {
3418		const int rw = rq_data_dir(rq);
3419
3420		BUG_ON(!cfqq->allocated[rw]);
3421		cfqq->allocated[rw]--;
3422
3423		put_io_context(RQ_CIC(rq)->ioc);
3424
3425		rq->elevator_private = NULL;
3426		rq->elevator_private2 = NULL;
3427
3428		cfq_put_queue(cfqq);
3429	}
3430}
3431
3432static struct cfq_queue *
3433cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3434		struct cfq_queue *cfqq)
3435{
3436	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3437	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3438	cfq_mark_cfqq_coop(cfqq->new_cfqq);
3439	cfq_put_queue(cfqq);
3440	return cic_to_cfqq(cic, 1);
3441}
3442
3443/*
3444 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3445 * was the last process referring to said cfqq.
3446 */
3447static struct cfq_queue *
3448split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3449{
3450	if (cfqq_process_refs(cfqq) == 1) {
3451		cfqq->pid = current->pid;
3452		cfq_clear_cfqq_coop(cfqq);
3453		cfq_clear_cfqq_split_coop(cfqq);
3454		return cfqq;
3455	}
3456
3457	cic_set_cfqq(cic, NULL, 1);
3458	cfq_put_queue(cfqq);
3459	return NULL;
3460}
3461/*
3462 * Allocate cfq data structures associated with this request.
3463 */
3464static int
3465cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3466{
3467	struct cfq_data *cfqd = q->elevator->elevator_data;
3468	struct cfq_io_context *cic;
3469	const int rw = rq_data_dir(rq);
3470	const bool is_sync = rq_is_sync(rq);
3471	struct cfq_queue *cfqq;
3472	unsigned long flags;
3473
3474	might_sleep_if(gfp_mask & __GFP_WAIT);
3475
3476	cic = cfq_get_io_context(cfqd, gfp_mask);
3477
3478	spin_lock_irqsave(q->queue_lock, flags);
3479
3480	if (!cic)
3481		goto queue_fail;
3482
3483new_queue:
3484	cfqq = cic_to_cfqq(cic, is_sync);
3485	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3486		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3487		cic_set_cfqq(cic, cfqq, is_sync);
3488	} else {
3489		/*
3490		 * If the queue was seeky for too long, break it apart.
3491		 */
3492		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3493			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3494			cfqq = split_cfqq(cic, cfqq);
3495			if (!cfqq)
3496				goto new_queue;
3497		}
3498
3499		/*
3500		 * Check to see if this queue is scheduled to merge with
3501		 * another, closely cooperating queue.  The merging of
3502		 * queues happens here as it must be done in process context.
3503		 * The reference on new_cfqq was taken in merge_cfqqs.
3504		 */
3505		if (cfqq->new_cfqq)
3506			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3507	}
3508
3509	cfqq->allocated[rw]++;
3510	atomic_inc(&cfqq->ref);
3511
3512	spin_unlock_irqrestore(q->queue_lock, flags);
3513
3514	rq->elevator_private = cic;
3515	rq->elevator_private2 = cfqq;
3516	return 0;
3517
3518queue_fail:
3519	if (cic)
3520		put_io_context(cic->ioc);
3521
3522	cfq_schedule_dispatch(cfqd);
3523	spin_unlock_irqrestore(q->queue_lock, flags);
3524	cfq_log(cfqd, "set_request fail");
3525	return 1;
3526}
3527
3528static void cfq_kick_queue(struct work_struct *work)
3529{
3530	struct cfq_data *cfqd =
3531		container_of(work, struct cfq_data, unplug_work);
3532	struct request_queue *q = cfqd->queue;
3533
3534	spin_lock_irq(q->queue_lock);
3535	__blk_run_queue(cfqd->queue);
3536	spin_unlock_irq(q->queue_lock);
3537}
3538
3539/*
3540 * Timer running if the active_queue is currently idling inside its time slice
3541 */
3542static void cfq_idle_slice_timer(unsigned long data)
3543{
3544	struct cfq_data *cfqd = (struct cfq_data *) data;
3545	struct cfq_queue *cfqq;
3546	unsigned long flags;
3547	int timed_out = 1;
3548
3549	cfq_log(cfqd, "idle timer fired");
3550
3551	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3552
3553	cfqq = cfqd->active_queue;
3554	if (cfqq) {
3555		timed_out = 0;
3556
3557		/*
3558		 * We saw a request before the queue expired, let it through
3559		 */
3560		if (cfq_cfqq_must_dispatch(cfqq))
3561			goto out_kick;
3562
3563		/*
3564		 * expired
3565		 */
3566		if (cfq_slice_used(cfqq))
3567			goto expire;
3568
3569		/*
3570		 * only expire and reinvoke request handler, if there are
3571		 * other queues with pending requests
3572		 */
3573		if (!cfqd->busy_queues)
3574			goto out_cont;
3575
3576		/*
3577		 * not expired and it has a request pending, let it dispatch
3578		 */
3579		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3580			goto out_kick;
3581
3582		/*
3583		 * The queue depth (deep) flag is reset only when idling didn't succeed
3584		 */
3585		cfq_clear_cfqq_deep(cfqq);
3586	}
3587expire:
3588	cfq_slice_expired(cfqd, timed_out);
3589out_kick:
3590	cfq_schedule_dispatch(cfqd);
3591out_cont:
3592	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3593}
3594
3595static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3596{
3597	del_timer_sync(&cfqd->idle_slice_timer);
3598	cancel_work_sync(&cfqd->unplug_work);
3599}
3600
3601static void cfq_put_async_queues(struct cfq_data *cfqd)
3602{
3603	int i;
3604
3605	for (i = 0; i < IOPRIO_BE_NR; i++) {
3606		if (cfqd->async_cfqq[0][i])
3607			cfq_put_queue(cfqd->async_cfqq[0][i]);
3608		if (cfqd->async_cfqq[1][i])
3609			cfq_put_queue(cfqd->async_cfqq[1][i]);
3610	}
3611
3612	if (cfqd->async_idle_cfqq)
3613		cfq_put_queue(cfqd->async_idle_cfqq);
3614}
3615
3616static void cfq_cfqd_free(struct rcu_head *head)
3617{
3618	kfree(container_of(head, struct cfq_data, rcu));
3619}
3620
3621static void cfq_exit_queue(struct elevator_queue *e)
3622{
3623	struct cfq_data *cfqd = e->elevator_data;
3624	struct request_queue *q = cfqd->queue;
3625
3626	cfq_shutdown_timer_wq(cfqd);
3627
3628	spin_lock_irq(q->queue_lock);
3629
3630	if (cfqd->active_queue)
3631		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3632
3633	while (!list_empty(&cfqd->cic_list)) {
3634		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3635							struct cfq_io_context,
3636							queue_list);
3637
3638		__cfq_exit_single_io_context(cfqd, cic);
3639	}
3640
3641	cfq_put_async_queues(cfqd);
3642	cfq_release_cfq_groups(cfqd);
3643	blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3644
3645	spin_unlock_irq(q->queue_lock);
3646
3647	cfq_shutdown_timer_wq(cfqd);
3648
3649	/* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3650	call_rcu(&cfqd->rcu, cfq_cfqd_free);
3651}
3652
3653static void *cfq_init_queue(struct request_queue *q)
3654{
3655	struct cfq_data *cfqd;
3656	int i, j;
3657	struct cfq_group *cfqg;
3658	struct cfq_rb_root *st;
3659
3660	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3661	if (!cfqd)
3662		return NULL;
3663
3664	/* Init root service tree */
3665	cfqd->grp_service_tree = CFQ_RB_ROOT;
3666
3667	/* Init root group */
3668	cfqg = &cfqd->root_group;
3669	for_each_cfqg_st(cfqg, i, j, st)
3670		*st = CFQ_RB_ROOT;
3671	RB_CLEAR_NODE(&cfqg->rb_node);
3672
3673	/* Give preference to root group over other groups */
3674	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3675
3676#ifdef CONFIG_CFQ_GROUP_IOSCHED
3677	/*
3678	 * Take a reference to root group which we never drop. This is just
3679	 * to make sure that cfq_put_cfqg() does not try to kfree root group
3680	 */
3681	atomic_set(&cfqg->ref, 1);
3682	blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3683					0);
3684#endif
3685	/*
3686	 * Not strictly needed (since RB_ROOT just clears the node and we
3687	 * zeroed cfqd on alloc), but better be safe in case someone decides
3688	 * to add magic to the rb code
3689	 */
3690	for (i = 0; i < CFQ_PRIO_LISTS; i++)
3691		cfqd->prio_trees[i] = RB_ROOT;
3692
3693	/*
3694	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3695	 * Grab a permanent reference to it, so that the normal code flow
3696	 * will not attempt to free it.
3697	 */
3698	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3699	atomic_inc(&cfqd->oom_cfqq.ref);
3700	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3701
3702	INIT_LIST_HEAD(&cfqd->cic_list);
3703
3704	cfqd->queue = q;
3705
3706	init_timer(&cfqd->idle_slice_timer);
3707	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3708	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3709
3710	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3711
3712	cfqd->cfq_quantum = cfq_quantum;
3713	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3714	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3715	cfqd->cfq_back_max = cfq_back_max;
3716	cfqd->cfq_back_penalty = cfq_back_penalty;
3717	cfqd->cfq_slice[0] = cfq_slice_async;
3718	cfqd->cfq_slice[1] = cfq_slice_sync;
3719	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3720	cfqd->cfq_slice_idle = cfq_slice_idle;
3721	cfqd->cfq_latency = 1;
3722	cfqd->cfq_group_isolation = 0;
3723	cfqd->hw_tag = -1;
3724	/*
3725	 * we optimistically start out assuming sync ops weren't delayed in the
3726	 * last second, in order to have a larger depth for async operations.
3727	 */
3728	cfqd->last_delayed_sync = jiffies - HZ;
3729	INIT_RCU_HEAD(&cfqd->rcu);
3730	return cfqd;
3731}
3732
3733static void cfq_slab_kill(void)
3734{
3735	/*
3736	 * Caller already ensured that pending RCU callbacks are completed,
3737	 * so we should have no busy allocations at this point.
3738	 */
3739	if (cfq_pool)
3740		kmem_cache_destroy(cfq_pool);
3741	if (cfq_ioc_pool)
3742		kmem_cache_destroy(cfq_ioc_pool);
3743}
3744
3745static int __init cfq_slab_setup(void)
3746{
3747	cfq_pool = KMEM_CACHE(cfq_queue, 0);
3748	if (!cfq_pool)
3749		goto fail;
3750
3751	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3752	if (!cfq_ioc_pool)
3753		goto fail;
3754
3755	return 0;
3756fail:
3757	cfq_slab_kill();
3758	return -ENOMEM;
3759}
3760
3761/*
3762 * sysfs parts below -->
3763 */
3764static ssize_t
3765cfq_var_show(unsigned int var, char *page)
3766{
3767	return sprintf(page, "%d\n", var);
3768}
3769
3770static ssize_t
3771cfq_var_store(unsigned int *var, const char *page, size_t count)
3772{
3773	char *p = (char *) page;
3774
3775	*var = simple_strtoul(p, &p, 10);
3776	return count;
3777}
3778
3779#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
3780static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
3781{									\
3782	struct cfq_data *cfqd = e->elevator_data;			\
3783	unsigned int __data = __VAR;					\
3784	if (__CONV)							\
3785		__data = jiffies_to_msecs(__data);			\
3786	return cfq_var_show(__data, (page));				\
3787}
3788SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3789SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3790SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3791SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3792SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3793SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3794SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3795SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3796SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3797SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3798SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
3799#undef SHOW_FUNCTION
3800
3801#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
3802static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
3803{									\
3804	struct cfq_data *cfqd = e->elevator_data;			\
3805	unsigned int __data;						\
3806	int ret = cfq_var_store(&__data, (page), count);		\
3807	if (__data < (MIN))						\
3808		__data = (MIN);						\
3809	else if (__data > (MAX))					\
3810		__data = (MAX);						\
3811	if (__CONV)							\
3812		*(__PTR) = msecs_to_jiffies(__data);			\
3813	else								\
3814		*(__PTR) = __data;					\
3815	return ret;							\
3816}
3817STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
3818STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
3819		UINT_MAX, 1);
3820STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
3821		UINT_MAX, 1);
3822STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3823STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3824		UINT_MAX, 0);
3825STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
3826STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3827STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3828STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
3829		UINT_MAX, 0);
3830STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3831STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
3832#undef STORE_FUNCTION
3833
3834#define CFQ_ATTR(name) \
3835	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
3836
3837static struct elv_fs_entry cfq_attrs[] = {
3838	CFQ_ATTR(quantum),
3839	CFQ_ATTR(fifo_expire_sync),
3840	CFQ_ATTR(fifo_expire_async),
3841	CFQ_ATTR(back_seek_max),
3842	CFQ_ATTR(back_seek_penalty),
3843	CFQ_ATTR(slice_sync),
3844	CFQ_ATTR(slice_async),
3845	CFQ_ATTR(slice_async_rq),
3846	CFQ_ATTR(slice_idle),
3847	CFQ_ATTR(low_latency),
3848	CFQ_ATTR(group_isolation),
3849	__ATTR_NULL
3850};
3851
3852static struct elevator_type iosched_cfq = {
3853	.ops = {
3854		.elevator_merge_fn = 		cfq_merge,
3855		.elevator_merged_fn =		cfq_merged_request,
3856		.elevator_merge_req_fn =	cfq_merged_requests,
3857		.elevator_allow_merge_fn =	cfq_allow_merge,
3858		.elevator_dispatch_fn =		cfq_dispatch_requests,
3859		.elevator_add_req_fn =		cfq_insert_request,
3860		.elevator_activate_req_fn =	cfq_activate_request,
3861		.elevator_deactivate_req_fn =	cfq_deactivate_request,
3862		.elevator_queue_empty_fn =	cfq_queue_empty,
3863		.elevator_completed_req_fn =	cfq_completed_request,
3864		.elevator_former_req_fn =	elv_rb_former_request,
3865		.elevator_latter_req_fn =	elv_rb_latter_request,
3866		.elevator_set_req_fn =		cfq_set_request,
3867		.elevator_put_req_fn =		cfq_put_request,
3868		.elevator_may_queue_fn =	cfq_may_queue,
3869		.elevator_init_fn =		cfq_init_queue,
3870		.elevator_exit_fn =		cfq_exit_queue,
3871		.trim =				cfq_free_io_context,
3872	},
3873	.elevator_attrs =	cfq_attrs,
3874	.elevator_name =	"cfq",
3875	.elevator_owner =	THIS_MODULE,
3876};
3877
3878#ifdef CONFIG_CFQ_GROUP_IOSCHED
3879static struct blkio_policy_type blkio_policy_cfq = {
3880	.ops = {
3881		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
3882		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
3883	},
3884};
3885#else
3886static struct blkio_policy_type blkio_policy_cfq;
3887#endif
3888
3889static int __init cfq_init(void)
3890{
3891	/*
3892	 * could be 0 on HZ < 1000 setups
3893	 */
3894	if (!cfq_slice_async)
3895		cfq_slice_async = 1;
3896	if (!cfq_slice_idle)
3897		cfq_slice_idle = 1;
3898
3899	if (cfq_slab_setup())
3900		return -ENOMEM;
3901
3902	elv_register(&iosched_cfq);
3903	blkio_policy_register(&blkio_policy_cfq);
3904
3905	return 0;
3906}
3907
3908static void __exit cfq_exit(void)
3909{
3910	DECLARE_COMPLETION_ONSTACK(all_gone);
3911	blkio_policy_unregister(&blkio_policy_cfq);
3912	elv_unregister(&iosched_cfq);
3913	ioc_gone = &all_gone;
3914	/* ioc_gone's update must be visible before reading ioc_count */
3915	smp_wmb();
3916
3917	/*
3918	 * this also protects us from entering cfq_slab_kill() with
3919	 * pending RCU callbacks
3920	 */
3921	if (elv_ioc_count_read(cfq_ioc_count))
3922		wait_for_completion(&all_gone);
3923	cfq_slab_kill();
3924}
3925
3926module_init(cfq_init);
3927module_exit(cfq_exit);
3928
3929MODULE_AUTHOR("Jens Axboe");
3930MODULE_LICENSE("GPL");
3931MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
3932