cfq-iosched.c revision ca32aefc7f2539ed88d42763330d54ee3e61769a
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/jiffies.h>
14#include <linux/rbtree.h>
15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h>
17#include "blk.h"
18#include "cfq.h"
19
20/*
21 * tunables
22 */
23/* max queue in one round of service */
24static const int cfq_quantum = 8;
25static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
26/* maximum backwards seek, in KiB */
27static const int cfq_back_max = 16 * 1024;
28/* penalty of a backwards seek */
29static const int cfq_back_penalty = 2;
30static const int cfq_slice_sync = HZ / 10;
31static int cfq_slice_async = HZ / 25;
32static const int cfq_slice_async_rq = 2;
33static int cfq_slice_idle = HZ / 125;
34static int cfq_group_idle = HZ / 125;
35static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
36static const int cfq_hist_divisor = 4;
37
38/*
39 * offset from end of service tree
40 */
41#define CFQ_IDLE_DELAY		(HZ / 5)
42
43/*
44 * below this threshold, we consider thinktime immediate
45 */
46#define CFQ_MIN_TT		(2)
47
48#define CFQ_SLICE_SCALE		(5)
49#define CFQ_HW_QUEUE_MIN	(5)
50#define CFQ_SERVICE_SHIFT       12
51
52#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
53#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
54#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
55#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
56
57#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
58#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
59#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
60
61static struct kmem_cache *cfq_pool;
62
63#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
64#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
65#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
66
67#define sample_valid(samples)	((samples) > 80)
68#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
69
70struct cfq_ttime {
71	unsigned long last_end_request;
72
73	unsigned long ttime_total;
74	unsigned long ttime_samples;
75	unsigned long ttime_mean;
76};
77
78/*
79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree
81 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
82 * move this into the elevator for the rq sorting as well.
83 */
84struct cfq_rb_root {
85	struct rb_root rb;
86	struct rb_node *left;
87	unsigned count;
88	unsigned total_weight;
89	u64 min_vdisktime;
90	struct cfq_ttime ttime;
91};
92#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
93			.ttime = {.last_end_request = jiffies,},}
94
95/*
96 * Per process-grouping structure
97 */
98struct cfq_queue {
99	/* reference count */
100	int ref;
101	/* various state flags, see below */
102	unsigned int flags;
103	/* parent cfq_data */
104	struct cfq_data *cfqd;
105	/* service_tree member */
106	struct rb_node rb_node;
107	/* service_tree key */
108	unsigned long rb_key;
109	/* prio tree member */
110	struct rb_node p_node;
111	/* prio tree root we belong to, if any */
112	struct rb_root *p_root;
113	/* sorted list of pending requests */
114	struct rb_root sort_list;
115	/* if fifo isn't expired, next request to serve */
116	struct request *next_rq;
117	/* requests queued in sort_list */
118	int queued[2];
119	/* currently allocated requests */
120	int allocated[2];
121	/* fifo list of requests in sort_list */
122	struct list_head fifo;
123
124	/* time when queue got scheduled in to dispatch first request. */
125	unsigned long dispatch_start;
126	unsigned int allocated_slice;
127	unsigned int slice_dispatch;
128	/* time when first request from queue completed and slice started. */
129	unsigned long slice_start;
130	unsigned long slice_end;
131	long slice_resid;
132
133	/* pending priority requests */
134	int prio_pending;
135	/* number of requests that are on the dispatch list or inside driver */
136	int dispatched;
137
138	/* io prio of this group */
139	unsigned short ioprio, org_ioprio;
140	unsigned short ioprio_class;
141
142	pid_t pid;
143
144	u32 seek_history;
145	sector_t last_request_pos;
146
147	struct cfq_rb_root *service_tree;
148	struct cfq_queue *new_cfqq;
149	struct cfq_group *cfqg;
150	/* Number of sectors dispatched from queue in single dispatch round */
151	unsigned long nr_sectors;
152};
153
154/*
155 * First index in the service_trees.
156 * IDLE is handled separately, so it has negative index
157 */
158enum wl_prio_t {
159	BE_WORKLOAD = 0,
160	RT_WORKLOAD = 1,
161	IDLE_WORKLOAD = 2,
162	CFQ_PRIO_NR,
163};
164
165/*
166 * Second index in the service_trees.
167 */
168enum wl_type_t {
169	ASYNC_WORKLOAD = 0,
170	SYNC_NOIDLE_WORKLOAD = 1,
171	SYNC_WORKLOAD = 2
172};
173
174/* This is per cgroup per device grouping structure */
175struct cfq_group {
176	/* group service_tree member */
177	struct rb_node rb_node;
178
179	/* group service_tree key */
180	u64 vdisktime;
181	unsigned int weight;
182	unsigned int new_weight;
183	bool needs_update;
184
185	/* number of cfqq currently on this group */
186	int nr_cfqq;
187
188	/*
189	 * Per group busy queues average. Useful for workload slice calc. We
190	 * create the array for each prio class but at run time it is used
191	 * only for RT and BE class and slot for IDLE class remains unused.
192	 * This is primarily done to avoid confusion and a gcc warning.
193	 */
194	unsigned int busy_queues_avg[CFQ_PRIO_NR];
195	/*
196	 * rr lists of queues with requests. We maintain service trees for
197	 * RT and BE classes. These trees are subdivided in subclasses
198	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
199	 * class there is no subclassification and all the cfq queues go on
200	 * a single tree service_tree_idle.
201	 * Counts are embedded in the cfq_rb_root
202	 */
203	struct cfq_rb_root service_trees[2][3];
204	struct cfq_rb_root service_tree_idle;
205
206	unsigned long saved_workload_slice;
207	enum wl_type_t saved_workload;
208	enum wl_prio_t saved_serving_prio;
209	struct blkio_group blkg;
210#ifdef CONFIG_CFQ_GROUP_IOSCHED
211	struct hlist_node cfqd_node;
212	int ref;
213#endif
214	/* number of requests that are on the dispatch list or inside driver */
215	int dispatched;
216	struct cfq_ttime ttime;
217};
218
219struct cfq_io_cq {
220	struct io_cq		icq;		/* must be the first member */
221	struct cfq_queue	*cfqq[2];
222	struct cfq_ttime	ttime;
223};
224
225/*
226 * Per block device queue structure
227 */
228struct cfq_data {
229	struct request_queue *queue;
230	/* Root service tree for cfq_groups */
231	struct cfq_rb_root grp_service_tree;
232	struct cfq_group root_group;
233
234	/*
235	 * The priority currently being served
236	 */
237	enum wl_prio_t serving_prio;
238	enum wl_type_t serving_type;
239	unsigned long workload_expires;
240	struct cfq_group *serving_group;
241
242	/*
243	 * Each priority tree is sorted by next_request position.  These
244	 * trees are used when determining if two or more queues are
245	 * interleaving requests (see cfq_close_cooperator).
246	 */
247	struct rb_root prio_trees[CFQ_PRIO_LISTS];
248
249	unsigned int busy_queues;
250	unsigned int busy_sync_queues;
251
252	int rq_in_driver;
253	int rq_in_flight[2];
254
255	/*
256	 * queue-depth detection
257	 */
258	int rq_queued;
259	int hw_tag;
260	/*
261	 * hw_tag can be
262	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
263	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
264	 *  0 => no NCQ
265	 */
266	int hw_tag_est_depth;
267	unsigned int hw_tag_samples;
268
269	/*
270	 * idle window management
271	 */
272	struct timer_list idle_slice_timer;
273	struct work_struct unplug_work;
274
275	struct cfq_queue *active_queue;
276	struct cfq_io_cq *active_cic;
277
278	/*
279	 * async queue for each priority case
280	 */
281	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
282	struct cfq_queue *async_idle_cfqq;
283
284	sector_t last_position;
285
286	/*
287	 * tunables, see top of file
288	 */
289	unsigned int cfq_quantum;
290	unsigned int cfq_fifo_expire[2];
291	unsigned int cfq_back_penalty;
292	unsigned int cfq_back_max;
293	unsigned int cfq_slice[2];
294	unsigned int cfq_slice_async_rq;
295	unsigned int cfq_slice_idle;
296	unsigned int cfq_group_idle;
297	unsigned int cfq_latency;
298
299	/*
300	 * Fallback dummy cfqq for extreme OOM conditions
301	 */
302	struct cfq_queue oom_cfqq;
303
304	unsigned long last_delayed_sync;
305
306	/* List of cfq groups being managed on this device*/
307	struct hlist_head cfqg_list;
308
309	/* Number of groups which are on blkcg->blkg_list */
310	unsigned int nr_blkcg_linked_grps;
311};
312
313static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
314
315static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
316					    enum wl_prio_t prio,
317					    enum wl_type_t type)
318{
319	if (!cfqg)
320		return NULL;
321
322	if (prio == IDLE_WORKLOAD)
323		return &cfqg->service_tree_idle;
324
325	return &cfqg->service_trees[prio][type];
326}
327
328enum cfqq_state_flags {
329	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
330	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
331	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
332	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
333	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
334	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
335	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
336	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
337	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
338	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
339	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
340	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
341	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
342};
343
344#define CFQ_CFQQ_FNS(name)						\
345static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
346{									\
347	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
348}									\
349static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
350{									\
351	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
352}									\
353static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
354{									\
355	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
356}
357
358CFQ_CFQQ_FNS(on_rr);
359CFQ_CFQQ_FNS(wait_request);
360CFQ_CFQQ_FNS(must_dispatch);
361CFQ_CFQQ_FNS(must_alloc_slice);
362CFQ_CFQQ_FNS(fifo_expire);
363CFQ_CFQQ_FNS(idle_window);
364CFQ_CFQQ_FNS(prio_changed);
365CFQ_CFQQ_FNS(slice_new);
366CFQ_CFQQ_FNS(sync);
367CFQ_CFQQ_FNS(coop);
368CFQ_CFQQ_FNS(split_coop);
369CFQ_CFQQ_FNS(deep);
370CFQ_CFQQ_FNS(wait_busy);
371#undef CFQ_CFQQ_FNS
372
373#ifdef CONFIG_CFQ_GROUP_IOSCHED
374#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
375	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
376			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
377			blkg_path(&(cfqq)->cfqg->blkg), ##args)
378
379#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
380	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
381				blkg_path(&(cfqg)->blkg), ##args)       \
382
383#else
384#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
385	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
386#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
387#endif
388#define cfq_log(cfqd, fmt, args...)	\
389	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
390
391/* Traverses through cfq group service trees */
392#define for_each_cfqg_st(cfqg, i, j, st) \
393	for (i = 0; i <= IDLE_WORKLOAD; i++) \
394		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
395			: &cfqg->service_tree_idle; \
396			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
397			(i == IDLE_WORKLOAD && j == 0); \
398			j++, st = i < IDLE_WORKLOAD ? \
399			&cfqg->service_trees[i][j]: NULL) \
400
401static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
402	struct cfq_ttime *ttime, bool group_idle)
403{
404	unsigned long slice;
405	if (!sample_valid(ttime->ttime_samples))
406		return false;
407	if (group_idle)
408		slice = cfqd->cfq_group_idle;
409	else
410		slice = cfqd->cfq_slice_idle;
411	return ttime->ttime_mean > slice;
412}
413
414static inline bool iops_mode(struct cfq_data *cfqd)
415{
416	/*
417	 * If we are not idling on queues and it is a NCQ drive, parallel
418	 * execution of requests is on and measuring time is not possible
419	 * in most of the cases until and unless we drive shallower queue
420	 * depths and that becomes a performance bottleneck. In such cases
421	 * switch to start providing fairness in terms of number of IOs.
422	 */
423	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
424		return true;
425	else
426		return false;
427}
428
429static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
430{
431	if (cfq_class_idle(cfqq))
432		return IDLE_WORKLOAD;
433	if (cfq_class_rt(cfqq))
434		return RT_WORKLOAD;
435	return BE_WORKLOAD;
436}
437
438
439static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
440{
441	if (!cfq_cfqq_sync(cfqq))
442		return ASYNC_WORKLOAD;
443	if (!cfq_cfqq_idle_window(cfqq))
444		return SYNC_NOIDLE_WORKLOAD;
445	return SYNC_WORKLOAD;
446}
447
448static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
449					struct cfq_data *cfqd,
450					struct cfq_group *cfqg)
451{
452	if (wl == IDLE_WORKLOAD)
453		return cfqg->service_tree_idle.count;
454
455	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
456		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
457		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
458}
459
460static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
461					struct cfq_group *cfqg)
462{
463	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
464		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
465}
466
467static void cfq_dispatch_insert(struct request_queue *, struct request *);
468static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
469				       struct io_context *, gfp_t);
470
471static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
472{
473	/* cic->icq is the first member, %NULL will convert to %NULL */
474	return container_of(icq, struct cfq_io_cq, icq);
475}
476
477static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
478					       struct io_context *ioc)
479{
480	if (ioc)
481		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
482	return NULL;
483}
484
485static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
486{
487	return cic->cfqq[is_sync];
488}
489
490static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
491				bool is_sync)
492{
493	cic->cfqq[is_sync] = cfqq;
494}
495
496static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
497{
498	return cic->icq.q->elevator->elevator_data;
499}
500
501/*
502 * We regard a request as SYNC, if it's either a read or has the SYNC bit
503 * set (in which case it could also be direct WRITE).
504 */
505static inline bool cfq_bio_sync(struct bio *bio)
506{
507	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
508}
509
510/*
511 * scheduler run of queue, if there are requests pending and no one in the
512 * driver that will restart queueing
513 */
514static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
515{
516	if (cfqd->busy_queues) {
517		cfq_log(cfqd, "schedule dispatch");
518		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
519	}
520}
521
522/*
523 * Scale schedule slice based on io priority. Use the sync time slice only
524 * if a queue is marked sync and has sync io queued. A sync queue with async
525 * io only, should not get full sync slice length.
526 */
527static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
528				 unsigned short prio)
529{
530	const int base_slice = cfqd->cfq_slice[sync];
531
532	WARN_ON(prio >= IOPRIO_BE_NR);
533
534	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
535}
536
537static inline int
538cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
539{
540	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
541}
542
543static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
544{
545	u64 d = delta << CFQ_SERVICE_SHIFT;
546
547	d = d * BLKIO_WEIGHT_DEFAULT;
548	do_div(d, cfqg->weight);
549	return d;
550}
551
552static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
553{
554	s64 delta = (s64)(vdisktime - min_vdisktime);
555	if (delta > 0)
556		min_vdisktime = vdisktime;
557
558	return min_vdisktime;
559}
560
561static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
562{
563	s64 delta = (s64)(vdisktime - min_vdisktime);
564	if (delta < 0)
565		min_vdisktime = vdisktime;
566
567	return min_vdisktime;
568}
569
570static void update_min_vdisktime(struct cfq_rb_root *st)
571{
572	struct cfq_group *cfqg;
573
574	if (st->left) {
575		cfqg = rb_entry_cfqg(st->left);
576		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
577						  cfqg->vdisktime);
578	}
579}
580
581/*
582 * get averaged number of queues of RT/BE priority.
583 * average is updated, with a formula that gives more weight to higher numbers,
584 * to quickly follows sudden increases and decrease slowly
585 */
586
587static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
588					struct cfq_group *cfqg, bool rt)
589{
590	unsigned min_q, max_q;
591	unsigned mult  = cfq_hist_divisor - 1;
592	unsigned round = cfq_hist_divisor / 2;
593	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
594
595	min_q = min(cfqg->busy_queues_avg[rt], busy);
596	max_q = max(cfqg->busy_queues_avg[rt], busy);
597	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
598		cfq_hist_divisor;
599	return cfqg->busy_queues_avg[rt];
600}
601
602static inline unsigned
603cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
604{
605	struct cfq_rb_root *st = &cfqd->grp_service_tree;
606
607	return cfq_target_latency * cfqg->weight / st->total_weight;
608}
609
610static inline unsigned
611cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
612{
613	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
614	if (cfqd->cfq_latency) {
615		/*
616		 * interested queues (we consider only the ones with the same
617		 * priority class in the cfq group)
618		 */
619		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
620						cfq_class_rt(cfqq));
621		unsigned sync_slice = cfqd->cfq_slice[1];
622		unsigned expect_latency = sync_slice * iq;
623		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
624
625		if (expect_latency > group_slice) {
626			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
627			/* scale low_slice according to IO priority
628			 * and sync vs async */
629			unsigned low_slice =
630				min(slice, base_low_slice * slice / sync_slice);
631			/* the adapted slice value is scaled to fit all iqs
632			 * into the target latency */
633			slice = max(slice * group_slice / expect_latency,
634				    low_slice);
635		}
636	}
637	return slice;
638}
639
640static inline void
641cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
642{
643	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
644
645	cfqq->slice_start = jiffies;
646	cfqq->slice_end = jiffies + slice;
647	cfqq->allocated_slice = slice;
648	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
649}
650
651/*
652 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
653 * isn't valid until the first request from the dispatch is activated
654 * and the slice time set.
655 */
656static inline bool cfq_slice_used(struct cfq_queue *cfqq)
657{
658	if (cfq_cfqq_slice_new(cfqq))
659		return false;
660	if (time_before(jiffies, cfqq->slice_end))
661		return false;
662
663	return true;
664}
665
666/*
667 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
668 * We choose the request that is closest to the head right now. Distance
669 * behind the head is penalized and only allowed to a certain extent.
670 */
671static struct request *
672cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
673{
674	sector_t s1, s2, d1 = 0, d2 = 0;
675	unsigned long back_max;
676#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
677#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
678	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
679
680	if (rq1 == NULL || rq1 == rq2)
681		return rq2;
682	if (rq2 == NULL)
683		return rq1;
684
685	if (rq_is_sync(rq1) != rq_is_sync(rq2))
686		return rq_is_sync(rq1) ? rq1 : rq2;
687
688	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
689		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
690
691	s1 = blk_rq_pos(rq1);
692	s2 = blk_rq_pos(rq2);
693
694	/*
695	 * by definition, 1KiB is 2 sectors
696	 */
697	back_max = cfqd->cfq_back_max * 2;
698
699	/*
700	 * Strict one way elevator _except_ in the case where we allow
701	 * short backward seeks which are biased as twice the cost of a
702	 * similar forward seek.
703	 */
704	if (s1 >= last)
705		d1 = s1 - last;
706	else if (s1 + back_max >= last)
707		d1 = (last - s1) * cfqd->cfq_back_penalty;
708	else
709		wrap |= CFQ_RQ1_WRAP;
710
711	if (s2 >= last)
712		d2 = s2 - last;
713	else if (s2 + back_max >= last)
714		d2 = (last - s2) * cfqd->cfq_back_penalty;
715	else
716		wrap |= CFQ_RQ2_WRAP;
717
718	/* Found required data */
719
720	/*
721	 * By doing switch() on the bit mask "wrap" we avoid having to
722	 * check two variables for all permutations: --> faster!
723	 */
724	switch (wrap) {
725	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
726		if (d1 < d2)
727			return rq1;
728		else if (d2 < d1)
729			return rq2;
730		else {
731			if (s1 >= s2)
732				return rq1;
733			else
734				return rq2;
735		}
736
737	case CFQ_RQ2_WRAP:
738		return rq1;
739	case CFQ_RQ1_WRAP:
740		return rq2;
741	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
742	default:
743		/*
744		 * Since both rqs are wrapped,
745		 * start with the one that's further behind head
746		 * (--> only *one* back seek required),
747		 * since back seek takes more time than forward.
748		 */
749		if (s1 <= s2)
750			return rq1;
751		else
752			return rq2;
753	}
754}
755
756/*
757 * The below is leftmost cache rbtree addon
758 */
759static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
760{
761	/* Service tree is empty */
762	if (!root->count)
763		return NULL;
764
765	if (!root->left)
766		root->left = rb_first(&root->rb);
767
768	if (root->left)
769		return rb_entry(root->left, struct cfq_queue, rb_node);
770
771	return NULL;
772}
773
774static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
775{
776	if (!root->left)
777		root->left = rb_first(&root->rb);
778
779	if (root->left)
780		return rb_entry_cfqg(root->left);
781
782	return NULL;
783}
784
785static void rb_erase_init(struct rb_node *n, struct rb_root *root)
786{
787	rb_erase(n, root);
788	RB_CLEAR_NODE(n);
789}
790
791static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
792{
793	if (root->left == n)
794		root->left = NULL;
795	rb_erase_init(n, &root->rb);
796	--root->count;
797}
798
799/*
800 * would be nice to take fifo expire time into account as well
801 */
802static struct request *
803cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
804		  struct request *last)
805{
806	struct rb_node *rbnext = rb_next(&last->rb_node);
807	struct rb_node *rbprev = rb_prev(&last->rb_node);
808	struct request *next = NULL, *prev = NULL;
809
810	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
811
812	if (rbprev)
813		prev = rb_entry_rq(rbprev);
814
815	if (rbnext)
816		next = rb_entry_rq(rbnext);
817	else {
818		rbnext = rb_first(&cfqq->sort_list);
819		if (rbnext && rbnext != &last->rb_node)
820			next = rb_entry_rq(rbnext);
821	}
822
823	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
824}
825
826static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
827				      struct cfq_queue *cfqq)
828{
829	/*
830	 * just an approximation, should be ok.
831	 */
832	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
833		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
834}
835
836static inline s64
837cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
838{
839	return cfqg->vdisktime - st->min_vdisktime;
840}
841
842static void
843__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
844{
845	struct rb_node **node = &st->rb.rb_node;
846	struct rb_node *parent = NULL;
847	struct cfq_group *__cfqg;
848	s64 key = cfqg_key(st, cfqg);
849	int left = 1;
850
851	while (*node != NULL) {
852		parent = *node;
853		__cfqg = rb_entry_cfqg(parent);
854
855		if (key < cfqg_key(st, __cfqg))
856			node = &parent->rb_left;
857		else {
858			node = &parent->rb_right;
859			left = 0;
860		}
861	}
862
863	if (left)
864		st->left = &cfqg->rb_node;
865
866	rb_link_node(&cfqg->rb_node, parent, node);
867	rb_insert_color(&cfqg->rb_node, &st->rb);
868}
869
870static void
871cfq_update_group_weight(struct cfq_group *cfqg)
872{
873	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
874	if (cfqg->needs_update) {
875		cfqg->weight = cfqg->new_weight;
876		cfqg->needs_update = false;
877	}
878}
879
880static void
881cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
882{
883	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
884
885	cfq_update_group_weight(cfqg);
886	__cfq_group_service_tree_add(st, cfqg);
887	st->total_weight += cfqg->weight;
888}
889
890static void
891cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
892{
893	struct cfq_rb_root *st = &cfqd->grp_service_tree;
894	struct cfq_group *__cfqg;
895	struct rb_node *n;
896
897	cfqg->nr_cfqq++;
898	if (!RB_EMPTY_NODE(&cfqg->rb_node))
899		return;
900
901	/*
902	 * Currently put the group at the end. Later implement something
903	 * so that groups get lesser vtime based on their weights, so that
904	 * if group does not loose all if it was not continuously backlogged.
905	 */
906	n = rb_last(&st->rb);
907	if (n) {
908		__cfqg = rb_entry_cfqg(n);
909		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
910	} else
911		cfqg->vdisktime = st->min_vdisktime;
912	cfq_group_service_tree_add(st, cfqg);
913}
914
915static void
916cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
917{
918	st->total_weight -= cfqg->weight;
919	if (!RB_EMPTY_NODE(&cfqg->rb_node))
920		cfq_rb_erase(&cfqg->rb_node, st);
921}
922
923static void
924cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
925{
926	struct cfq_rb_root *st = &cfqd->grp_service_tree;
927
928	BUG_ON(cfqg->nr_cfqq < 1);
929	cfqg->nr_cfqq--;
930
931	/* If there are other cfq queues under this group, don't delete it */
932	if (cfqg->nr_cfqq)
933		return;
934
935	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
936	cfq_group_service_tree_del(st, cfqg);
937	cfqg->saved_workload_slice = 0;
938	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
939}
940
941static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
942						unsigned int *unaccounted_time)
943{
944	unsigned int slice_used;
945
946	/*
947	 * Queue got expired before even a single request completed or
948	 * got expired immediately after first request completion.
949	 */
950	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
951		/*
952		 * Also charge the seek time incurred to the group, otherwise
953		 * if there are mutiple queues in the group, each can dispatch
954		 * a single request on seeky media and cause lots of seek time
955		 * and group will never know it.
956		 */
957		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
958					1);
959	} else {
960		slice_used = jiffies - cfqq->slice_start;
961		if (slice_used > cfqq->allocated_slice) {
962			*unaccounted_time = slice_used - cfqq->allocated_slice;
963			slice_used = cfqq->allocated_slice;
964		}
965		if (time_after(cfqq->slice_start, cfqq->dispatch_start))
966			*unaccounted_time += cfqq->slice_start -
967					cfqq->dispatch_start;
968	}
969
970	return slice_used;
971}
972
973static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
974				struct cfq_queue *cfqq)
975{
976	struct cfq_rb_root *st = &cfqd->grp_service_tree;
977	unsigned int used_sl, charge, unaccounted_sl = 0;
978	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
979			- cfqg->service_tree_idle.count;
980
981	BUG_ON(nr_sync < 0);
982	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
983
984	if (iops_mode(cfqd))
985		charge = cfqq->slice_dispatch;
986	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
987		charge = cfqq->allocated_slice;
988
989	/* Can't update vdisktime while group is on service tree */
990	cfq_group_service_tree_del(st, cfqg);
991	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
992	/* If a new weight was requested, update now, off tree */
993	cfq_group_service_tree_add(st, cfqg);
994
995	/* This group is being expired. Save the context */
996	if (time_after(cfqd->workload_expires, jiffies)) {
997		cfqg->saved_workload_slice = cfqd->workload_expires
998						- jiffies;
999		cfqg->saved_workload = cfqd->serving_type;
1000		cfqg->saved_serving_prio = cfqd->serving_prio;
1001	} else
1002		cfqg->saved_workload_slice = 0;
1003
1004	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1005					st->min_vdisktime);
1006	cfq_log_cfqq(cfqq->cfqd, cfqq,
1007		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1008		     used_sl, cfqq->slice_dispatch, charge,
1009		     iops_mode(cfqd), cfqq->nr_sectors);
1010	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
1011					  unaccounted_sl);
1012	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
1013}
1014
1015#ifdef CONFIG_CFQ_GROUP_IOSCHED
1016static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
1017{
1018	if (blkg)
1019		return container_of(blkg, struct cfq_group, blkg);
1020	return NULL;
1021}
1022
1023static void cfq_update_blkio_group_weight(struct request_queue *q,
1024					  struct blkio_group *blkg,
1025					  unsigned int weight)
1026{
1027	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1028	cfqg->new_weight = weight;
1029	cfqg->needs_update = true;
1030}
1031
1032static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1033			struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1034{
1035	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1036	unsigned int major, minor;
1037
1038	/*
1039	 * Add group onto cgroup list. It might happen that bdi->dev is
1040	 * not initialized yet. Initialize this new group without major
1041	 * and minor info and this info will be filled in once a new thread
1042	 * comes for IO.
1043	 */
1044	if (bdi->dev) {
1045		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1046		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1047					cfqd->queue, MKDEV(major, minor));
1048	} else
1049		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1050					cfqd->queue, 0);
1051
1052	cfqd->nr_blkcg_linked_grps++;
1053	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1054
1055	/* Add group on cfqd list */
1056	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1057}
1058
1059/*
1060 * Should be called from sleepable context. No request queue lock as per
1061 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1062 * from sleepable context.
1063 */
1064static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1065{
1066	struct cfq_group *cfqg = NULL;
1067	int i, j, ret;
1068	struct cfq_rb_root *st;
1069
1070	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1071	if (!cfqg)
1072		return NULL;
1073
1074	for_each_cfqg_st(cfqg, i, j, st)
1075		*st = CFQ_RB_ROOT;
1076	RB_CLEAR_NODE(&cfqg->rb_node);
1077
1078	cfqg->ttime.last_end_request = jiffies;
1079
1080	/*
1081	 * Take the initial reference that will be released on destroy
1082	 * This can be thought of a joint reference by cgroup and
1083	 * elevator which will be dropped by either elevator exit
1084	 * or cgroup deletion path depending on who is exiting first.
1085	 */
1086	cfqg->ref = 1;
1087
1088	ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1089	if (ret) {
1090		kfree(cfqg);
1091		return NULL;
1092	}
1093
1094	return cfqg;
1095}
1096
1097static struct cfq_group *
1098cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1099{
1100	struct cfq_group *cfqg = NULL;
1101	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1102	unsigned int major, minor;
1103
1104	/*
1105	 * This is the common case when there are no blkio cgroups.
1106	 * Avoid lookup in this case
1107	 */
1108	if (blkcg == &blkio_root_cgroup)
1109		cfqg = &cfqd->root_group;
1110	else
1111		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, cfqd->queue,
1112							 BLKIO_POLICY_PROP));
1113
1114	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1115		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1116		cfqg->blkg.dev = MKDEV(major, minor);
1117	}
1118
1119	return cfqg;
1120}
1121
1122/*
1123 * Search for the cfq group current task belongs to. request_queue lock must
1124 * be held.
1125 */
1126static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
1127				      struct blkio_cgroup *blkcg)
1128{
1129	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1130	struct request_queue *q = cfqd->queue;
1131
1132	cfqg = cfq_find_cfqg(cfqd, blkcg);
1133	if (cfqg)
1134		return cfqg;
1135
1136	if (!css_tryget(&blkcg->css))
1137		return NULL;
1138
1139	/*
1140	 * Need to allocate a group. Allocation of group also needs allocation
1141	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1142	 * we need to drop rcu lock and queue_lock before we call alloc.
1143	 *
1144	 * Not taking any queue reference here and assuming that queue is
1145	 * around by the time we return. CFQ queue allocation code does
1146	 * the same. It might be racy though.
1147	 */
1148	rcu_read_unlock();
1149	spin_unlock_irq(q->queue_lock);
1150
1151	cfqg = cfq_alloc_cfqg(cfqd);
1152
1153	spin_lock_irq(q->queue_lock);
1154	rcu_read_lock();
1155	css_put(&blkcg->css);
1156
1157	/*
1158	 * If some other thread already allocated the group while we were
1159	 * not holding queue lock, free up the group
1160	 */
1161	__cfqg = cfq_find_cfqg(cfqd, blkcg);
1162
1163	if (__cfqg) {
1164		kfree(cfqg);
1165		return __cfqg;
1166	}
1167
1168	if (!cfqg)
1169		cfqg = &cfqd->root_group;
1170
1171	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1172	return cfqg;
1173}
1174
1175static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1176{
1177	cfqg->ref++;
1178	return cfqg;
1179}
1180
1181static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1182{
1183	/* Currently, all async queues are mapped to root group */
1184	if (!cfq_cfqq_sync(cfqq))
1185		cfqg = &cfqq->cfqd->root_group;
1186
1187	cfqq->cfqg = cfqg;
1188	/* cfqq reference on cfqg */
1189	cfqq->cfqg->ref++;
1190}
1191
1192static void cfq_put_cfqg(struct cfq_group *cfqg)
1193{
1194	struct cfq_rb_root *st;
1195	int i, j;
1196
1197	BUG_ON(cfqg->ref <= 0);
1198	cfqg->ref--;
1199	if (cfqg->ref)
1200		return;
1201	for_each_cfqg_st(cfqg, i, j, st)
1202		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1203	free_percpu(cfqg->blkg.stats_cpu);
1204	kfree(cfqg);
1205}
1206
1207static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1208{
1209	/* Something wrong if we are trying to remove same group twice */
1210	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1211
1212	hlist_del_init(&cfqg->cfqd_node);
1213
1214	BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
1215	cfqd->nr_blkcg_linked_grps--;
1216
1217	/*
1218	 * Put the reference taken at the time of creation so that when all
1219	 * queues are gone, group can be destroyed.
1220	 */
1221	cfq_put_cfqg(cfqg);
1222}
1223
1224static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
1225{
1226	struct hlist_node *pos, *n;
1227	struct cfq_group *cfqg;
1228	bool empty = true;
1229
1230	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1231		/*
1232		 * If cgroup removal path got to blk_group first and removed
1233		 * it from cgroup list, then it will take care of destroying
1234		 * cfqg also.
1235		 */
1236		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1237			cfq_destroy_cfqg(cfqd, cfqg);
1238		else
1239			empty = false;
1240	}
1241	return empty;
1242}
1243
1244/*
1245 * Blk cgroup controller notification saying that blkio_group object is being
1246 * delinked as associated cgroup object is going away. That also means that
1247 * no new IO will come in this group. So get rid of this group as soon as
1248 * any pending IO in the group is finished.
1249 *
1250 * This function is called under rcu_read_lock(). key is the rcu protected
1251 * pointer. That means @q is a valid request_queue pointer as long as we
1252 * are rcu read lock.
1253 *
1254 * @q was fetched from blkio_group under blkio_cgroup->lock. That means
1255 * it should not be NULL as even if elevator was exiting, cgroup deltion
1256 * path got to it first.
1257 */
1258static void cfq_unlink_blkio_group(struct request_queue *q,
1259				   struct blkio_group *blkg)
1260{
1261	struct cfq_data *cfqd = q->elevator->elevator_data;
1262	unsigned long flags;
1263
1264	spin_lock_irqsave(q->queue_lock, flags);
1265	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1266	spin_unlock_irqrestore(q->queue_lock, flags);
1267}
1268
1269static struct elevator_type iosched_cfq;
1270
1271static bool cfq_clear_queue(struct request_queue *q)
1272{
1273	lockdep_assert_held(q->queue_lock);
1274
1275	/* shoot down blkgs iff the current elevator is cfq */
1276	if (!q->elevator || q->elevator->type != &iosched_cfq)
1277		return true;
1278
1279	return cfq_release_cfq_groups(q->elevator->elevator_data);
1280}
1281
1282#else /* GROUP_IOSCHED */
1283static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd,
1284				      struct blkio_cgroup *blkcg)
1285{
1286	return &cfqd->root_group;
1287}
1288
1289static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1290{
1291	return cfqg;
1292}
1293
1294static inline void
1295cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1296	cfqq->cfqg = cfqg;
1297}
1298
1299static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1300static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1301
1302#endif /* GROUP_IOSCHED */
1303
1304/*
1305 * The cfqd->service_trees holds all pending cfq_queue's that have
1306 * requests waiting to be processed. It is sorted in the order that
1307 * we will service the queues.
1308 */
1309static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1310				 bool add_front)
1311{
1312	struct rb_node **p, *parent;
1313	struct cfq_queue *__cfqq;
1314	unsigned long rb_key;
1315	struct cfq_rb_root *service_tree;
1316	int left;
1317	int new_cfqq = 1;
1318
1319	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1320						cfqq_type(cfqq));
1321	if (cfq_class_idle(cfqq)) {
1322		rb_key = CFQ_IDLE_DELAY;
1323		parent = rb_last(&service_tree->rb);
1324		if (parent && parent != &cfqq->rb_node) {
1325			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1326			rb_key += __cfqq->rb_key;
1327		} else
1328			rb_key += jiffies;
1329	} else if (!add_front) {
1330		/*
1331		 * Get our rb key offset. Subtract any residual slice
1332		 * value carried from last service. A negative resid
1333		 * count indicates slice overrun, and this should position
1334		 * the next service time further away in the tree.
1335		 */
1336		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1337		rb_key -= cfqq->slice_resid;
1338		cfqq->slice_resid = 0;
1339	} else {
1340		rb_key = -HZ;
1341		__cfqq = cfq_rb_first(service_tree);
1342		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1343	}
1344
1345	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1346		new_cfqq = 0;
1347		/*
1348		 * same position, nothing more to do
1349		 */
1350		if (rb_key == cfqq->rb_key &&
1351		    cfqq->service_tree == service_tree)
1352			return;
1353
1354		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1355		cfqq->service_tree = NULL;
1356	}
1357
1358	left = 1;
1359	parent = NULL;
1360	cfqq->service_tree = service_tree;
1361	p = &service_tree->rb.rb_node;
1362	while (*p) {
1363		struct rb_node **n;
1364
1365		parent = *p;
1366		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1367
1368		/*
1369		 * sort by key, that represents service time.
1370		 */
1371		if (time_before(rb_key, __cfqq->rb_key))
1372			n = &(*p)->rb_left;
1373		else {
1374			n = &(*p)->rb_right;
1375			left = 0;
1376		}
1377
1378		p = n;
1379	}
1380
1381	if (left)
1382		service_tree->left = &cfqq->rb_node;
1383
1384	cfqq->rb_key = rb_key;
1385	rb_link_node(&cfqq->rb_node, parent, p);
1386	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1387	service_tree->count++;
1388	if (add_front || !new_cfqq)
1389		return;
1390	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1391}
1392
1393static struct cfq_queue *
1394cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1395		     sector_t sector, struct rb_node **ret_parent,
1396		     struct rb_node ***rb_link)
1397{
1398	struct rb_node **p, *parent;
1399	struct cfq_queue *cfqq = NULL;
1400
1401	parent = NULL;
1402	p = &root->rb_node;
1403	while (*p) {
1404		struct rb_node **n;
1405
1406		parent = *p;
1407		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1408
1409		/*
1410		 * Sort strictly based on sector.  Smallest to the left,
1411		 * largest to the right.
1412		 */
1413		if (sector > blk_rq_pos(cfqq->next_rq))
1414			n = &(*p)->rb_right;
1415		else if (sector < blk_rq_pos(cfqq->next_rq))
1416			n = &(*p)->rb_left;
1417		else
1418			break;
1419		p = n;
1420		cfqq = NULL;
1421	}
1422
1423	*ret_parent = parent;
1424	if (rb_link)
1425		*rb_link = p;
1426	return cfqq;
1427}
1428
1429static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1430{
1431	struct rb_node **p, *parent;
1432	struct cfq_queue *__cfqq;
1433
1434	if (cfqq->p_root) {
1435		rb_erase(&cfqq->p_node, cfqq->p_root);
1436		cfqq->p_root = NULL;
1437	}
1438
1439	if (cfq_class_idle(cfqq))
1440		return;
1441	if (!cfqq->next_rq)
1442		return;
1443
1444	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1445	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1446				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1447	if (!__cfqq) {
1448		rb_link_node(&cfqq->p_node, parent, p);
1449		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1450	} else
1451		cfqq->p_root = NULL;
1452}
1453
1454/*
1455 * Update cfqq's position in the service tree.
1456 */
1457static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1458{
1459	/*
1460	 * Resorting requires the cfqq to be on the RR list already.
1461	 */
1462	if (cfq_cfqq_on_rr(cfqq)) {
1463		cfq_service_tree_add(cfqd, cfqq, 0);
1464		cfq_prio_tree_add(cfqd, cfqq);
1465	}
1466}
1467
1468/*
1469 * add to busy list of queues for service, trying to be fair in ordering
1470 * the pending list according to last request service
1471 */
1472static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1473{
1474	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1475	BUG_ON(cfq_cfqq_on_rr(cfqq));
1476	cfq_mark_cfqq_on_rr(cfqq);
1477	cfqd->busy_queues++;
1478	if (cfq_cfqq_sync(cfqq))
1479		cfqd->busy_sync_queues++;
1480
1481	cfq_resort_rr_list(cfqd, cfqq);
1482}
1483
1484/*
1485 * Called when the cfqq no longer has requests pending, remove it from
1486 * the service tree.
1487 */
1488static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1489{
1490	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1491	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1492	cfq_clear_cfqq_on_rr(cfqq);
1493
1494	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1495		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1496		cfqq->service_tree = NULL;
1497	}
1498	if (cfqq->p_root) {
1499		rb_erase(&cfqq->p_node, cfqq->p_root);
1500		cfqq->p_root = NULL;
1501	}
1502
1503	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1504	BUG_ON(!cfqd->busy_queues);
1505	cfqd->busy_queues--;
1506	if (cfq_cfqq_sync(cfqq))
1507		cfqd->busy_sync_queues--;
1508}
1509
1510/*
1511 * rb tree support functions
1512 */
1513static void cfq_del_rq_rb(struct request *rq)
1514{
1515	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1516	const int sync = rq_is_sync(rq);
1517
1518	BUG_ON(!cfqq->queued[sync]);
1519	cfqq->queued[sync]--;
1520
1521	elv_rb_del(&cfqq->sort_list, rq);
1522
1523	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1524		/*
1525		 * Queue will be deleted from service tree when we actually
1526		 * expire it later. Right now just remove it from prio tree
1527		 * as it is empty.
1528		 */
1529		if (cfqq->p_root) {
1530			rb_erase(&cfqq->p_node, cfqq->p_root);
1531			cfqq->p_root = NULL;
1532		}
1533	}
1534}
1535
1536static void cfq_add_rq_rb(struct request *rq)
1537{
1538	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1539	struct cfq_data *cfqd = cfqq->cfqd;
1540	struct request *prev;
1541
1542	cfqq->queued[rq_is_sync(rq)]++;
1543
1544	elv_rb_add(&cfqq->sort_list, rq);
1545
1546	if (!cfq_cfqq_on_rr(cfqq))
1547		cfq_add_cfqq_rr(cfqd, cfqq);
1548
1549	/*
1550	 * check if this request is a better next-serve candidate
1551	 */
1552	prev = cfqq->next_rq;
1553	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1554
1555	/*
1556	 * adjust priority tree position, if ->next_rq changes
1557	 */
1558	if (prev != cfqq->next_rq)
1559		cfq_prio_tree_add(cfqd, cfqq);
1560
1561	BUG_ON(!cfqq->next_rq);
1562}
1563
1564static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1565{
1566	elv_rb_del(&cfqq->sort_list, rq);
1567	cfqq->queued[rq_is_sync(rq)]--;
1568	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1569					rq_data_dir(rq), rq_is_sync(rq));
1570	cfq_add_rq_rb(rq);
1571	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1572			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1573			rq_is_sync(rq));
1574}
1575
1576static struct request *
1577cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1578{
1579	struct task_struct *tsk = current;
1580	struct cfq_io_cq *cic;
1581	struct cfq_queue *cfqq;
1582
1583	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1584	if (!cic)
1585		return NULL;
1586
1587	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1588	if (cfqq) {
1589		sector_t sector = bio->bi_sector + bio_sectors(bio);
1590
1591		return elv_rb_find(&cfqq->sort_list, sector);
1592	}
1593
1594	return NULL;
1595}
1596
1597static void cfq_activate_request(struct request_queue *q, struct request *rq)
1598{
1599	struct cfq_data *cfqd = q->elevator->elevator_data;
1600
1601	cfqd->rq_in_driver++;
1602	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1603						cfqd->rq_in_driver);
1604
1605	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1606}
1607
1608static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1609{
1610	struct cfq_data *cfqd = q->elevator->elevator_data;
1611
1612	WARN_ON(!cfqd->rq_in_driver);
1613	cfqd->rq_in_driver--;
1614	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1615						cfqd->rq_in_driver);
1616}
1617
1618static void cfq_remove_request(struct request *rq)
1619{
1620	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1621
1622	if (cfqq->next_rq == rq)
1623		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1624
1625	list_del_init(&rq->queuelist);
1626	cfq_del_rq_rb(rq);
1627
1628	cfqq->cfqd->rq_queued--;
1629	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1630					rq_data_dir(rq), rq_is_sync(rq));
1631	if (rq->cmd_flags & REQ_PRIO) {
1632		WARN_ON(!cfqq->prio_pending);
1633		cfqq->prio_pending--;
1634	}
1635}
1636
1637static int cfq_merge(struct request_queue *q, struct request **req,
1638		     struct bio *bio)
1639{
1640	struct cfq_data *cfqd = q->elevator->elevator_data;
1641	struct request *__rq;
1642
1643	__rq = cfq_find_rq_fmerge(cfqd, bio);
1644	if (__rq && elv_rq_merge_ok(__rq, bio)) {
1645		*req = __rq;
1646		return ELEVATOR_FRONT_MERGE;
1647	}
1648
1649	return ELEVATOR_NO_MERGE;
1650}
1651
1652static void cfq_merged_request(struct request_queue *q, struct request *req,
1653			       int type)
1654{
1655	if (type == ELEVATOR_FRONT_MERGE) {
1656		struct cfq_queue *cfqq = RQ_CFQQ(req);
1657
1658		cfq_reposition_rq_rb(cfqq, req);
1659	}
1660}
1661
1662static void cfq_bio_merged(struct request_queue *q, struct request *req,
1663				struct bio *bio)
1664{
1665	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1666					bio_data_dir(bio), cfq_bio_sync(bio));
1667}
1668
1669static void
1670cfq_merged_requests(struct request_queue *q, struct request *rq,
1671		    struct request *next)
1672{
1673	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1674	struct cfq_data *cfqd = q->elevator->elevator_data;
1675
1676	/*
1677	 * reposition in fifo if next is older than rq
1678	 */
1679	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1680	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1681		list_move(&rq->queuelist, &next->queuelist);
1682		rq_set_fifo_time(rq, rq_fifo_time(next));
1683	}
1684
1685	if (cfqq->next_rq == next)
1686		cfqq->next_rq = rq;
1687	cfq_remove_request(next);
1688	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1689					rq_data_dir(next), rq_is_sync(next));
1690
1691	cfqq = RQ_CFQQ(next);
1692	/*
1693	 * all requests of this queue are merged to other queues, delete it
1694	 * from the service tree. If it's the active_queue,
1695	 * cfq_dispatch_requests() will choose to expire it or do idle
1696	 */
1697	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
1698	    cfqq != cfqd->active_queue)
1699		cfq_del_cfqq_rr(cfqd, cfqq);
1700}
1701
1702static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1703			   struct bio *bio)
1704{
1705	struct cfq_data *cfqd = q->elevator->elevator_data;
1706	struct cfq_io_cq *cic;
1707	struct cfq_queue *cfqq;
1708
1709	/*
1710	 * Disallow merge of a sync bio into an async request.
1711	 */
1712	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1713		return false;
1714
1715	/*
1716	 * Lookup the cfqq that this bio will be queued with and allow
1717	 * merge only if rq is queued there.
1718	 */
1719	cic = cfq_cic_lookup(cfqd, current->io_context);
1720	if (!cic)
1721		return false;
1722
1723	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1724	return cfqq == RQ_CFQQ(rq);
1725}
1726
1727static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1728{
1729	del_timer(&cfqd->idle_slice_timer);
1730	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1731}
1732
1733static void __cfq_set_active_queue(struct cfq_data *cfqd,
1734				   struct cfq_queue *cfqq)
1735{
1736	if (cfqq) {
1737		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1738				cfqd->serving_prio, cfqd->serving_type);
1739		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1740		cfqq->slice_start = 0;
1741		cfqq->dispatch_start = jiffies;
1742		cfqq->allocated_slice = 0;
1743		cfqq->slice_end = 0;
1744		cfqq->slice_dispatch = 0;
1745		cfqq->nr_sectors = 0;
1746
1747		cfq_clear_cfqq_wait_request(cfqq);
1748		cfq_clear_cfqq_must_dispatch(cfqq);
1749		cfq_clear_cfqq_must_alloc_slice(cfqq);
1750		cfq_clear_cfqq_fifo_expire(cfqq);
1751		cfq_mark_cfqq_slice_new(cfqq);
1752
1753		cfq_del_timer(cfqd, cfqq);
1754	}
1755
1756	cfqd->active_queue = cfqq;
1757}
1758
1759/*
1760 * current cfqq expired its slice (or was too idle), select new one
1761 */
1762static void
1763__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1764		    bool timed_out)
1765{
1766	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1767
1768	if (cfq_cfqq_wait_request(cfqq))
1769		cfq_del_timer(cfqd, cfqq);
1770
1771	cfq_clear_cfqq_wait_request(cfqq);
1772	cfq_clear_cfqq_wait_busy(cfqq);
1773
1774	/*
1775	 * If this cfqq is shared between multiple processes, check to
1776	 * make sure that those processes are still issuing I/Os within
1777	 * the mean seek distance.  If not, it may be time to break the
1778	 * queues apart again.
1779	 */
1780	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1781		cfq_mark_cfqq_split_coop(cfqq);
1782
1783	/*
1784	 * store what was left of this slice, if the queue idled/timed out
1785	 */
1786	if (timed_out) {
1787		if (cfq_cfqq_slice_new(cfqq))
1788			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1789		else
1790			cfqq->slice_resid = cfqq->slice_end - jiffies;
1791		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1792	}
1793
1794	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1795
1796	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1797		cfq_del_cfqq_rr(cfqd, cfqq);
1798
1799	cfq_resort_rr_list(cfqd, cfqq);
1800
1801	if (cfqq == cfqd->active_queue)
1802		cfqd->active_queue = NULL;
1803
1804	if (cfqd->active_cic) {
1805		put_io_context(cfqd->active_cic->icq.ioc);
1806		cfqd->active_cic = NULL;
1807	}
1808}
1809
1810static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1811{
1812	struct cfq_queue *cfqq = cfqd->active_queue;
1813
1814	if (cfqq)
1815		__cfq_slice_expired(cfqd, cfqq, timed_out);
1816}
1817
1818/*
1819 * Get next queue for service. Unless we have a queue preemption,
1820 * we'll simply select the first cfqq in the service tree.
1821 */
1822static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1823{
1824	struct cfq_rb_root *service_tree =
1825		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1826					cfqd->serving_type);
1827
1828	if (!cfqd->rq_queued)
1829		return NULL;
1830
1831	/* There is nothing to dispatch */
1832	if (!service_tree)
1833		return NULL;
1834	if (RB_EMPTY_ROOT(&service_tree->rb))
1835		return NULL;
1836	return cfq_rb_first(service_tree);
1837}
1838
1839static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1840{
1841	struct cfq_group *cfqg;
1842	struct cfq_queue *cfqq;
1843	int i, j;
1844	struct cfq_rb_root *st;
1845
1846	if (!cfqd->rq_queued)
1847		return NULL;
1848
1849	cfqg = cfq_get_next_cfqg(cfqd);
1850	if (!cfqg)
1851		return NULL;
1852
1853	for_each_cfqg_st(cfqg, i, j, st)
1854		if ((cfqq = cfq_rb_first(st)) != NULL)
1855			return cfqq;
1856	return NULL;
1857}
1858
1859/*
1860 * Get and set a new active queue for service.
1861 */
1862static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
1863					      struct cfq_queue *cfqq)
1864{
1865	if (!cfqq)
1866		cfqq = cfq_get_next_queue(cfqd);
1867
1868	__cfq_set_active_queue(cfqd, cfqq);
1869	return cfqq;
1870}
1871
1872static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1873					  struct request *rq)
1874{
1875	if (blk_rq_pos(rq) >= cfqd->last_position)
1876		return blk_rq_pos(rq) - cfqd->last_position;
1877	else
1878		return cfqd->last_position - blk_rq_pos(rq);
1879}
1880
1881static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1882			       struct request *rq)
1883{
1884	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1885}
1886
1887static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1888				    struct cfq_queue *cur_cfqq)
1889{
1890	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1891	struct rb_node *parent, *node;
1892	struct cfq_queue *__cfqq;
1893	sector_t sector = cfqd->last_position;
1894
1895	if (RB_EMPTY_ROOT(root))
1896		return NULL;
1897
1898	/*
1899	 * First, if we find a request starting at the end of the last
1900	 * request, choose it.
1901	 */
1902	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1903	if (__cfqq)
1904		return __cfqq;
1905
1906	/*
1907	 * If the exact sector wasn't found, the parent of the NULL leaf
1908	 * will contain the closest sector.
1909	 */
1910	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
1911	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1912		return __cfqq;
1913
1914	if (blk_rq_pos(__cfqq->next_rq) < sector)
1915		node = rb_next(&__cfqq->p_node);
1916	else
1917		node = rb_prev(&__cfqq->p_node);
1918	if (!node)
1919		return NULL;
1920
1921	__cfqq = rb_entry(node, struct cfq_queue, p_node);
1922	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1923		return __cfqq;
1924
1925	return NULL;
1926}
1927
1928/*
1929 * cfqd - obvious
1930 * cur_cfqq - passed in so that we don't decide that the current queue is
1931 * 	      closely cooperating with itself.
1932 *
1933 * So, basically we're assuming that that cur_cfqq has dispatched at least
1934 * one request, and that cfqd->last_position reflects a position on the disk
1935 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1936 * assumption.
1937 */
1938static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1939					      struct cfq_queue *cur_cfqq)
1940{
1941	struct cfq_queue *cfqq;
1942
1943	if (cfq_class_idle(cur_cfqq))
1944		return NULL;
1945	if (!cfq_cfqq_sync(cur_cfqq))
1946		return NULL;
1947	if (CFQQ_SEEKY(cur_cfqq))
1948		return NULL;
1949
1950	/*
1951	 * Don't search priority tree if it's the only queue in the group.
1952	 */
1953	if (cur_cfqq->cfqg->nr_cfqq == 1)
1954		return NULL;
1955
1956	/*
1957	 * We should notice if some of the queues are cooperating, eg
1958	 * working closely on the same area of the disk. In that case,
1959	 * we can group them together and don't waste time idling.
1960	 */
1961	cfqq = cfqq_close(cfqd, cur_cfqq);
1962	if (!cfqq)
1963		return NULL;
1964
1965	/* If new queue belongs to different cfq_group, don't choose it */
1966	if (cur_cfqq->cfqg != cfqq->cfqg)
1967		return NULL;
1968
1969	/*
1970	 * It only makes sense to merge sync queues.
1971	 */
1972	if (!cfq_cfqq_sync(cfqq))
1973		return NULL;
1974	if (CFQQ_SEEKY(cfqq))
1975		return NULL;
1976
1977	/*
1978	 * Do not merge queues of different priority classes
1979	 */
1980	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1981		return NULL;
1982
1983	return cfqq;
1984}
1985
1986/*
1987 * Determine whether we should enforce idle window for this queue.
1988 */
1989
1990static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1991{
1992	enum wl_prio_t prio = cfqq_prio(cfqq);
1993	struct cfq_rb_root *service_tree = cfqq->service_tree;
1994
1995	BUG_ON(!service_tree);
1996	BUG_ON(!service_tree->count);
1997
1998	if (!cfqd->cfq_slice_idle)
1999		return false;
2000
2001	/* We never do for idle class queues. */
2002	if (prio == IDLE_WORKLOAD)
2003		return false;
2004
2005	/* We do for queues that were marked with idle window flag. */
2006	if (cfq_cfqq_idle_window(cfqq) &&
2007	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2008		return true;
2009
2010	/*
2011	 * Otherwise, we do only if they are the last ones
2012	 * in their service tree.
2013	 */
2014	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
2015	   !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
2016		return true;
2017	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
2018			service_tree->count);
2019	return false;
2020}
2021
2022static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2023{
2024	struct cfq_queue *cfqq = cfqd->active_queue;
2025	struct cfq_io_cq *cic;
2026	unsigned long sl, group_idle = 0;
2027
2028	/*
2029	 * SSD device without seek penalty, disable idling. But only do so
2030	 * for devices that support queuing, otherwise we still have a problem
2031	 * with sync vs async workloads.
2032	 */
2033	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2034		return;
2035
2036	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2037	WARN_ON(cfq_cfqq_slice_new(cfqq));
2038
2039	/*
2040	 * idle is disabled, either manually or by past process history
2041	 */
2042	if (!cfq_should_idle(cfqd, cfqq)) {
2043		/* no queue idling. Check for group idling */
2044		if (cfqd->cfq_group_idle)
2045			group_idle = cfqd->cfq_group_idle;
2046		else
2047			return;
2048	}
2049
2050	/*
2051	 * still active requests from this queue, don't idle
2052	 */
2053	if (cfqq->dispatched)
2054		return;
2055
2056	/*
2057	 * task has exited, don't wait
2058	 */
2059	cic = cfqd->active_cic;
2060	if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
2061		return;
2062
2063	/*
2064	 * If our average think time is larger than the remaining time
2065	 * slice, then don't idle. This avoids overrunning the allotted
2066	 * time slice.
2067	 */
2068	if (sample_valid(cic->ttime.ttime_samples) &&
2069	    (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2070		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2071			     cic->ttime.ttime_mean);
2072		return;
2073	}
2074
2075	/* There are other queues in the group, don't do group idle */
2076	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2077		return;
2078
2079	cfq_mark_cfqq_wait_request(cfqq);
2080
2081	if (group_idle)
2082		sl = cfqd->cfq_group_idle;
2083	else
2084		sl = cfqd->cfq_slice_idle;
2085
2086	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2087	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
2088	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2089			group_idle ? 1 : 0);
2090}
2091
2092/*
2093 * Move request from internal lists to the request queue dispatch list.
2094 */
2095static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2096{
2097	struct cfq_data *cfqd = q->elevator->elevator_data;
2098	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2099
2100	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2101
2102	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2103	cfq_remove_request(rq);
2104	cfqq->dispatched++;
2105	(RQ_CFQG(rq))->dispatched++;
2106	elv_dispatch_sort(q, rq);
2107
2108	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2109	cfqq->nr_sectors += blk_rq_sectors(rq);
2110	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
2111					rq_data_dir(rq), rq_is_sync(rq));
2112}
2113
2114/*
2115 * return expired entry, or NULL to just start from scratch in rbtree
2116 */
2117static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2118{
2119	struct request *rq = NULL;
2120
2121	if (cfq_cfqq_fifo_expire(cfqq))
2122		return NULL;
2123
2124	cfq_mark_cfqq_fifo_expire(cfqq);
2125
2126	if (list_empty(&cfqq->fifo))
2127		return NULL;
2128
2129	rq = rq_entry_fifo(cfqq->fifo.next);
2130	if (time_before(jiffies, rq_fifo_time(rq)))
2131		rq = NULL;
2132
2133	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2134	return rq;
2135}
2136
2137static inline int
2138cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2139{
2140	const int base_rq = cfqd->cfq_slice_async_rq;
2141
2142	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2143
2144	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2145}
2146
2147/*
2148 * Must be called with the queue_lock held.
2149 */
2150static int cfqq_process_refs(struct cfq_queue *cfqq)
2151{
2152	int process_refs, io_refs;
2153
2154	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2155	process_refs = cfqq->ref - io_refs;
2156	BUG_ON(process_refs < 0);
2157	return process_refs;
2158}
2159
2160static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2161{
2162	int process_refs, new_process_refs;
2163	struct cfq_queue *__cfqq;
2164
2165	/*
2166	 * If there are no process references on the new_cfqq, then it is
2167	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2168	 * chain may have dropped their last reference (not just their
2169	 * last process reference).
2170	 */
2171	if (!cfqq_process_refs(new_cfqq))
2172		return;
2173
2174	/* Avoid a circular list and skip interim queue merges */
2175	while ((__cfqq = new_cfqq->new_cfqq)) {
2176		if (__cfqq == cfqq)
2177			return;
2178		new_cfqq = __cfqq;
2179	}
2180
2181	process_refs = cfqq_process_refs(cfqq);
2182	new_process_refs = cfqq_process_refs(new_cfqq);
2183	/*
2184	 * If the process for the cfqq has gone away, there is no
2185	 * sense in merging the queues.
2186	 */
2187	if (process_refs == 0 || new_process_refs == 0)
2188		return;
2189
2190	/*
2191	 * Merge in the direction of the lesser amount of work.
2192	 */
2193	if (new_process_refs >= process_refs) {
2194		cfqq->new_cfqq = new_cfqq;
2195		new_cfqq->ref += process_refs;
2196	} else {
2197		new_cfqq->new_cfqq = cfqq;
2198		cfqq->ref += new_process_refs;
2199	}
2200}
2201
2202static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2203				struct cfq_group *cfqg, enum wl_prio_t prio)
2204{
2205	struct cfq_queue *queue;
2206	int i;
2207	bool key_valid = false;
2208	unsigned long lowest_key = 0;
2209	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2210
2211	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2212		/* select the one with lowest rb_key */
2213		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2214		if (queue &&
2215		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
2216			lowest_key = queue->rb_key;
2217			cur_best = i;
2218			key_valid = true;
2219		}
2220	}
2221
2222	return cur_best;
2223}
2224
2225static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2226{
2227	unsigned slice;
2228	unsigned count;
2229	struct cfq_rb_root *st;
2230	unsigned group_slice;
2231	enum wl_prio_t original_prio = cfqd->serving_prio;
2232
2233	/* Choose next priority. RT > BE > IDLE */
2234	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2235		cfqd->serving_prio = RT_WORKLOAD;
2236	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2237		cfqd->serving_prio = BE_WORKLOAD;
2238	else {
2239		cfqd->serving_prio = IDLE_WORKLOAD;
2240		cfqd->workload_expires = jiffies + 1;
2241		return;
2242	}
2243
2244	if (original_prio != cfqd->serving_prio)
2245		goto new_workload;
2246
2247	/*
2248	 * For RT and BE, we have to choose also the type
2249	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2250	 * expiration time
2251	 */
2252	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2253	count = st->count;
2254
2255	/*
2256	 * check workload expiration, and that we still have other queues ready
2257	 */
2258	if (count && !time_after(jiffies, cfqd->workload_expires))
2259		return;
2260
2261new_workload:
2262	/* otherwise select new workload type */
2263	cfqd->serving_type =
2264		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2265	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2266	count = st->count;
2267
2268	/*
2269	 * the workload slice is computed as a fraction of target latency
2270	 * proportional to the number of queues in that workload, over
2271	 * all the queues in the same priority class
2272	 */
2273	group_slice = cfq_group_slice(cfqd, cfqg);
2274
2275	slice = group_slice * count /
2276		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2277		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2278
2279	if (cfqd->serving_type == ASYNC_WORKLOAD) {
2280		unsigned int tmp;
2281
2282		/*
2283		 * Async queues are currently system wide. Just taking
2284		 * proportion of queues with-in same group will lead to higher
2285		 * async ratio system wide as generally root group is going
2286		 * to have higher weight. A more accurate thing would be to
2287		 * calculate system wide asnc/sync ratio.
2288		 */
2289		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2290		tmp = tmp/cfqd->busy_queues;
2291		slice = min_t(unsigned, slice, tmp);
2292
2293		/* async workload slice is scaled down according to
2294		 * the sync/async slice ratio. */
2295		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2296	} else
2297		/* sync workload slice is at least 2 * cfq_slice_idle */
2298		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2299
2300	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2301	cfq_log(cfqd, "workload slice:%d", slice);
2302	cfqd->workload_expires = jiffies + slice;
2303}
2304
2305static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2306{
2307	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2308	struct cfq_group *cfqg;
2309
2310	if (RB_EMPTY_ROOT(&st->rb))
2311		return NULL;
2312	cfqg = cfq_rb_first_group(st);
2313	update_min_vdisktime(st);
2314	return cfqg;
2315}
2316
2317static void cfq_choose_cfqg(struct cfq_data *cfqd)
2318{
2319	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2320
2321	cfqd->serving_group = cfqg;
2322
2323	/* Restore the workload type data */
2324	if (cfqg->saved_workload_slice) {
2325		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2326		cfqd->serving_type = cfqg->saved_workload;
2327		cfqd->serving_prio = cfqg->saved_serving_prio;
2328	} else
2329		cfqd->workload_expires = jiffies - 1;
2330
2331	choose_service_tree(cfqd, cfqg);
2332}
2333
2334/*
2335 * Select a queue for service. If we have a current active queue,
2336 * check whether to continue servicing it, or retrieve and set a new one.
2337 */
2338static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2339{
2340	struct cfq_queue *cfqq, *new_cfqq = NULL;
2341
2342	cfqq = cfqd->active_queue;
2343	if (!cfqq)
2344		goto new_queue;
2345
2346	if (!cfqd->rq_queued)
2347		return NULL;
2348
2349	/*
2350	 * We were waiting for group to get backlogged. Expire the queue
2351	 */
2352	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2353		goto expire;
2354
2355	/*
2356	 * The active queue has run out of time, expire it and select new.
2357	 */
2358	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2359		/*
2360		 * If slice had not expired at the completion of last request
2361		 * we might not have turned on wait_busy flag. Don't expire
2362		 * the queue yet. Allow the group to get backlogged.
2363		 *
2364		 * The very fact that we have used the slice, that means we
2365		 * have been idling all along on this queue and it should be
2366		 * ok to wait for this request to complete.
2367		 */
2368		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2369		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2370			cfqq = NULL;
2371			goto keep_queue;
2372		} else
2373			goto check_group_idle;
2374	}
2375
2376	/*
2377	 * The active queue has requests and isn't expired, allow it to
2378	 * dispatch.
2379	 */
2380	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2381		goto keep_queue;
2382
2383	/*
2384	 * If another queue has a request waiting within our mean seek
2385	 * distance, let it run.  The expire code will check for close
2386	 * cooperators and put the close queue at the front of the service
2387	 * tree.  If possible, merge the expiring queue with the new cfqq.
2388	 */
2389	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2390	if (new_cfqq) {
2391		if (!cfqq->new_cfqq)
2392			cfq_setup_merge(cfqq, new_cfqq);
2393		goto expire;
2394	}
2395
2396	/*
2397	 * No requests pending. If the active queue still has requests in
2398	 * flight or is idling for a new request, allow either of these
2399	 * conditions to happen (or time out) before selecting a new queue.
2400	 */
2401	if (timer_pending(&cfqd->idle_slice_timer)) {
2402		cfqq = NULL;
2403		goto keep_queue;
2404	}
2405
2406	/*
2407	 * This is a deep seek queue, but the device is much faster than
2408	 * the queue can deliver, don't idle
2409	 **/
2410	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2411	    (cfq_cfqq_slice_new(cfqq) ||
2412	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2413		cfq_clear_cfqq_deep(cfqq);
2414		cfq_clear_cfqq_idle_window(cfqq);
2415	}
2416
2417	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2418		cfqq = NULL;
2419		goto keep_queue;
2420	}
2421
2422	/*
2423	 * If group idle is enabled and there are requests dispatched from
2424	 * this group, wait for requests to complete.
2425	 */
2426check_group_idle:
2427	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
2428	    cfqq->cfqg->dispatched &&
2429	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
2430		cfqq = NULL;
2431		goto keep_queue;
2432	}
2433
2434expire:
2435	cfq_slice_expired(cfqd, 0);
2436new_queue:
2437	/*
2438	 * Current queue expired. Check if we have to switch to a new
2439	 * service tree
2440	 */
2441	if (!new_cfqq)
2442		cfq_choose_cfqg(cfqd);
2443
2444	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2445keep_queue:
2446	return cfqq;
2447}
2448
2449static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2450{
2451	int dispatched = 0;
2452
2453	while (cfqq->next_rq) {
2454		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2455		dispatched++;
2456	}
2457
2458	BUG_ON(!list_empty(&cfqq->fifo));
2459
2460	/* By default cfqq is not expired if it is empty. Do it explicitly */
2461	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2462	return dispatched;
2463}
2464
2465/*
2466 * Drain our current requests. Used for barriers and when switching
2467 * io schedulers on-the-fly.
2468 */
2469static int cfq_forced_dispatch(struct cfq_data *cfqd)
2470{
2471	struct cfq_queue *cfqq;
2472	int dispatched = 0;
2473
2474	/* Expire the timeslice of the current active queue first */
2475	cfq_slice_expired(cfqd, 0);
2476	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2477		__cfq_set_active_queue(cfqd, cfqq);
2478		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2479	}
2480
2481	BUG_ON(cfqd->busy_queues);
2482
2483	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2484	return dispatched;
2485}
2486
2487static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2488	struct cfq_queue *cfqq)
2489{
2490	/* the queue hasn't finished any request, can't estimate */
2491	if (cfq_cfqq_slice_new(cfqq))
2492		return true;
2493	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2494		cfqq->slice_end))
2495		return true;
2496
2497	return false;
2498}
2499
2500static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2501{
2502	unsigned int max_dispatch;
2503
2504	/*
2505	 * Drain async requests before we start sync IO
2506	 */
2507	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2508		return false;
2509
2510	/*
2511	 * If this is an async queue and we have sync IO in flight, let it wait
2512	 */
2513	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2514		return false;
2515
2516	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2517	if (cfq_class_idle(cfqq))
2518		max_dispatch = 1;
2519
2520	/*
2521	 * Does this cfqq already have too much IO in flight?
2522	 */
2523	if (cfqq->dispatched >= max_dispatch) {
2524		bool promote_sync = false;
2525		/*
2526		 * idle queue must always only have a single IO in flight
2527		 */
2528		if (cfq_class_idle(cfqq))
2529			return false;
2530
2531		/*
2532		 * If there is only one sync queue
2533		 * we can ignore async queue here and give the sync
2534		 * queue no dispatch limit. The reason is a sync queue can
2535		 * preempt async queue, limiting the sync queue doesn't make
2536		 * sense. This is useful for aiostress test.
2537		 */
2538		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2539			promote_sync = true;
2540
2541		/*
2542		 * We have other queues, don't allow more IO from this one
2543		 */
2544		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2545				!promote_sync)
2546			return false;
2547
2548		/*
2549		 * Sole queue user, no limit
2550		 */
2551		if (cfqd->busy_queues == 1 || promote_sync)
2552			max_dispatch = -1;
2553		else
2554			/*
2555			 * Normally we start throttling cfqq when cfq_quantum/2
2556			 * requests have been dispatched. But we can drive
2557			 * deeper queue depths at the beginning of slice
2558			 * subjected to upper limit of cfq_quantum.
2559			 * */
2560			max_dispatch = cfqd->cfq_quantum;
2561	}
2562
2563	/*
2564	 * Async queues must wait a bit before being allowed dispatch.
2565	 * We also ramp up the dispatch depth gradually for async IO,
2566	 * based on the last sync IO we serviced
2567	 */
2568	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2569		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2570		unsigned int depth;
2571
2572		depth = last_sync / cfqd->cfq_slice[1];
2573		if (!depth && !cfqq->dispatched)
2574			depth = 1;
2575		if (depth < max_dispatch)
2576			max_dispatch = depth;
2577	}
2578
2579	/*
2580	 * If we're below the current max, allow a dispatch
2581	 */
2582	return cfqq->dispatched < max_dispatch;
2583}
2584
2585/*
2586 * Dispatch a request from cfqq, moving them to the request queue
2587 * dispatch list.
2588 */
2589static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2590{
2591	struct request *rq;
2592
2593	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2594
2595	if (!cfq_may_dispatch(cfqd, cfqq))
2596		return false;
2597
2598	/*
2599	 * follow expired path, else get first next available
2600	 */
2601	rq = cfq_check_fifo(cfqq);
2602	if (!rq)
2603		rq = cfqq->next_rq;
2604
2605	/*
2606	 * insert request into driver dispatch list
2607	 */
2608	cfq_dispatch_insert(cfqd->queue, rq);
2609
2610	if (!cfqd->active_cic) {
2611		struct cfq_io_cq *cic = RQ_CIC(rq);
2612
2613		atomic_long_inc(&cic->icq.ioc->refcount);
2614		cfqd->active_cic = cic;
2615	}
2616
2617	return true;
2618}
2619
2620/*
2621 * Find the cfqq that we need to service and move a request from that to the
2622 * dispatch list
2623 */
2624static int cfq_dispatch_requests(struct request_queue *q, int force)
2625{
2626	struct cfq_data *cfqd = q->elevator->elevator_data;
2627	struct cfq_queue *cfqq;
2628
2629	if (!cfqd->busy_queues)
2630		return 0;
2631
2632	if (unlikely(force))
2633		return cfq_forced_dispatch(cfqd);
2634
2635	cfqq = cfq_select_queue(cfqd);
2636	if (!cfqq)
2637		return 0;
2638
2639	/*
2640	 * Dispatch a request from this cfqq, if it is allowed
2641	 */
2642	if (!cfq_dispatch_request(cfqd, cfqq))
2643		return 0;
2644
2645	cfqq->slice_dispatch++;
2646	cfq_clear_cfqq_must_dispatch(cfqq);
2647
2648	/*
2649	 * expire an async queue immediately if it has used up its slice. idle
2650	 * queue always expire after 1 dispatch round.
2651	 */
2652	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2653	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2654	    cfq_class_idle(cfqq))) {
2655		cfqq->slice_end = jiffies + 1;
2656		cfq_slice_expired(cfqd, 0);
2657	}
2658
2659	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2660	return 1;
2661}
2662
2663/*
2664 * task holds one reference to the queue, dropped when task exits. each rq
2665 * in-flight on this queue also holds a reference, dropped when rq is freed.
2666 *
2667 * Each cfq queue took a reference on the parent group. Drop it now.
2668 * queue lock must be held here.
2669 */
2670static void cfq_put_queue(struct cfq_queue *cfqq)
2671{
2672	struct cfq_data *cfqd = cfqq->cfqd;
2673	struct cfq_group *cfqg;
2674
2675	BUG_ON(cfqq->ref <= 0);
2676
2677	cfqq->ref--;
2678	if (cfqq->ref)
2679		return;
2680
2681	cfq_log_cfqq(cfqd, cfqq, "put_queue");
2682	BUG_ON(rb_first(&cfqq->sort_list));
2683	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2684	cfqg = cfqq->cfqg;
2685
2686	if (unlikely(cfqd->active_queue == cfqq)) {
2687		__cfq_slice_expired(cfqd, cfqq, 0);
2688		cfq_schedule_dispatch(cfqd);
2689	}
2690
2691	BUG_ON(cfq_cfqq_on_rr(cfqq));
2692	kmem_cache_free(cfq_pool, cfqq);
2693	cfq_put_cfqg(cfqg);
2694}
2695
2696static void cfq_put_cooperator(struct cfq_queue *cfqq)
2697{
2698	struct cfq_queue *__cfqq, *next;
2699
2700	/*
2701	 * If this queue was scheduled to merge with another queue, be
2702	 * sure to drop the reference taken on that queue (and others in
2703	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2704	 */
2705	__cfqq = cfqq->new_cfqq;
2706	while (__cfqq) {
2707		if (__cfqq == cfqq) {
2708			WARN(1, "cfqq->new_cfqq loop detected\n");
2709			break;
2710		}
2711		next = __cfqq->new_cfqq;
2712		cfq_put_queue(__cfqq);
2713		__cfqq = next;
2714	}
2715}
2716
2717static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2718{
2719	if (unlikely(cfqq == cfqd->active_queue)) {
2720		__cfq_slice_expired(cfqd, cfqq, 0);
2721		cfq_schedule_dispatch(cfqd);
2722	}
2723
2724	cfq_put_cooperator(cfqq);
2725
2726	cfq_put_queue(cfqq);
2727}
2728
2729static void cfq_init_icq(struct io_cq *icq)
2730{
2731	struct cfq_io_cq *cic = icq_to_cic(icq);
2732
2733	cic->ttime.last_end_request = jiffies;
2734}
2735
2736static void cfq_exit_icq(struct io_cq *icq)
2737{
2738	struct cfq_io_cq *cic = icq_to_cic(icq);
2739	struct cfq_data *cfqd = cic_to_cfqd(cic);
2740
2741	if (cic->cfqq[BLK_RW_ASYNC]) {
2742		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2743		cic->cfqq[BLK_RW_ASYNC] = NULL;
2744	}
2745
2746	if (cic->cfqq[BLK_RW_SYNC]) {
2747		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2748		cic->cfqq[BLK_RW_SYNC] = NULL;
2749	}
2750}
2751
2752static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2753{
2754	struct task_struct *tsk = current;
2755	int ioprio_class;
2756
2757	if (!cfq_cfqq_prio_changed(cfqq))
2758		return;
2759
2760	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2761	switch (ioprio_class) {
2762	default:
2763		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2764	case IOPRIO_CLASS_NONE:
2765		/*
2766		 * no prio set, inherit CPU scheduling settings
2767		 */
2768		cfqq->ioprio = task_nice_ioprio(tsk);
2769		cfqq->ioprio_class = task_nice_ioclass(tsk);
2770		break;
2771	case IOPRIO_CLASS_RT:
2772		cfqq->ioprio = task_ioprio(ioc);
2773		cfqq->ioprio_class = IOPRIO_CLASS_RT;
2774		break;
2775	case IOPRIO_CLASS_BE:
2776		cfqq->ioprio = task_ioprio(ioc);
2777		cfqq->ioprio_class = IOPRIO_CLASS_BE;
2778		break;
2779	case IOPRIO_CLASS_IDLE:
2780		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2781		cfqq->ioprio = 7;
2782		cfq_clear_cfqq_idle_window(cfqq);
2783		break;
2784	}
2785
2786	/*
2787	 * keep track of original prio settings in case we have to temporarily
2788	 * elevate the priority of this queue
2789	 */
2790	cfqq->org_ioprio = cfqq->ioprio;
2791	cfq_clear_cfqq_prio_changed(cfqq);
2792}
2793
2794static void changed_ioprio(struct cfq_io_cq *cic)
2795{
2796	struct cfq_data *cfqd = cic_to_cfqd(cic);
2797	struct cfq_queue *cfqq;
2798
2799	if (unlikely(!cfqd))
2800		return;
2801
2802	cfqq = cic->cfqq[BLK_RW_ASYNC];
2803	if (cfqq) {
2804		struct cfq_queue *new_cfqq;
2805		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
2806						GFP_ATOMIC);
2807		if (new_cfqq) {
2808			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2809			cfq_put_queue(cfqq);
2810		}
2811	}
2812
2813	cfqq = cic->cfqq[BLK_RW_SYNC];
2814	if (cfqq)
2815		cfq_mark_cfqq_prio_changed(cfqq);
2816}
2817
2818static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2819			  pid_t pid, bool is_sync)
2820{
2821	RB_CLEAR_NODE(&cfqq->rb_node);
2822	RB_CLEAR_NODE(&cfqq->p_node);
2823	INIT_LIST_HEAD(&cfqq->fifo);
2824
2825	cfqq->ref = 0;
2826	cfqq->cfqd = cfqd;
2827
2828	cfq_mark_cfqq_prio_changed(cfqq);
2829
2830	if (is_sync) {
2831		if (!cfq_class_idle(cfqq))
2832			cfq_mark_cfqq_idle_window(cfqq);
2833		cfq_mark_cfqq_sync(cfqq);
2834	}
2835	cfqq->pid = pid;
2836}
2837
2838#ifdef CONFIG_CFQ_GROUP_IOSCHED
2839static void changed_cgroup(struct cfq_io_cq *cic)
2840{
2841	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2842	struct cfq_data *cfqd = cic_to_cfqd(cic);
2843	struct request_queue *q;
2844
2845	if (unlikely(!cfqd))
2846		return;
2847
2848	q = cfqd->queue;
2849
2850	if (sync_cfqq) {
2851		/*
2852		 * Drop reference to sync queue. A new sync queue will be
2853		 * assigned in new group upon arrival of a fresh request.
2854		 */
2855		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2856		cic_set_cfqq(cic, NULL, 1);
2857		cfq_put_queue(sync_cfqq);
2858	}
2859}
2860#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2861
2862static struct cfq_queue *
2863cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2864		     struct io_context *ioc, gfp_t gfp_mask)
2865{
2866	struct blkio_cgroup *blkcg;
2867	struct cfq_queue *cfqq, *new_cfqq = NULL;
2868	struct cfq_io_cq *cic;
2869	struct cfq_group *cfqg;
2870
2871retry:
2872	rcu_read_lock();
2873
2874	blkcg = task_blkio_cgroup(current);
2875
2876	cfqg = cfq_get_cfqg(cfqd, blkcg);
2877	cic = cfq_cic_lookup(cfqd, ioc);
2878	/* cic always exists here */
2879	cfqq = cic_to_cfqq(cic, is_sync);
2880
2881	/*
2882	 * Always try a new alloc if we fell back to the OOM cfqq
2883	 * originally, since it should just be a temporary situation.
2884	 */
2885	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2886		cfqq = NULL;
2887		if (new_cfqq) {
2888			cfqq = new_cfqq;
2889			new_cfqq = NULL;
2890		} else if (gfp_mask & __GFP_WAIT) {
2891			rcu_read_unlock();
2892			spin_unlock_irq(cfqd->queue->queue_lock);
2893			new_cfqq = kmem_cache_alloc_node(cfq_pool,
2894					gfp_mask | __GFP_ZERO,
2895					cfqd->queue->node);
2896			spin_lock_irq(cfqd->queue->queue_lock);
2897			if (new_cfqq)
2898				goto retry;
2899		} else {
2900			cfqq = kmem_cache_alloc_node(cfq_pool,
2901					gfp_mask | __GFP_ZERO,
2902					cfqd->queue->node);
2903		}
2904
2905		if (cfqq) {
2906			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2907			cfq_init_prio_data(cfqq, ioc);
2908			cfq_link_cfqq_cfqg(cfqq, cfqg);
2909			cfq_log_cfqq(cfqd, cfqq, "alloced");
2910		} else
2911			cfqq = &cfqd->oom_cfqq;
2912	}
2913
2914	if (new_cfqq)
2915		kmem_cache_free(cfq_pool, new_cfqq);
2916
2917	rcu_read_unlock();
2918	return cfqq;
2919}
2920
2921static struct cfq_queue **
2922cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2923{
2924	switch (ioprio_class) {
2925	case IOPRIO_CLASS_RT:
2926		return &cfqd->async_cfqq[0][ioprio];
2927	case IOPRIO_CLASS_BE:
2928		return &cfqd->async_cfqq[1][ioprio];
2929	case IOPRIO_CLASS_IDLE:
2930		return &cfqd->async_idle_cfqq;
2931	default:
2932		BUG();
2933	}
2934}
2935
2936static struct cfq_queue *
2937cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2938	      gfp_t gfp_mask)
2939{
2940	const int ioprio = task_ioprio(ioc);
2941	const int ioprio_class = task_ioprio_class(ioc);
2942	struct cfq_queue **async_cfqq = NULL;
2943	struct cfq_queue *cfqq = NULL;
2944
2945	if (!is_sync) {
2946		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2947		cfqq = *async_cfqq;
2948	}
2949
2950	if (!cfqq)
2951		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2952
2953	/*
2954	 * pin the queue now that it's allocated, scheduler exit will prune it
2955	 */
2956	if (!is_sync && !(*async_cfqq)) {
2957		cfqq->ref++;
2958		*async_cfqq = cfqq;
2959	}
2960
2961	cfqq->ref++;
2962	return cfqq;
2963}
2964
2965static void
2966__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
2967{
2968	unsigned long elapsed = jiffies - ttime->last_end_request;
2969	elapsed = min(elapsed, 2UL * slice_idle);
2970
2971	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
2972	ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
2973	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
2974}
2975
2976static void
2977cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2978			struct cfq_io_cq *cic)
2979{
2980	if (cfq_cfqq_sync(cfqq)) {
2981		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
2982		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
2983			cfqd->cfq_slice_idle);
2984	}
2985#ifdef CONFIG_CFQ_GROUP_IOSCHED
2986	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
2987#endif
2988}
2989
2990static void
2991cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2992		       struct request *rq)
2993{
2994	sector_t sdist = 0;
2995	sector_t n_sec = blk_rq_sectors(rq);
2996	if (cfqq->last_request_pos) {
2997		if (cfqq->last_request_pos < blk_rq_pos(rq))
2998			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
2999		else
3000			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3001	}
3002
3003	cfqq->seek_history <<= 1;
3004	if (blk_queue_nonrot(cfqd->queue))
3005		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3006	else
3007		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3008}
3009
3010/*
3011 * Disable idle window if the process thinks too long or seeks so much that
3012 * it doesn't matter
3013 */
3014static void
3015cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3016		       struct cfq_io_cq *cic)
3017{
3018	int old_idle, enable_idle;
3019
3020	/*
3021	 * Don't idle for async or idle io prio class
3022	 */
3023	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3024		return;
3025
3026	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3027
3028	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3029		cfq_mark_cfqq_deep(cfqq);
3030
3031	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3032		enable_idle = 0;
3033	else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
3034		 !cfqd->cfq_slice_idle ||
3035		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3036		enable_idle = 0;
3037	else if (sample_valid(cic->ttime.ttime_samples)) {
3038		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3039			enable_idle = 0;
3040		else
3041			enable_idle = 1;
3042	}
3043
3044	if (old_idle != enable_idle) {
3045		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3046		if (enable_idle)
3047			cfq_mark_cfqq_idle_window(cfqq);
3048		else
3049			cfq_clear_cfqq_idle_window(cfqq);
3050	}
3051}
3052
3053/*
3054 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3055 * no or if we aren't sure, a 1 will cause a preempt.
3056 */
3057static bool
3058cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3059		   struct request *rq)
3060{
3061	struct cfq_queue *cfqq;
3062
3063	cfqq = cfqd->active_queue;
3064	if (!cfqq)
3065		return false;
3066
3067	if (cfq_class_idle(new_cfqq))
3068		return false;
3069
3070	if (cfq_class_idle(cfqq))
3071		return true;
3072
3073	/*
3074	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3075	 */
3076	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3077		return false;
3078
3079	/*
3080	 * if the new request is sync, but the currently running queue is
3081	 * not, let the sync request have priority.
3082	 */
3083	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3084		return true;
3085
3086	if (new_cfqq->cfqg != cfqq->cfqg)
3087		return false;
3088
3089	if (cfq_slice_used(cfqq))
3090		return true;
3091
3092	/* Allow preemption only if we are idling on sync-noidle tree */
3093	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3094	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3095	    new_cfqq->service_tree->count == 2 &&
3096	    RB_EMPTY_ROOT(&cfqq->sort_list))
3097		return true;
3098
3099	/*
3100	 * So both queues are sync. Let the new request get disk time if
3101	 * it's a metadata request and the current queue is doing regular IO.
3102	 */
3103	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3104		return true;
3105
3106	/*
3107	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3108	 */
3109	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3110		return true;
3111
3112	/* An idle queue should not be idle now for some reason */
3113	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3114		return true;
3115
3116	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3117		return false;
3118
3119	/*
3120	 * if this request is as-good as one we would expect from the
3121	 * current cfqq, let it preempt
3122	 */
3123	if (cfq_rq_close(cfqd, cfqq, rq))
3124		return true;
3125
3126	return false;
3127}
3128
3129/*
3130 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3131 * let it have half of its nominal slice.
3132 */
3133static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3134{
3135	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3136
3137	cfq_log_cfqq(cfqd, cfqq, "preempt");
3138	cfq_slice_expired(cfqd, 1);
3139
3140	/*
3141	 * workload type is changed, don't save slice, otherwise preempt
3142	 * doesn't happen
3143	 */
3144	if (old_type != cfqq_type(cfqq))
3145		cfqq->cfqg->saved_workload_slice = 0;
3146
3147	/*
3148	 * Put the new queue at the front of the of the current list,
3149	 * so we know that it will be selected next.
3150	 */
3151	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3152
3153	cfq_service_tree_add(cfqd, cfqq, 1);
3154
3155	cfqq->slice_end = 0;
3156	cfq_mark_cfqq_slice_new(cfqq);
3157}
3158
3159/*
3160 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3161 * something we should do about it
3162 */
3163static void
3164cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3165		struct request *rq)
3166{
3167	struct cfq_io_cq *cic = RQ_CIC(rq);
3168
3169	cfqd->rq_queued++;
3170	if (rq->cmd_flags & REQ_PRIO)
3171		cfqq->prio_pending++;
3172
3173	cfq_update_io_thinktime(cfqd, cfqq, cic);
3174	cfq_update_io_seektime(cfqd, cfqq, rq);
3175	cfq_update_idle_window(cfqd, cfqq, cic);
3176
3177	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3178
3179	if (cfqq == cfqd->active_queue) {
3180		/*
3181		 * Remember that we saw a request from this process, but
3182		 * don't start queuing just yet. Otherwise we risk seeing lots
3183		 * of tiny requests, because we disrupt the normal plugging
3184		 * and merging. If the request is already larger than a single
3185		 * page, let it rip immediately. For that case we assume that
3186		 * merging is already done. Ditto for a busy system that
3187		 * has other work pending, don't risk delaying until the
3188		 * idle timer unplug to continue working.
3189		 */
3190		if (cfq_cfqq_wait_request(cfqq)) {
3191			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3192			    cfqd->busy_queues > 1) {
3193				cfq_del_timer(cfqd, cfqq);
3194				cfq_clear_cfqq_wait_request(cfqq);
3195				__blk_run_queue(cfqd->queue);
3196			} else {
3197				cfq_blkiocg_update_idle_time_stats(
3198						&cfqq->cfqg->blkg);
3199				cfq_mark_cfqq_must_dispatch(cfqq);
3200			}
3201		}
3202	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3203		/*
3204		 * not the active queue - expire current slice if it is
3205		 * idle and has expired it's mean thinktime or this new queue
3206		 * has some old slice time left and is of higher priority or
3207		 * this new queue is RT and the current one is BE
3208		 */
3209		cfq_preempt_queue(cfqd, cfqq);
3210		__blk_run_queue(cfqd->queue);
3211	}
3212}
3213
3214static void cfq_insert_request(struct request_queue *q, struct request *rq)
3215{
3216	struct cfq_data *cfqd = q->elevator->elevator_data;
3217	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3218
3219	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3220	cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
3221
3222	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3223	list_add_tail(&rq->queuelist, &cfqq->fifo);
3224	cfq_add_rq_rb(rq);
3225	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3226			&cfqd->serving_group->blkg, rq_data_dir(rq),
3227			rq_is_sync(rq));
3228	cfq_rq_enqueued(cfqd, cfqq, rq);
3229}
3230
3231/*
3232 * Update hw_tag based on peak queue depth over 50 samples under
3233 * sufficient load.
3234 */
3235static void cfq_update_hw_tag(struct cfq_data *cfqd)
3236{
3237	struct cfq_queue *cfqq = cfqd->active_queue;
3238
3239	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3240		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3241
3242	if (cfqd->hw_tag == 1)
3243		return;
3244
3245	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3246	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3247		return;
3248
3249	/*
3250	 * If active queue hasn't enough requests and can idle, cfq might not
3251	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3252	 * case
3253	 */
3254	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3255	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3256	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3257		return;
3258
3259	if (cfqd->hw_tag_samples++ < 50)
3260		return;
3261
3262	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3263		cfqd->hw_tag = 1;
3264	else
3265		cfqd->hw_tag = 0;
3266}
3267
3268static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3269{
3270	struct cfq_io_cq *cic = cfqd->active_cic;
3271
3272	/* If the queue already has requests, don't wait */
3273	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3274		return false;
3275
3276	/* If there are other queues in the group, don't wait */
3277	if (cfqq->cfqg->nr_cfqq > 1)
3278		return false;
3279
3280	/* the only queue in the group, but think time is big */
3281	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
3282		return false;
3283
3284	if (cfq_slice_used(cfqq))
3285		return true;
3286
3287	/* if slice left is less than think time, wait busy */
3288	if (cic && sample_valid(cic->ttime.ttime_samples)
3289	    && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3290		return true;
3291
3292	/*
3293	 * If think times is less than a jiffy than ttime_mean=0 and above
3294	 * will not be true. It might happen that slice has not expired yet
3295	 * but will expire soon (4-5 ns) during select_queue(). To cover the
3296	 * case where think time is less than a jiffy, mark the queue wait
3297	 * busy if only 1 jiffy is left in the slice.
3298	 */
3299	if (cfqq->slice_end - jiffies == 1)
3300		return true;
3301
3302	return false;
3303}
3304
3305static void cfq_completed_request(struct request_queue *q, struct request *rq)
3306{
3307	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3308	struct cfq_data *cfqd = cfqq->cfqd;
3309	const int sync = rq_is_sync(rq);
3310	unsigned long now;
3311
3312	now = jiffies;
3313	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3314		     !!(rq->cmd_flags & REQ_NOIDLE));
3315
3316	cfq_update_hw_tag(cfqd);
3317
3318	WARN_ON(!cfqd->rq_in_driver);
3319	WARN_ON(!cfqq->dispatched);
3320	cfqd->rq_in_driver--;
3321	cfqq->dispatched--;
3322	(RQ_CFQG(rq))->dispatched--;
3323	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3324			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3325			rq_data_dir(rq), rq_is_sync(rq));
3326
3327	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3328
3329	if (sync) {
3330		struct cfq_rb_root *service_tree;
3331
3332		RQ_CIC(rq)->ttime.last_end_request = now;
3333
3334		if (cfq_cfqq_on_rr(cfqq))
3335			service_tree = cfqq->service_tree;
3336		else
3337			service_tree = service_tree_for(cfqq->cfqg,
3338				cfqq_prio(cfqq), cfqq_type(cfqq));
3339		service_tree->ttime.last_end_request = now;
3340		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3341			cfqd->last_delayed_sync = now;
3342	}
3343
3344#ifdef CONFIG_CFQ_GROUP_IOSCHED
3345	cfqq->cfqg->ttime.last_end_request = now;
3346#endif
3347
3348	/*
3349	 * If this is the active queue, check if it needs to be expired,
3350	 * or if we want to idle in case it has no pending requests.
3351	 */
3352	if (cfqd->active_queue == cfqq) {
3353		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3354
3355		if (cfq_cfqq_slice_new(cfqq)) {
3356			cfq_set_prio_slice(cfqd, cfqq);
3357			cfq_clear_cfqq_slice_new(cfqq);
3358		}
3359
3360		/*
3361		 * Should we wait for next request to come in before we expire
3362		 * the queue.
3363		 */
3364		if (cfq_should_wait_busy(cfqd, cfqq)) {
3365			unsigned long extend_sl = cfqd->cfq_slice_idle;
3366			if (!cfqd->cfq_slice_idle)
3367				extend_sl = cfqd->cfq_group_idle;
3368			cfqq->slice_end = jiffies + extend_sl;
3369			cfq_mark_cfqq_wait_busy(cfqq);
3370			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3371		}
3372
3373		/*
3374		 * Idling is not enabled on:
3375		 * - expired queues
3376		 * - idle-priority queues
3377		 * - async queues
3378		 * - queues with still some requests queued
3379		 * - when there is a close cooperator
3380		 */
3381		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3382			cfq_slice_expired(cfqd, 1);
3383		else if (sync && cfqq_empty &&
3384			 !cfq_close_cooperator(cfqd, cfqq)) {
3385			cfq_arm_slice_timer(cfqd);
3386		}
3387	}
3388
3389	if (!cfqd->rq_in_driver)
3390		cfq_schedule_dispatch(cfqd);
3391}
3392
3393static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3394{
3395	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3396		cfq_mark_cfqq_must_alloc_slice(cfqq);
3397		return ELV_MQUEUE_MUST;
3398	}
3399
3400	return ELV_MQUEUE_MAY;
3401}
3402
3403static int cfq_may_queue(struct request_queue *q, int rw)
3404{
3405	struct cfq_data *cfqd = q->elevator->elevator_data;
3406	struct task_struct *tsk = current;
3407	struct cfq_io_cq *cic;
3408	struct cfq_queue *cfqq;
3409
3410	/*
3411	 * don't force setup of a queue from here, as a call to may_queue
3412	 * does not necessarily imply that a request actually will be queued.
3413	 * so just lookup a possibly existing queue, or return 'may queue'
3414	 * if that fails
3415	 */
3416	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3417	if (!cic)
3418		return ELV_MQUEUE_MAY;
3419
3420	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3421	if (cfqq) {
3422		cfq_init_prio_data(cfqq, cic->icq.ioc);
3423
3424		return __cfq_may_queue(cfqq);
3425	}
3426
3427	return ELV_MQUEUE_MAY;
3428}
3429
3430/*
3431 * queue lock held here
3432 */
3433static void cfq_put_request(struct request *rq)
3434{
3435	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3436
3437	if (cfqq) {
3438		const int rw = rq_data_dir(rq);
3439
3440		BUG_ON(!cfqq->allocated[rw]);
3441		cfqq->allocated[rw]--;
3442
3443		/* Put down rq reference on cfqg */
3444		cfq_put_cfqg(RQ_CFQG(rq));
3445		rq->elv.priv[0] = NULL;
3446		rq->elv.priv[1] = NULL;
3447
3448		cfq_put_queue(cfqq);
3449	}
3450}
3451
3452static struct cfq_queue *
3453cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3454		struct cfq_queue *cfqq)
3455{
3456	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3457	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3458	cfq_mark_cfqq_coop(cfqq->new_cfqq);
3459	cfq_put_queue(cfqq);
3460	return cic_to_cfqq(cic, 1);
3461}
3462
3463/*
3464 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3465 * was the last process referring to said cfqq.
3466 */
3467static struct cfq_queue *
3468split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3469{
3470	if (cfqq_process_refs(cfqq) == 1) {
3471		cfqq->pid = current->pid;
3472		cfq_clear_cfqq_coop(cfqq);
3473		cfq_clear_cfqq_split_coop(cfqq);
3474		return cfqq;
3475	}
3476
3477	cic_set_cfqq(cic, NULL, 1);
3478
3479	cfq_put_cooperator(cfqq);
3480
3481	cfq_put_queue(cfqq);
3482	return NULL;
3483}
3484/*
3485 * Allocate cfq data structures associated with this request.
3486 */
3487static int
3488cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3489{
3490	struct cfq_data *cfqd = q->elevator->elevator_data;
3491	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3492	const int rw = rq_data_dir(rq);
3493	const bool is_sync = rq_is_sync(rq);
3494	struct cfq_queue *cfqq;
3495	unsigned int changed;
3496
3497	might_sleep_if(gfp_mask & __GFP_WAIT);
3498
3499	spin_lock_irq(q->queue_lock);
3500
3501	/* handle changed notifications */
3502	changed = icq_get_changed(&cic->icq);
3503	if (unlikely(changed & ICQ_IOPRIO_CHANGED))
3504		changed_ioprio(cic);
3505#ifdef CONFIG_CFQ_GROUP_IOSCHED
3506	if (unlikely(changed & ICQ_CGROUP_CHANGED))
3507		changed_cgroup(cic);
3508#endif
3509
3510new_queue:
3511	cfqq = cic_to_cfqq(cic, is_sync);
3512	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3513		cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
3514		cic_set_cfqq(cic, cfqq, is_sync);
3515	} else {
3516		/*
3517		 * If the queue was seeky for too long, break it apart.
3518		 */
3519		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3520			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3521			cfqq = split_cfqq(cic, cfqq);
3522			if (!cfqq)
3523				goto new_queue;
3524		}
3525
3526		/*
3527		 * Check to see if this queue is scheduled to merge with
3528		 * another, closely cooperating queue.  The merging of
3529		 * queues happens here as it must be done in process context.
3530		 * The reference on new_cfqq was taken in merge_cfqqs.
3531		 */
3532		if (cfqq->new_cfqq)
3533			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3534	}
3535
3536	cfqq->allocated[rw]++;
3537
3538	cfqq->ref++;
3539	rq->elv.priv[0] = cfqq;
3540	rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
3541	spin_unlock_irq(q->queue_lock);
3542	return 0;
3543}
3544
3545static void cfq_kick_queue(struct work_struct *work)
3546{
3547	struct cfq_data *cfqd =
3548		container_of(work, struct cfq_data, unplug_work);
3549	struct request_queue *q = cfqd->queue;
3550
3551	spin_lock_irq(q->queue_lock);
3552	__blk_run_queue(cfqd->queue);
3553	spin_unlock_irq(q->queue_lock);
3554}
3555
3556/*
3557 * Timer running if the active_queue is currently idling inside its time slice
3558 */
3559static void cfq_idle_slice_timer(unsigned long data)
3560{
3561	struct cfq_data *cfqd = (struct cfq_data *) data;
3562	struct cfq_queue *cfqq;
3563	unsigned long flags;
3564	int timed_out = 1;
3565
3566	cfq_log(cfqd, "idle timer fired");
3567
3568	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3569
3570	cfqq = cfqd->active_queue;
3571	if (cfqq) {
3572		timed_out = 0;
3573
3574		/*
3575		 * We saw a request before the queue expired, let it through
3576		 */
3577		if (cfq_cfqq_must_dispatch(cfqq))
3578			goto out_kick;
3579
3580		/*
3581		 * expired
3582		 */
3583		if (cfq_slice_used(cfqq))
3584			goto expire;
3585
3586		/*
3587		 * only expire and reinvoke request handler, if there are
3588		 * other queues with pending requests
3589		 */
3590		if (!cfqd->busy_queues)
3591			goto out_cont;
3592
3593		/*
3594		 * not expired and it has a request pending, let it dispatch
3595		 */
3596		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3597			goto out_kick;
3598
3599		/*
3600		 * Queue depth flag is reset only when the idle didn't succeed
3601		 */
3602		cfq_clear_cfqq_deep(cfqq);
3603	}
3604expire:
3605	cfq_slice_expired(cfqd, timed_out);
3606out_kick:
3607	cfq_schedule_dispatch(cfqd);
3608out_cont:
3609	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3610}
3611
3612static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3613{
3614	del_timer_sync(&cfqd->idle_slice_timer);
3615	cancel_work_sync(&cfqd->unplug_work);
3616}
3617
3618static void cfq_put_async_queues(struct cfq_data *cfqd)
3619{
3620	int i;
3621
3622	for (i = 0; i < IOPRIO_BE_NR; i++) {
3623		if (cfqd->async_cfqq[0][i])
3624			cfq_put_queue(cfqd->async_cfqq[0][i]);
3625		if (cfqd->async_cfqq[1][i])
3626			cfq_put_queue(cfqd->async_cfqq[1][i]);
3627	}
3628
3629	if (cfqd->async_idle_cfqq)
3630		cfq_put_queue(cfqd->async_idle_cfqq);
3631}
3632
3633static void cfq_exit_queue(struct elevator_queue *e)
3634{
3635	struct cfq_data *cfqd = e->elevator_data;
3636	struct request_queue *q = cfqd->queue;
3637	bool wait = false;
3638
3639	cfq_shutdown_timer_wq(cfqd);
3640
3641	spin_lock_irq(q->queue_lock);
3642
3643	if (cfqd->active_queue)
3644		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3645
3646	cfq_put_async_queues(cfqd);
3647	cfq_release_cfq_groups(cfqd);
3648
3649	/*
3650	 * If there are groups which we could not unlink from blkcg list,
3651	 * wait for a rcu period for them to be freed.
3652	 */
3653	if (cfqd->nr_blkcg_linked_grps)
3654		wait = true;
3655
3656	spin_unlock_irq(q->queue_lock);
3657
3658	cfq_shutdown_timer_wq(cfqd);
3659
3660	/*
3661	 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3662	 * Do this wait only if there are other unlinked groups out
3663	 * there. This can happen if cgroup deletion path claimed the
3664	 * responsibility of cleaning up a group before queue cleanup code
3665	 * get to the group.
3666	 *
3667	 * Do not call synchronize_rcu() unconditionally as there are drivers
3668	 * which create/delete request queue hundreds of times during scan/boot
3669	 * and synchronize_rcu() can take significant time and slow down boot.
3670	 */
3671	if (wait)
3672		synchronize_rcu();
3673
3674#ifdef CONFIG_CFQ_GROUP_IOSCHED
3675	/* Free up per cpu stats for root group */
3676	free_percpu(cfqd->root_group.blkg.stats_cpu);
3677#endif
3678	kfree(cfqd);
3679}
3680
3681static int cfq_init_queue(struct request_queue *q)
3682{
3683	struct cfq_data *cfqd;
3684	int i, j;
3685	struct cfq_group *cfqg;
3686	struct cfq_rb_root *st;
3687
3688	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3689	if (!cfqd)
3690		return -ENOMEM;
3691
3692	/* Init root service tree */
3693	cfqd->grp_service_tree = CFQ_RB_ROOT;
3694
3695	/* Init root group */
3696	cfqg = &cfqd->root_group;
3697	for_each_cfqg_st(cfqg, i, j, st)
3698		*st = CFQ_RB_ROOT;
3699	RB_CLEAR_NODE(&cfqg->rb_node);
3700
3701	/* Give preference to root group over other groups */
3702	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3703
3704#ifdef CONFIG_CFQ_GROUP_IOSCHED
3705	/*
3706	 * Set root group reference to 2. One reference will be dropped when
3707	 * all groups on cfqd->cfqg_list are being deleted during queue exit.
3708	 * Other reference will remain there as we don't want to delete this
3709	 * group as it is statically allocated and gets destroyed when
3710	 * throtl_data goes away.
3711	 */
3712	cfqg->ref = 2;
3713
3714	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
3715		kfree(cfqg);
3716		kfree(cfqd);
3717		return -ENOMEM;
3718	}
3719
3720	rcu_read_lock();
3721
3722	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3723				    cfqd->queue, 0);
3724	rcu_read_unlock();
3725	cfqd->nr_blkcg_linked_grps++;
3726
3727	/* Add group on cfqd->cfqg_list */
3728	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
3729#endif
3730	/*
3731	 * Not strictly needed (since RB_ROOT just clears the node and we
3732	 * zeroed cfqd on alloc), but better be safe in case someone decides
3733	 * to add magic to the rb code
3734	 */
3735	for (i = 0; i < CFQ_PRIO_LISTS; i++)
3736		cfqd->prio_trees[i] = RB_ROOT;
3737
3738	/*
3739	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3740	 * Grab a permanent reference to it, so that the normal code flow
3741	 * will not attempt to free it.
3742	 */
3743	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3744	cfqd->oom_cfqq.ref++;
3745	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3746
3747	cfqd->queue = q;
3748	q->elevator->elevator_data = cfqd;
3749
3750	init_timer(&cfqd->idle_slice_timer);
3751	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3752	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3753
3754	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3755
3756	cfqd->cfq_quantum = cfq_quantum;
3757	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3758	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3759	cfqd->cfq_back_max = cfq_back_max;
3760	cfqd->cfq_back_penalty = cfq_back_penalty;
3761	cfqd->cfq_slice[0] = cfq_slice_async;
3762	cfqd->cfq_slice[1] = cfq_slice_sync;
3763	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3764	cfqd->cfq_slice_idle = cfq_slice_idle;
3765	cfqd->cfq_group_idle = cfq_group_idle;
3766	cfqd->cfq_latency = 1;
3767	cfqd->hw_tag = -1;
3768	/*
3769	 * we optimistically start assuming sync ops weren't delayed in last
3770	 * second, in order to have larger depth for async operations.
3771	 */
3772	cfqd->last_delayed_sync = jiffies - HZ;
3773	return 0;
3774}
3775
3776/*
3777 * sysfs parts below -->
3778 */
3779static ssize_t
3780cfq_var_show(unsigned int var, char *page)
3781{
3782	return sprintf(page, "%d\n", var);
3783}
3784
3785static ssize_t
3786cfq_var_store(unsigned int *var, const char *page, size_t count)
3787{
3788	char *p = (char *) page;
3789
3790	*var = simple_strtoul(p, &p, 10);
3791	return count;
3792}
3793
3794#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
3795static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
3796{									\
3797	struct cfq_data *cfqd = e->elevator_data;			\
3798	unsigned int __data = __VAR;					\
3799	if (__CONV)							\
3800		__data = jiffies_to_msecs(__data);			\
3801	return cfq_var_show(__data, (page));				\
3802}
3803SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3804SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3805SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3806SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3807SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3808SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3809SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
3810SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3811SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3812SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3813SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3814#undef SHOW_FUNCTION
3815
3816#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
3817static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
3818{									\
3819	struct cfq_data *cfqd = e->elevator_data;			\
3820	unsigned int __data;						\
3821	int ret = cfq_var_store(&__data, (page), count);		\
3822	if (__data < (MIN))						\
3823		__data = (MIN);						\
3824	else if (__data > (MAX))					\
3825		__data = (MAX);						\
3826	if (__CONV)							\
3827		*(__PTR) = msecs_to_jiffies(__data);			\
3828	else								\
3829		*(__PTR) = __data;					\
3830	return ret;							\
3831}
3832STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
3833STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
3834		UINT_MAX, 1);
3835STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
3836		UINT_MAX, 1);
3837STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3838STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3839		UINT_MAX, 0);
3840STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
3841STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
3842STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3843STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3844STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
3845		UINT_MAX, 0);
3846STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3847#undef STORE_FUNCTION
3848
3849#define CFQ_ATTR(name) \
3850	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
3851
3852static struct elv_fs_entry cfq_attrs[] = {
3853	CFQ_ATTR(quantum),
3854	CFQ_ATTR(fifo_expire_sync),
3855	CFQ_ATTR(fifo_expire_async),
3856	CFQ_ATTR(back_seek_max),
3857	CFQ_ATTR(back_seek_penalty),
3858	CFQ_ATTR(slice_sync),
3859	CFQ_ATTR(slice_async),
3860	CFQ_ATTR(slice_async_rq),
3861	CFQ_ATTR(slice_idle),
3862	CFQ_ATTR(group_idle),
3863	CFQ_ATTR(low_latency),
3864	__ATTR_NULL
3865};
3866
3867static struct elevator_type iosched_cfq = {
3868	.ops = {
3869		.elevator_merge_fn = 		cfq_merge,
3870		.elevator_merged_fn =		cfq_merged_request,
3871		.elevator_merge_req_fn =	cfq_merged_requests,
3872		.elevator_allow_merge_fn =	cfq_allow_merge,
3873		.elevator_bio_merged_fn =	cfq_bio_merged,
3874		.elevator_dispatch_fn =		cfq_dispatch_requests,
3875		.elevator_add_req_fn =		cfq_insert_request,
3876		.elevator_activate_req_fn =	cfq_activate_request,
3877		.elevator_deactivate_req_fn =	cfq_deactivate_request,
3878		.elevator_completed_req_fn =	cfq_completed_request,
3879		.elevator_former_req_fn =	elv_rb_former_request,
3880		.elevator_latter_req_fn =	elv_rb_latter_request,
3881		.elevator_init_icq_fn =		cfq_init_icq,
3882		.elevator_exit_icq_fn =		cfq_exit_icq,
3883		.elevator_set_req_fn =		cfq_set_request,
3884		.elevator_put_req_fn =		cfq_put_request,
3885		.elevator_may_queue_fn =	cfq_may_queue,
3886		.elevator_init_fn =		cfq_init_queue,
3887		.elevator_exit_fn =		cfq_exit_queue,
3888	},
3889	.icq_size	=	sizeof(struct cfq_io_cq),
3890	.icq_align	=	__alignof__(struct cfq_io_cq),
3891	.elevator_attrs =	cfq_attrs,
3892	.elevator_name	=	"cfq",
3893	.elevator_owner =	THIS_MODULE,
3894};
3895
3896#ifdef CONFIG_CFQ_GROUP_IOSCHED
3897static struct blkio_policy_type blkio_policy_cfq = {
3898	.ops = {
3899		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
3900		.blkio_clear_queue_fn = cfq_clear_queue,
3901		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
3902	},
3903	.plid = BLKIO_POLICY_PROP,
3904};
3905#endif
3906
3907static int __init cfq_init(void)
3908{
3909	int ret;
3910
3911	/*
3912	 * could be 0 on HZ < 1000 setups
3913	 */
3914	if (!cfq_slice_async)
3915		cfq_slice_async = 1;
3916	if (!cfq_slice_idle)
3917		cfq_slice_idle = 1;
3918
3919#ifdef CONFIG_CFQ_GROUP_IOSCHED
3920	if (!cfq_group_idle)
3921		cfq_group_idle = 1;
3922#else
3923		cfq_group_idle = 0;
3924#endif
3925	cfq_pool = KMEM_CACHE(cfq_queue, 0);
3926	if (!cfq_pool)
3927		return -ENOMEM;
3928
3929	ret = elv_register(&iosched_cfq);
3930	if (ret) {
3931		kmem_cache_destroy(cfq_pool);
3932		return ret;
3933	}
3934
3935#ifdef CONFIG_CFQ_GROUP_IOSCHED
3936	blkio_policy_register(&blkio_policy_cfq);
3937#endif
3938	return 0;
3939}
3940
3941static void __exit cfq_exit(void)
3942{
3943#ifdef CONFIG_CFQ_GROUP_IOSCHED
3944	blkio_policy_unregister(&blkio_policy_cfq);
3945#endif
3946	elv_unregister(&iosched_cfq);
3947	kmem_cache_destroy(cfq_pool);
3948}
3949
3950module_init(cfq_init);
3951module_exit(cfq_exit);
3952
3953MODULE_AUTHOR("Jens Axboe");
3954MODULE_LICENSE("GPL");
3955MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
3956