cfq-iosched.c revision c4e7893ebc3a5c507b53f59b9de448db20849944
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/jiffies.h>
14#include <linux/rbtree.h>
15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h>
17#include "cfq.h"
18
19/*
20 * tunables
21 */
22/* max queue in one round of service */
23static const int cfq_quantum = 8;
24static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
25/* maximum backwards seek, in KiB */
26static const int cfq_back_max = 16 * 1024;
27/* penalty of a backwards seek */
28static const int cfq_back_penalty = 2;
29static const int cfq_slice_sync = HZ / 10;
30static int cfq_slice_async = HZ / 25;
31static const int cfq_slice_async_rq = 2;
32static int cfq_slice_idle = HZ / 125;
33static int cfq_group_idle = HZ / 125;
34static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
35static const int cfq_hist_divisor = 4;
36
37/*
38 * offset from end of service tree
39 */
40#define CFQ_IDLE_DELAY		(HZ / 5)
41
42/*
43 * below this threshold, we consider thinktime immediate
44 */
45#define CFQ_MIN_TT		(2)
46
47#define CFQ_SLICE_SCALE		(5)
48#define CFQ_HW_QUEUE_MIN	(5)
49#define CFQ_SERVICE_SHIFT       12
50
51#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
52#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
53#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
54#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
55
56#define RQ_CIC(rq)		\
57	((struct cfq_io_context *) (rq)->elevator_private)
58#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
59#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private3)
60
61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool;
63
64static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65static struct completion *ioc_gone;
66static DEFINE_SPINLOCK(ioc_gone_lock);
67
68static DEFINE_SPINLOCK(cic_index_lock);
69static DEFINE_IDA(cic_index_ida);
70
71#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
72#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
73#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
74
75#define sample_valid(samples)	((samples) > 80)
76#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
77
78/*
79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree
81 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
82 * move this into the elevator for the rq sorting as well.
83 */
84struct cfq_rb_root {
85	struct rb_root rb;
86	struct rb_node *left;
87	unsigned count;
88	unsigned total_weight;
89	u64 min_vdisktime;
90	struct rb_node *active;
91};
92#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
93			.count = 0, .min_vdisktime = 0, }
94
95/*
96 * Per process-grouping structure
97 */
98struct cfq_queue {
99	/* reference count */
100	atomic_t ref;
101	/* various state flags, see below */
102	unsigned int flags;
103	/* parent cfq_data */
104	struct cfq_data *cfqd;
105	/* service_tree member */
106	struct rb_node rb_node;
107	/* service_tree key */
108	unsigned long rb_key;
109	/* prio tree member */
110	struct rb_node p_node;
111	/* prio tree root we belong to, if any */
112	struct rb_root *p_root;
113	/* sorted list of pending requests */
114	struct rb_root sort_list;
115	/* if fifo isn't expired, next request to serve */
116	struct request *next_rq;
117	/* requests queued in sort_list */
118	int queued[2];
119	/* currently allocated requests */
120	int allocated[2];
121	/* fifo list of requests in sort_list */
122	struct list_head fifo;
123
124	/* time when queue got scheduled in to dispatch first request. */
125	unsigned long dispatch_start;
126	unsigned int allocated_slice;
127	unsigned int slice_dispatch;
128	/* time when first request from queue completed and slice started. */
129	unsigned long slice_start;
130	unsigned long slice_end;
131	long slice_resid;
132
133	/* pending metadata requests */
134	int meta_pending;
135	/* number of requests that are on the dispatch list or inside driver */
136	int dispatched;
137
138	/* io prio of this group */
139	unsigned short ioprio, org_ioprio;
140	unsigned short ioprio_class, org_ioprio_class;
141
142	pid_t pid;
143
144	u32 seek_history;
145	sector_t last_request_pos;
146
147	struct cfq_rb_root *service_tree;
148	struct cfq_queue *new_cfqq;
149	struct cfq_group *cfqg;
150	struct cfq_group *orig_cfqg;
151	/* Number of sectors dispatched from queue in single dispatch round */
152	unsigned long nr_sectors;
153};
154
155/*
156 * First index in the service_trees.
157 * IDLE is handled separately, so it has negative index
158 */
159enum wl_prio_t {
160	BE_WORKLOAD = 0,
161	RT_WORKLOAD = 1,
162	IDLE_WORKLOAD = 2,
163};
164
165/*
166 * Second index in the service_trees.
167 */
168enum wl_type_t {
169	ASYNC_WORKLOAD = 0,
170	SYNC_NOIDLE_WORKLOAD = 1,
171	SYNC_WORKLOAD = 2
172};
173
174/* This is per cgroup per device grouping structure */
175struct cfq_group {
176	/* group service_tree member */
177	struct rb_node rb_node;
178
179	/* group service_tree key */
180	u64 vdisktime;
181	unsigned int weight;
182	bool on_st;
183
184	/* number of cfqq currently on this group */
185	int nr_cfqq;
186
187	/* Per group busy queus average. Useful for workload slice calc. */
188	unsigned int busy_queues_avg[2];
189	/*
190	 * rr lists of queues with requests, onle rr for each priority class.
191	 * Counts are embedded in the cfq_rb_root
192	 */
193	struct cfq_rb_root service_trees[2][3];
194	struct cfq_rb_root service_tree_idle;
195
196	unsigned long saved_workload_slice;
197	enum wl_type_t saved_workload;
198	enum wl_prio_t saved_serving_prio;
199	struct blkio_group blkg;
200#ifdef CONFIG_CFQ_GROUP_IOSCHED
201	struct hlist_node cfqd_node;
202	atomic_t ref;
203#endif
204	/* number of requests that are on the dispatch list or inside driver */
205	int dispatched;
206};
207
208/*
209 * Per block device queue structure
210 */
211struct cfq_data {
212	struct request_queue *queue;
213	/* Root service tree for cfq_groups */
214	struct cfq_rb_root grp_service_tree;
215	struct cfq_group root_group;
216
217	/*
218	 * The priority currently being served
219	 */
220	enum wl_prio_t serving_prio;
221	enum wl_type_t serving_type;
222	unsigned long workload_expires;
223	struct cfq_group *serving_group;
224	bool noidle_tree_requires_idle;
225
226	/*
227	 * Each priority tree is sorted by next_request position.  These
228	 * trees are used when determining if two or more queues are
229	 * interleaving requests (see cfq_close_cooperator).
230	 */
231	struct rb_root prio_trees[CFQ_PRIO_LISTS];
232
233	unsigned int busy_queues;
234
235	int rq_in_driver;
236	int rq_in_flight[2];
237
238	/*
239	 * queue-depth detection
240	 */
241	int rq_queued;
242	int hw_tag;
243	/*
244	 * hw_tag can be
245	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
246	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
247	 *  0 => no NCQ
248	 */
249	int hw_tag_est_depth;
250	unsigned int hw_tag_samples;
251
252	/*
253	 * idle window management
254	 */
255	struct timer_list idle_slice_timer;
256	struct work_struct unplug_work;
257
258	struct cfq_queue *active_queue;
259	struct cfq_io_context *active_cic;
260
261	/*
262	 * async queue for each priority case
263	 */
264	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
265	struct cfq_queue *async_idle_cfqq;
266
267	sector_t last_position;
268
269	/*
270	 * tunables, see top of file
271	 */
272	unsigned int cfq_quantum;
273	unsigned int cfq_fifo_expire[2];
274	unsigned int cfq_back_penalty;
275	unsigned int cfq_back_max;
276	unsigned int cfq_slice[2];
277	unsigned int cfq_slice_async_rq;
278	unsigned int cfq_slice_idle;
279	unsigned int cfq_group_idle;
280	unsigned int cfq_latency;
281	unsigned int cfq_group_isolation;
282
283	unsigned int cic_index;
284	struct list_head cic_list;
285
286	/*
287	 * Fallback dummy cfqq for extreme OOM conditions
288	 */
289	struct cfq_queue oom_cfqq;
290
291	unsigned long last_delayed_sync;
292
293	/* List of cfq groups being managed on this device*/
294	struct hlist_head cfqg_list;
295	struct rcu_head rcu;
296};
297
298static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
299
300static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
301					    enum wl_prio_t prio,
302					    enum wl_type_t type)
303{
304	if (!cfqg)
305		return NULL;
306
307	if (prio == IDLE_WORKLOAD)
308		return &cfqg->service_tree_idle;
309
310	return &cfqg->service_trees[prio][type];
311}
312
313enum cfqq_state_flags {
314	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
315	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
316	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
317	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
318	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
319	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
320	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
321	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
322	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
323	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
324	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
325	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
326	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
327};
328
329#define CFQ_CFQQ_FNS(name)						\
330static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
331{									\
332	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
333}									\
334static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
335{									\
336	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
337}									\
338static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
339{									\
340	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
341}
342
343CFQ_CFQQ_FNS(on_rr);
344CFQ_CFQQ_FNS(wait_request);
345CFQ_CFQQ_FNS(must_dispatch);
346CFQ_CFQQ_FNS(must_alloc_slice);
347CFQ_CFQQ_FNS(fifo_expire);
348CFQ_CFQQ_FNS(idle_window);
349CFQ_CFQQ_FNS(prio_changed);
350CFQ_CFQQ_FNS(slice_new);
351CFQ_CFQQ_FNS(sync);
352CFQ_CFQQ_FNS(coop);
353CFQ_CFQQ_FNS(split_coop);
354CFQ_CFQQ_FNS(deep);
355CFQ_CFQQ_FNS(wait_busy);
356#undef CFQ_CFQQ_FNS
357
358#ifdef CONFIG_CFQ_GROUP_IOSCHED
359#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
360	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
361			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
362			blkg_path(&(cfqq)->cfqg->blkg), ##args);
363
364#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
365	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
366				blkg_path(&(cfqg)->blkg), ##args);      \
367
368#else
369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
370	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
371#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0);
372#endif
373#define cfq_log(cfqd, fmt, args...)	\
374	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
375
376/* Traverses through cfq group service trees */
377#define for_each_cfqg_st(cfqg, i, j, st) \
378	for (i = 0; i <= IDLE_WORKLOAD; i++) \
379		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
380			: &cfqg->service_tree_idle; \
381			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
382			(i == IDLE_WORKLOAD && j == 0); \
383			j++, st = i < IDLE_WORKLOAD ? \
384			&cfqg->service_trees[i][j]: NULL) \
385
386
387static inline bool iops_mode(struct cfq_data *cfqd)
388{
389	/*
390	 * If we are not idling on queues and it is a NCQ drive, parallel
391	 * execution of requests is on and measuring time is not possible
392	 * in most of the cases until and unless we drive shallower queue
393	 * depths and that becomes a performance bottleneck. In such cases
394	 * switch to start providing fairness in terms of number of IOs.
395	 */
396	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
397		return true;
398	else
399		return false;
400}
401
402static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
403{
404	if (cfq_class_idle(cfqq))
405		return IDLE_WORKLOAD;
406	if (cfq_class_rt(cfqq))
407		return RT_WORKLOAD;
408	return BE_WORKLOAD;
409}
410
411
412static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
413{
414	if (!cfq_cfqq_sync(cfqq))
415		return ASYNC_WORKLOAD;
416	if (!cfq_cfqq_idle_window(cfqq))
417		return SYNC_NOIDLE_WORKLOAD;
418	return SYNC_WORKLOAD;
419}
420
421static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
422					struct cfq_data *cfqd,
423					struct cfq_group *cfqg)
424{
425	if (wl == IDLE_WORKLOAD)
426		return cfqg->service_tree_idle.count;
427
428	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
429		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
430		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
431}
432
433static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
434					struct cfq_group *cfqg)
435{
436	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
437		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
438}
439
440static void cfq_dispatch_insert(struct request_queue *, struct request *);
441static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
442				       struct io_context *, gfp_t);
443static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
444						struct io_context *);
445
446static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
447					    bool is_sync)
448{
449	return cic->cfqq[is_sync];
450}
451
452static inline void cic_set_cfqq(struct cfq_io_context *cic,
453				struct cfq_queue *cfqq, bool is_sync)
454{
455	cic->cfqq[is_sync] = cfqq;
456}
457
458#define CIC_DEAD_KEY	1ul
459#define CIC_DEAD_INDEX_SHIFT	1
460
461static inline void *cfqd_dead_key(struct cfq_data *cfqd)
462{
463	return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
464}
465
466static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
467{
468	struct cfq_data *cfqd = cic->key;
469
470	if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
471		return NULL;
472
473	return cfqd;
474}
475
476/*
477 * We regard a request as SYNC, if it's either a read or has the SYNC bit
478 * set (in which case it could also be direct WRITE).
479 */
480static inline bool cfq_bio_sync(struct bio *bio)
481{
482	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
483}
484
485/*
486 * scheduler run of queue, if there are requests pending and no one in the
487 * driver that will restart queueing
488 */
489static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
490{
491	if (cfqd->busy_queues) {
492		cfq_log(cfqd, "schedule dispatch");
493		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
494	}
495}
496
497static int cfq_queue_empty(struct request_queue *q)
498{
499	struct cfq_data *cfqd = q->elevator->elevator_data;
500
501	return !cfqd->rq_queued;
502}
503
504/*
505 * Scale schedule slice based on io priority. Use the sync time slice only
506 * if a queue is marked sync and has sync io queued. A sync queue with async
507 * io only, should not get full sync slice length.
508 */
509static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
510				 unsigned short prio)
511{
512	const int base_slice = cfqd->cfq_slice[sync];
513
514	WARN_ON(prio >= IOPRIO_BE_NR);
515
516	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
517}
518
519static inline int
520cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
521{
522	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
523}
524
525static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
526{
527	u64 d = delta << CFQ_SERVICE_SHIFT;
528
529	d = d * BLKIO_WEIGHT_DEFAULT;
530	do_div(d, cfqg->weight);
531	return d;
532}
533
534static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
535{
536	s64 delta = (s64)(vdisktime - min_vdisktime);
537	if (delta > 0)
538		min_vdisktime = vdisktime;
539
540	return min_vdisktime;
541}
542
543static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
544{
545	s64 delta = (s64)(vdisktime - min_vdisktime);
546	if (delta < 0)
547		min_vdisktime = vdisktime;
548
549	return min_vdisktime;
550}
551
552static void update_min_vdisktime(struct cfq_rb_root *st)
553{
554	u64 vdisktime = st->min_vdisktime;
555	struct cfq_group *cfqg;
556
557	if (st->active) {
558		cfqg = rb_entry_cfqg(st->active);
559		vdisktime = cfqg->vdisktime;
560	}
561
562	if (st->left) {
563		cfqg = rb_entry_cfqg(st->left);
564		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
565	}
566
567	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
568}
569
570/*
571 * get averaged number of queues of RT/BE priority.
572 * average is updated, with a formula that gives more weight to higher numbers,
573 * to quickly follows sudden increases and decrease slowly
574 */
575
576static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
577					struct cfq_group *cfqg, bool rt)
578{
579	unsigned min_q, max_q;
580	unsigned mult  = cfq_hist_divisor - 1;
581	unsigned round = cfq_hist_divisor / 2;
582	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
583
584	min_q = min(cfqg->busy_queues_avg[rt], busy);
585	max_q = max(cfqg->busy_queues_avg[rt], busy);
586	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
587		cfq_hist_divisor;
588	return cfqg->busy_queues_avg[rt];
589}
590
591static inline unsigned
592cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
593{
594	struct cfq_rb_root *st = &cfqd->grp_service_tree;
595
596	return cfq_target_latency * cfqg->weight / st->total_weight;
597}
598
599static inline void
600cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
601{
602	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
603	if (cfqd->cfq_latency) {
604		/*
605		 * interested queues (we consider only the ones with the same
606		 * priority class in the cfq group)
607		 */
608		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
609						cfq_class_rt(cfqq));
610		unsigned sync_slice = cfqd->cfq_slice[1];
611		unsigned expect_latency = sync_slice * iq;
612		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
613
614		if (expect_latency > group_slice) {
615			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
616			/* scale low_slice according to IO priority
617			 * and sync vs async */
618			unsigned low_slice =
619				min(slice, base_low_slice * slice / sync_slice);
620			/* the adapted slice value is scaled to fit all iqs
621			 * into the target latency */
622			slice = max(slice * group_slice / expect_latency,
623				    low_slice);
624		}
625	}
626	cfqq->slice_start = jiffies;
627	cfqq->slice_end = jiffies + slice;
628	cfqq->allocated_slice = slice;
629	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
630}
631
632/*
633 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
634 * isn't valid until the first request from the dispatch is activated
635 * and the slice time set.
636 */
637static inline bool cfq_slice_used(struct cfq_queue *cfqq)
638{
639	if (cfq_cfqq_slice_new(cfqq))
640		return 0;
641	if (time_before(jiffies, cfqq->slice_end))
642		return 0;
643
644	return 1;
645}
646
647/*
648 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
649 * We choose the request that is closest to the head right now. Distance
650 * behind the head is penalized and only allowed to a certain extent.
651 */
652static struct request *
653cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
654{
655	sector_t s1, s2, d1 = 0, d2 = 0;
656	unsigned long back_max;
657#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
658#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
659	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
660
661	if (rq1 == NULL || rq1 == rq2)
662		return rq2;
663	if (rq2 == NULL)
664		return rq1;
665
666	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
667		return rq1;
668	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
669		return rq2;
670	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
671		return rq1;
672	else if ((rq2->cmd_flags & REQ_META) &&
673		 !(rq1->cmd_flags & REQ_META))
674		return rq2;
675
676	s1 = blk_rq_pos(rq1);
677	s2 = blk_rq_pos(rq2);
678
679	/*
680	 * by definition, 1KiB is 2 sectors
681	 */
682	back_max = cfqd->cfq_back_max * 2;
683
684	/*
685	 * Strict one way elevator _except_ in the case where we allow
686	 * short backward seeks which are biased as twice the cost of a
687	 * similar forward seek.
688	 */
689	if (s1 >= last)
690		d1 = s1 - last;
691	else if (s1 + back_max >= last)
692		d1 = (last - s1) * cfqd->cfq_back_penalty;
693	else
694		wrap |= CFQ_RQ1_WRAP;
695
696	if (s2 >= last)
697		d2 = s2 - last;
698	else if (s2 + back_max >= last)
699		d2 = (last - s2) * cfqd->cfq_back_penalty;
700	else
701		wrap |= CFQ_RQ2_WRAP;
702
703	/* Found required data */
704
705	/*
706	 * By doing switch() on the bit mask "wrap" we avoid having to
707	 * check two variables for all permutations: --> faster!
708	 */
709	switch (wrap) {
710	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
711		if (d1 < d2)
712			return rq1;
713		else if (d2 < d1)
714			return rq2;
715		else {
716			if (s1 >= s2)
717				return rq1;
718			else
719				return rq2;
720		}
721
722	case CFQ_RQ2_WRAP:
723		return rq1;
724	case CFQ_RQ1_WRAP:
725		return rq2;
726	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
727	default:
728		/*
729		 * Since both rqs are wrapped,
730		 * start with the one that's further behind head
731		 * (--> only *one* back seek required),
732		 * since back seek takes more time than forward.
733		 */
734		if (s1 <= s2)
735			return rq1;
736		else
737			return rq2;
738	}
739}
740
741/*
742 * The below is leftmost cache rbtree addon
743 */
744static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
745{
746	/* Service tree is empty */
747	if (!root->count)
748		return NULL;
749
750	if (!root->left)
751		root->left = rb_first(&root->rb);
752
753	if (root->left)
754		return rb_entry(root->left, struct cfq_queue, rb_node);
755
756	return NULL;
757}
758
759static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
760{
761	if (!root->left)
762		root->left = rb_first(&root->rb);
763
764	if (root->left)
765		return rb_entry_cfqg(root->left);
766
767	return NULL;
768}
769
770static void rb_erase_init(struct rb_node *n, struct rb_root *root)
771{
772	rb_erase(n, root);
773	RB_CLEAR_NODE(n);
774}
775
776static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
777{
778	if (root->left == n)
779		root->left = NULL;
780	rb_erase_init(n, &root->rb);
781	--root->count;
782}
783
784/*
785 * would be nice to take fifo expire time into account as well
786 */
787static struct request *
788cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
789		  struct request *last)
790{
791	struct rb_node *rbnext = rb_next(&last->rb_node);
792	struct rb_node *rbprev = rb_prev(&last->rb_node);
793	struct request *next = NULL, *prev = NULL;
794
795	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
796
797	if (rbprev)
798		prev = rb_entry_rq(rbprev);
799
800	if (rbnext)
801		next = rb_entry_rq(rbnext);
802	else {
803		rbnext = rb_first(&cfqq->sort_list);
804		if (rbnext && rbnext != &last->rb_node)
805			next = rb_entry_rq(rbnext);
806	}
807
808	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
809}
810
811static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
812				      struct cfq_queue *cfqq)
813{
814	/*
815	 * just an approximation, should be ok.
816	 */
817	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
818		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
819}
820
821static inline s64
822cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
823{
824	return cfqg->vdisktime - st->min_vdisktime;
825}
826
827static void
828__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
829{
830	struct rb_node **node = &st->rb.rb_node;
831	struct rb_node *parent = NULL;
832	struct cfq_group *__cfqg;
833	s64 key = cfqg_key(st, cfqg);
834	int left = 1;
835
836	while (*node != NULL) {
837		parent = *node;
838		__cfqg = rb_entry_cfqg(parent);
839
840		if (key < cfqg_key(st, __cfqg))
841			node = &parent->rb_left;
842		else {
843			node = &parent->rb_right;
844			left = 0;
845		}
846	}
847
848	if (left)
849		st->left = &cfqg->rb_node;
850
851	rb_link_node(&cfqg->rb_node, parent, node);
852	rb_insert_color(&cfqg->rb_node, &st->rb);
853}
854
855static void
856cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
857{
858	struct cfq_rb_root *st = &cfqd->grp_service_tree;
859	struct cfq_group *__cfqg;
860	struct rb_node *n;
861
862	cfqg->nr_cfqq++;
863	if (cfqg->on_st)
864		return;
865
866	/*
867	 * Currently put the group at the end. Later implement something
868	 * so that groups get lesser vtime based on their weights, so that
869	 * if group does not loose all if it was not continously backlogged.
870	 */
871	n = rb_last(&st->rb);
872	if (n) {
873		__cfqg = rb_entry_cfqg(n);
874		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
875	} else
876		cfqg->vdisktime = st->min_vdisktime;
877
878	__cfq_group_service_tree_add(st, cfqg);
879	cfqg->on_st = true;
880	st->total_weight += cfqg->weight;
881}
882
883static void
884cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
885{
886	struct cfq_rb_root *st = &cfqd->grp_service_tree;
887
888	if (st->active == &cfqg->rb_node)
889		st->active = NULL;
890
891	BUG_ON(cfqg->nr_cfqq < 1);
892	cfqg->nr_cfqq--;
893
894	/* If there are other cfq queues under this group, don't delete it */
895	if (cfqg->nr_cfqq)
896		return;
897
898	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
899	cfqg->on_st = false;
900	st->total_weight -= cfqg->weight;
901	if (!RB_EMPTY_NODE(&cfqg->rb_node))
902		cfq_rb_erase(&cfqg->rb_node, st);
903	cfqg->saved_workload_slice = 0;
904	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
905}
906
907static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
908{
909	unsigned int slice_used;
910
911	/*
912	 * Queue got expired before even a single request completed or
913	 * got expired immediately after first request completion.
914	 */
915	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
916		/*
917		 * Also charge the seek time incurred to the group, otherwise
918		 * if there are mutiple queues in the group, each can dispatch
919		 * a single request on seeky media and cause lots of seek time
920		 * and group will never know it.
921		 */
922		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
923					1);
924	} else {
925		slice_used = jiffies - cfqq->slice_start;
926		if (slice_used > cfqq->allocated_slice)
927			slice_used = cfqq->allocated_slice;
928	}
929
930	return slice_used;
931}
932
933static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
934				struct cfq_queue *cfqq)
935{
936	struct cfq_rb_root *st = &cfqd->grp_service_tree;
937	unsigned int used_sl, charge;
938	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
939			- cfqg->service_tree_idle.count;
940
941	BUG_ON(nr_sync < 0);
942	used_sl = charge = cfq_cfqq_slice_usage(cfqq);
943
944	if (iops_mode(cfqd))
945		charge = cfqq->slice_dispatch;
946	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
947		charge = cfqq->allocated_slice;
948
949	/* Can't update vdisktime while group is on service tree */
950	cfq_rb_erase(&cfqg->rb_node, st);
951	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
952	__cfq_group_service_tree_add(st, cfqg);
953
954	/* This group is being expired. Save the context */
955	if (time_after(cfqd->workload_expires, jiffies)) {
956		cfqg->saved_workload_slice = cfqd->workload_expires
957						- jiffies;
958		cfqg->saved_workload = cfqd->serving_type;
959		cfqg->saved_serving_prio = cfqd->serving_prio;
960	} else
961		cfqg->saved_workload_slice = 0;
962
963	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
964					st->min_vdisktime);
965	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
966			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
967			iops_mode(cfqd), cfqq->nr_sectors);
968	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
969	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
970}
971
972#ifdef CONFIG_CFQ_GROUP_IOSCHED
973static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
974{
975	if (blkg)
976		return container_of(blkg, struct cfq_group, blkg);
977	return NULL;
978}
979
980void
981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
982{
983	cfqg_of_blkg(blkg)->weight = weight;
984}
985
986static struct cfq_group *
987cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
988{
989	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
990	struct cfq_group *cfqg = NULL;
991	void *key = cfqd;
992	int i, j;
993	struct cfq_rb_root *st;
994	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
995	unsigned int major, minor;
996
997	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
998	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
999		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1000		cfqg->blkg.dev = MKDEV(major, minor);
1001		goto done;
1002	}
1003	if (cfqg || !create)
1004		goto done;
1005
1006	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1007	if (!cfqg)
1008		goto done;
1009
1010	for_each_cfqg_st(cfqg, i, j, st)
1011		*st = CFQ_RB_ROOT;
1012	RB_CLEAR_NODE(&cfqg->rb_node);
1013
1014	/*
1015	 * Take the initial reference that will be released on destroy
1016	 * This can be thought of a joint reference by cgroup and
1017	 * elevator which will be dropped by either elevator exit
1018	 * or cgroup deletion path depending on who is exiting first.
1019	 */
1020	atomic_set(&cfqg->ref, 1);
1021
1022	/* Add group onto cgroup list */
1023	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1024	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1025					MKDEV(major, minor));
1026	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1027
1028	/* Add group on cfqd list */
1029	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1030
1031done:
1032	return cfqg;
1033}
1034
1035/*
1036 * Search for the cfq group current task belongs to. If create = 1, then also
1037 * create the cfq group if it does not exist. request_queue lock must be held.
1038 */
1039static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1040{
1041	struct cgroup *cgroup;
1042	struct cfq_group *cfqg = NULL;
1043
1044	rcu_read_lock();
1045	cgroup = task_cgroup(current, blkio_subsys_id);
1046	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1047	if (!cfqg && create)
1048		cfqg = &cfqd->root_group;
1049	rcu_read_unlock();
1050	return cfqg;
1051}
1052
1053static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1054{
1055	atomic_inc(&cfqg->ref);
1056	return cfqg;
1057}
1058
1059static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1060{
1061	/* Currently, all async queues are mapped to root group */
1062	if (!cfq_cfqq_sync(cfqq))
1063		cfqg = &cfqq->cfqd->root_group;
1064
1065	cfqq->cfqg = cfqg;
1066	/* cfqq reference on cfqg */
1067	atomic_inc(&cfqq->cfqg->ref);
1068}
1069
1070static void cfq_put_cfqg(struct cfq_group *cfqg)
1071{
1072	struct cfq_rb_root *st;
1073	int i, j;
1074
1075	BUG_ON(atomic_read(&cfqg->ref) <= 0);
1076	if (!atomic_dec_and_test(&cfqg->ref))
1077		return;
1078	for_each_cfqg_st(cfqg, i, j, st)
1079		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1080	kfree(cfqg);
1081}
1082
1083static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1084{
1085	/* Something wrong if we are trying to remove same group twice */
1086	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1087
1088	hlist_del_init(&cfqg->cfqd_node);
1089
1090	/*
1091	 * Put the reference taken at the time of creation so that when all
1092	 * queues are gone, group can be destroyed.
1093	 */
1094	cfq_put_cfqg(cfqg);
1095}
1096
1097static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1098{
1099	struct hlist_node *pos, *n;
1100	struct cfq_group *cfqg;
1101
1102	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1103		/*
1104		 * If cgroup removal path got to blk_group first and removed
1105		 * it from cgroup list, then it will take care of destroying
1106		 * cfqg also.
1107		 */
1108		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1109			cfq_destroy_cfqg(cfqd, cfqg);
1110	}
1111}
1112
1113/*
1114 * Blk cgroup controller notification saying that blkio_group object is being
1115 * delinked as associated cgroup object is going away. That also means that
1116 * no new IO will come in this group. So get rid of this group as soon as
1117 * any pending IO in the group is finished.
1118 *
1119 * This function is called under rcu_read_lock(). key is the rcu protected
1120 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1121 * read lock.
1122 *
1123 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1124 * it should not be NULL as even if elevator was exiting, cgroup deltion
1125 * path got to it first.
1126 */
1127void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1128{
1129	unsigned long  flags;
1130	struct cfq_data *cfqd = key;
1131
1132	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1133	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1134	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1135}
1136
1137#else /* GROUP_IOSCHED */
1138static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1139{
1140	return &cfqd->root_group;
1141}
1142
1143static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1144{
1145	return cfqg;
1146}
1147
1148static inline void
1149cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1150	cfqq->cfqg = cfqg;
1151}
1152
1153static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1154static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1155
1156#endif /* GROUP_IOSCHED */
1157
1158/*
1159 * The cfqd->service_trees holds all pending cfq_queue's that have
1160 * requests waiting to be processed. It is sorted in the order that
1161 * we will service the queues.
1162 */
1163static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1164				 bool add_front)
1165{
1166	struct rb_node **p, *parent;
1167	struct cfq_queue *__cfqq;
1168	unsigned long rb_key;
1169	struct cfq_rb_root *service_tree;
1170	int left;
1171	int new_cfqq = 1;
1172	int group_changed = 0;
1173
1174#ifdef CONFIG_CFQ_GROUP_IOSCHED
1175	if (!cfqd->cfq_group_isolation
1176	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1177	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1178		/* Move this cfq to root group */
1179		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1180		if (!RB_EMPTY_NODE(&cfqq->rb_node))
1181			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1182		cfqq->orig_cfqg = cfqq->cfqg;
1183		cfqq->cfqg = &cfqd->root_group;
1184		atomic_inc(&cfqd->root_group.ref);
1185		group_changed = 1;
1186	} else if (!cfqd->cfq_group_isolation
1187		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1188		/* cfqq is sequential now needs to go to its original group */
1189		BUG_ON(cfqq->cfqg != &cfqd->root_group);
1190		if (!RB_EMPTY_NODE(&cfqq->rb_node))
1191			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1192		cfq_put_cfqg(cfqq->cfqg);
1193		cfqq->cfqg = cfqq->orig_cfqg;
1194		cfqq->orig_cfqg = NULL;
1195		group_changed = 1;
1196		cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1197	}
1198#endif
1199
1200	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1201						cfqq_type(cfqq));
1202	if (cfq_class_idle(cfqq)) {
1203		rb_key = CFQ_IDLE_DELAY;
1204		parent = rb_last(&service_tree->rb);
1205		if (parent && parent != &cfqq->rb_node) {
1206			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1207			rb_key += __cfqq->rb_key;
1208		} else
1209			rb_key += jiffies;
1210	} else if (!add_front) {
1211		/*
1212		 * Get our rb key offset. Subtract any residual slice
1213		 * value carried from last service. A negative resid
1214		 * count indicates slice overrun, and this should position
1215		 * the next service time further away in the tree.
1216		 */
1217		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1218		rb_key -= cfqq->slice_resid;
1219		cfqq->slice_resid = 0;
1220	} else {
1221		rb_key = -HZ;
1222		__cfqq = cfq_rb_first(service_tree);
1223		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1224	}
1225
1226	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1227		new_cfqq = 0;
1228		/*
1229		 * same position, nothing more to do
1230		 */
1231		if (rb_key == cfqq->rb_key &&
1232		    cfqq->service_tree == service_tree)
1233			return;
1234
1235		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1236		cfqq->service_tree = NULL;
1237	}
1238
1239	left = 1;
1240	parent = NULL;
1241	cfqq->service_tree = service_tree;
1242	p = &service_tree->rb.rb_node;
1243	while (*p) {
1244		struct rb_node **n;
1245
1246		parent = *p;
1247		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1248
1249		/*
1250		 * sort by key, that represents service time.
1251		 */
1252		if (time_before(rb_key, __cfqq->rb_key))
1253			n = &(*p)->rb_left;
1254		else {
1255			n = &(*p)->rb_right;
1256			left = 0;
1257		}
1258
1259		p = n;
1260	}
1261
1262	if (left)
1263		service_tree->left = &cfqq->rb_node;
1264
1265	cfqq->rb_key = rb_key;
1266	rb_link_node(&cfqq->rb_node, parent, p);
1267	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1268	service_tree->count++;
1269	if ((add_front || !new_cfqq) && !group_changed)
1270		return;
1271	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
1272}
1273
1274static struct cfq_queue *
1275cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1276		     sector_t sector, struct rb_node **ret_parent,
1277		     struct rb_node ***rb_link)
1278{
1279	struct rb_node **p, *parent;
1280	struct cfq_queue *cfqq = NULL;
1281
1282	parent = NULL;
1283	p = &root->rb_node;
1284	while (*p) {
1285		struct rb_node **n;
1286
1287		parent = *p;
1288		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1289
1290		/*
1291		 * Sort strictly based on sector.  Smallest to the left,
1292		 * largest to the right.
1293		 */
1294		if (sector > blk_rq_pos(cfqq->next_rq))
1295			n = &(*p)->rb_right;
1296		else if (sector < blk_rq_pos(cfqq->next_rq))
1297			n = &(*p)->rb_left;
1298		else
1299			break;
1300		p = n;
1301		cfqq = NULL;
1302	}
1303
1304	*ret_parent = parent;
1305	if (rb_link)
1306		*rb_link = p;
1307	return cfqq;
1308}
1309
1310static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1311{
1312	struct rb_node **p, *parent;
1313	struct cfq_queue *__cfqq;
1314
1315	if (cfqq->p_root) {
1316		rb_erase(&cfqq->p_node, cfqq->p_root);
1317		cfqq->p_root = NULL;
1318	}
1319
1320	if (cfq_class_idle(cfqq))
1321		return;
1322	if (!cfqq->next_rq)
1323		return;
1324
1325	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1326	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1327				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1328	if (!__cfqq) {
1329		rb_link_node(&cfqq->p_node, parent, p);
1330		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1331	} else
1332		cfqq->p_root = NULL;
1333}
1334
1335/*
1336 * Update cfqq's position in the service tree.
1337 */
1338static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1339{
1340	/*
1341	 * Resorting requires the cfqq to be on the RR list already.
1342	 */
1343	if (cfq_cfqq_on_rr(cfqq)) {
1344		cfq_service_tree_add(cfqd, cfqq, 0);
1345		cfq_prio_tree_add(cfqd, cfqq);
1346	}
1347}
1348
1349/*
1350 * add to busy list of queues for service, trying to be fair in ordering
1351 * the pending list according to last request service
1352 */
1353static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1354{
1355	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1356	BUG_ON(cfq_cfqq_on_rr(cfqq));
1357	cfq_mark_cfqq_on_rr(cfqq);
1358	cfqd->busy_queues++;
1359
1360	cfq_resort_rr_list(cfqd, cfqq);
1361}
1362
1363/*
1364 * Called when the cfqq no longer has requests pending, remove it from
1365 * the service tree.
1366 */
1367static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1368{
1369	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1370	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1371	cfq_clear_cfqq_on_rr(cfqq);
1372
1373	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1374		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1375		cfqq->service_tree = NULL;
1376	}
1377	if (cfqq->p_root) {
1378		rb_erase(&cfqq->p_node, cfqq->p_root);
1379		cfqq->p_root = NULL;
1380	}
1381
1382	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1383	BUG_ON(!cfqd->busy_queues);
1384	cfqd->busy_queues--;
1385}
1386
1387/*
1388 * rb tree support functions
1389 */
1390static void cfq_del_rq_rb(struct request *rq)
1391{
1392	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1393	const int sync = rq_is_sync(rq);
1394
1395	BUG_ON(!cfqq->queued[sync]);
1396	cfqq->queued[sync]--;
1397
1398	elv_rb_del(&cfqq->sort_list, rq);
1399
1400	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1401		/*
1402		 * Queue will be deleted from service tree when we actually
1403		 * expire it later. Right now just remove it from prio tree
1404		 * as it is empty.
1405		 */
1406		if (cfqq->p_root) {
1407			rb_erase(&cfqq->p_node, cfqq->p_root);
1408			cfqq->p_root = NULL;
1409		}
1410	}
1411}
1412
1413static void cfq_add_rq_rb(struct request *rq)
1414{
1415	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1416	struct cfq_data *cfqd = cfqq->cfqd;
1417	struct request *__alias, *prev;
1418
1419	cfqq->queued[rq_is_sync(rq)]++;
1420
1421	/*
1422	 * looks a little odd, but the first insert might return an alias.
1423	 * if that happens, put the alias on the dispatch list
1424	 */
1425	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1426		cfq_dispatch_insert(cfqd->queue, __alias);
1427
1428	if (!cfq_cfqq_on_rr(cfqq))
1429		cfq_add_cfqq_rr(cfqd, cfqq);
1430
1431	/*
1432	 * check if this request is a better next-serve candidate
1433	 */
1434	prev = cfqq->next_rq;
1435	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1436
1437	/*
1438	 * adjust priority tree position, if ->next_rq changes
1439	 */
1440	if (prev != cfqq->next_rq)
1441		cfq_prio_tree_add(cfqd, cfqq);
1442
1443	BUG_ON(!cfqq->next_rq);
1444}
1445
1446static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1447{
1448	elv_rb_del(&cfqq->sort_list, rq);
1449	cfqq->queued[rq_is_sync(rq)]--;
1450	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1451					rq_data_dir(rq), rq_is_sync(rq));
1452	cfq_add_rq_rb(rq);
1453	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1454			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1455			rq_is_sync(rq));
1456}
1457
1458static struct request *
1459cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1460{
1461	struct task_struct *tsk = current;
1462	struct cfq_io_context *cic;
1463	struct cfq_queue *cfqq;
1464
1465	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1466	if (!cic)
1467		return NULL;
1468
1469	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1470	if (cfqq) {
1471		sector_t sector = bio->bi_sector + bio_sectors(bio);
1472
1473		return elv_rb_find(&cfqq->sort_list, sector);
1474	}
1475
1476	return NULL;
1477}
1478
1479static void cfq_activate_request(struct request_queue *q, struct request *rq)
1480{
1481	struct cfq_data *cfqd = q->elevator->elevator_data;
1482
1483	cfqd->rq_in_driver++;
1484	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1485						cfqd->rq_in_driver);
1486
1487	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1488}
1489
1490static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1491{
1492	struct cfq_data *cfqd = q->elevator->elevator_data;
1493
1494	WARN_ON(!cfqd->rq_in_driver);
1495	cfqd->rq_in_driver--;
1496	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1497						cfqd->rq_in_driver);
1498}
1499
1500static void cfq_remove_request(struct request *rq)
1501{
1502	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1503
1504	if (cfqq->next_rq == rq)
1505		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1506
1507	list_del_init(&rq->queuelist);
1508	cfq_del_rq_rb(rq);
1509
1510	cfqq->cfqd->rq_queued--;
1511	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1512					rq_data_dir(rq), rq_is_sync(rq));
1513	if (rq->cmd_flags & REQ_META) {
1514		WARN_ON(!cfqq->meta_pending);
1515		cfqq->meta_pending--;
1516	}
1517}
1518
1519static int cfq_merge(struct request_queue *q, struct request **req,
1520		     struct bio *bio)
1521{
1522	struct cfq_data *cfqd = q->elevator->elevator_data;
1523	struct request *__rq;
1524
1525	__rq = cfq_find_rq_fmerge(cfqd, bio);
1526	if (__rq && elv_rq_merge_ok(__rq, bio)) {
1527		*req = __rq;
1528		return ELEVATOR_FRONT_MERGE;
1529	}
1530
1531	return ELEVATOR_NO_MERGE;
1532}
1533
1534static void cfq_merged_request(struct request_queue *q, struct request *req,
1535			       int type)
1536{
1537	if (type == ELEVATOR_FRONT_MERGE) {
1538		struct cfq_queue *cfqq = RQ_CFQQ(req);
1539
1540		cfq_reposition_rq_rb(cfqq, req);
1541	}
1542}
1543
1544static void cfq_bio_merged(struct request_queue *q, struct request *req,
1545				struct bio *bio)
1546{
1547	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1548					bio_data_dir(bio), cfq_bio_sync(bio));
1549}
1550
1551static void
1552cfq_merged_requests(struct request_queue *q, struct request *rq,
1553		    struct request *next)
1554{
1555	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1556	/*
1557	 * reposition in fifo if next is older than rq
1558	 */
1559	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1560	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1561		list_move(&rq->queuelist, &next->queuelist);
1562		rq_set_fifo_time(rq, rq_fifo_time(next));
1563	}
1564
1565	if (cfqq->next_rq == next)
1566		cfqq->next_rq = rq;
1567	cfq_remove_request(next);
1568	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1569					rq_data_dir(next), rq_is_sync(next));
1570}
1571
1572static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1573			   struct bio *bio)
1574{
1575	struct cfq_data *cfqd = q->elevator->elevator_data;
1576	struct cfq_io_context *cic;
1577	struct cfq_queue *cfqq;
1578
1579	/*
1580	 * Disallow merge of a sync bio into an async request.
1581	 */
1582	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1583		return false;
1584
1585	/*
1586	 * Lookup the cfqq that this bio will be queued with. Allow
1587	 * merge only if rq is queued there.
1588	 */
1589	cic = cfq_cic_lookup(cfqd, current->io_context);
1590	if (!cic)
1591		return false;
1592
1593	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1594	return cfqq == RQ_CFQQ(rq);
1595}
1596
1597static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1598{
1599	del_timer(&cfqd->idle_slice_timer);
1600	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1601}
1602
1603static void __cfq_set_active_queue(struct cfq_data *cfqd,
1604				   struct cfq_queue *cfqq)
1605{
1606	if (cfqq) {
1607		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1608				cfqd->serving_prio, cfqd->serving_type);
1609		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1610		cfqq->slice_start = 0;
1611		cfqq->dispatch_start = jiffies;
1612		cfqq->allocated_slice = 0;
1613		cfqq->slice_end = 0;
1614		cfqq->slice_dispatch = 0;
1615		cfqq->nr_sectors = 0;
1616
1617		cfq_clear_cfqq_wait_request(cfqq);
1618		cfq_clear_cfqq_must_dispatch(cfqq);
1619		cfq_clear_cfqq_must_alloc_slice(cfqq);
1620		cfq_clear_cfqq_fifo_expire(cfqq);
1621		cfq_mark_cfqq_slice_new(cfqq);
1622
1623		cfq_del_timer(cfqd, cfqq);
1624	}
1625
1626	cfqd->active_queue = cfqq;
1627}
1628
1629/*
1630 * current cfqq expired its slice (or was too idle), select new one
1631 */
1632static void
1633__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1634		    bool timed_out)
1635{
1636	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1637
1638	if (cfq_cfqq_wait_request(cfqq))
1639		cfq_del_timer(cfqd, cfqq);
1640
1641	cfq_clear_cfqq_wait_request(cfqq);
1642	cfq_clear_cfqq_wait_busy(cfqq);
1643
1644	/*
1645	 * If this cfqq is shared between multiple processes, check to
1646	 * make sure that those processes are still issuing I/Os within
1647	 * the mean seek distance.  If not, it may be time to break the
1648	 * queues apart again.
1649	 */
1650	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1651		cfq_mark_cfqq_split_coop(cfqq);
1652
1653	/*
1654	 * store what was left of this slice, if the queue idled/timed out
1655	 */
1656	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
1657		cfqq->slice_resid = cfqq->slice_end - jiffies;
1658		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1659	}
1660
1661	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1662
1663	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1664		cfq_del_cfqq_rr(cfqd, cfqq);
1665
1666	cfq_resort_rr_list(cfqd, cfqq);
1667
1668	if (cfqq == cfqd->active_queue)
1669		cfqd->active_queue = NULL;
1670
1671	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1672		cfqd->grp_service_tree.active = NULL;
1673
1674	if (cfqd->active_cic) {
1675		put_io_context(cfqd->active_cic->ioc);
1676		cfqd->active_cic = NULL;
1677	}
1678}
1679
1680static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1681{
1682	struct cfq_queue *cfqq = cfqd->active_queue;
1683
1684	if (cfqq)
1685		__cfq_slice_expired(cfqd, cfqq, timed_out);
1686}
1687
1688/*
1689 * Get next queue for service. Unless we have a queue preemption,
1690 * we'll simply select the first cfqq in the service tree.
1691 */
1692static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1693{
1694	struct cfq_rb_root *service_tree =
1695		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1696					cfqd->serving_type);
1697
1698	if (!cfqd->rq_queued)
1699		return NULL;
1700
1701	/* There is nothing to dispatch */
1702	if (!service_tree)
1703		return NULL;
1704	if (RB_EMPTY_ROOT(&service_tree->rb))
1705		return NULL;
1706	return cfq_rb_first(service_tree);
1707}
1708
1709static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1710{
1711	struct cfq_group *cfqg;
1712	struct cfq_queue *cfqq;
1713	int i, j;
1714	struct cfq_rb_root *st;
1715
1716	if (!cfqd->rq_queued)
1717		return NULL;
1718
1719	cfqg = cfq_get_next_cfqg(cfqd);
1720	if (!cfqg)
1721		return NULL;
1722
1723	for_each_cfqg_st(cfqg, i, j, st)
1724		if ((cfqq = cfq_rb_first(st)) != NULL)
1725			return cfqq;
1726	return NULL;
1727}
1728
1729/*
1730 * Get and set a new active queue for service.
1731 */
1732static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
1733					      struct cfq_queue *cfqq)
1734{
1735	if (!cfqq)
1736		cfqq = cfq_get_next_queue(cfqd);
1737
1738	__cfq_set_active_queue(cfqd, cfqq);
1739	return cfqq;
1740}
1741
1742static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1743					  struct request *rq)
1744{
1745	if (blk_rq_pos(rq) >= cfqd->last_position)
1746		return blk_rq_pos(rq) - cfqd->last_position;
1747	else
1748		return cfqd->last_position - blk_rq_pos(rq);
1749}
1750
1751static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1752			       struct request *rq)
1753{
1754	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1755}
1756
1757static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1758				    struct cfq_queue *cur_cfqq)
1759{
1760	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1761	struct rb_node *parent, *node;
1762	struct cfq_queue *__cfqq;
1763	sector_t sector = cfqd->last_position;
1764
1765	if (RB_EMPTY_ROOT(root))
1766		return NULL;
1767
1768	/*
1769	 * First, if we find a request starting at the end of the last
1770	 * request, choose it.
1771	 */
1772	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1773	if (__cfqq)
1774		return __cfqq;
1775
1776	/*
1777	 * If the exact sector wasn't found, the parent of the NULL leaf
1778	 * will contain the closest sector.
1779	 */
1780	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
1781	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1782		return __cfqq;
1783
1784	if (blk_rq_pos(__cfqq->next_rq) < sector)
1785		node = rb_next(&__cfqq->p_node);
1786	else
1787		node = rb_prev(&__cfqq->p_node);
1788	if (!node)
1789		return NULL;
1790
1791	__cfqq = rb_entry(node, struct cfq_queue, p_node);
1792	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1793		return __cfqq;
1794
1795	return NULL;
1796}
1797
1798/*
1799 * cfqd - obvious
1800 * cur_cfqq - passed in so that we don't decide that the current queue is
1801 * 	      closely cooperating with itself.
1802 *
1803 * So, basically we're assuming that that cur_cfqq has dispatched at least
1804 * one request, and that cfqd->last_position reflects a position on the disk
1805 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1806 * assumption.
1807 */
1808static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1809					      struct cfq_queue *cur_cfqq)
1810{
1811	struct cfq_queue *cfqq;
1812
1813	if (cfq_class_idle(cur_cfqq))
1814		return NULL;
1815	if (!cfq_cfqq_sync(cur_cfqq))
1816		return NULL;
1817	if (CFQQ_SEEKY(cur_cfqq))
1818		return NULL;
1819
1820	/*
1821	 * Don't search priority tree if it's the only queue in the group.
1822	 */
1823	if (cur_cfqq->cfqg->nr_cfqq == 1)
1824		return NULL;
1825
1826	/*
1827	 * We should notice if some of the queues are cooperating, eg
1828	 * working closely on the same area of the disk. In that case,
1829	 * we can group them together and don't waste time idling.
1830	 */
1831	cfqq = cfqq_close(cfqd, cur_cfqq);
1832	if (!cfqq)
1833		return NULL;
1834
1835	/* If new queue belongs to different cfq_group, don't choose it */
1836	if (cur_cfqq->cfqg != cfqq->cfqg)
1837		return NULL;
1838
1839	/*
1840	 * It only makes sense to merge sync queues.
1841	 */
1842	if (!cfq_cfqq_sync(cfqq))
1843		return NULL;
1844	if (CFQQ_SEEKY(cfqq))
1845		return NULL;
1846
1847	/*
1848	 * Do not merge queues of different priority classes
1849	 */
1850	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1851		return NULL;
1852
1853	return cfqq;
1854}
1855
1856/*
1857 * Determine whether we should enforce idle window for this queue.
1858 */
1859
1860static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1861{
1862	enum wl_prio_t prio = cfqq_prio(cfqq);
1863	struct cfq_rb_root *service_tree = cfqq->service_tree;
1864
1865	BUG_ON(!service_tree);
1866	BUG_ON(!service_tree->count);
1867
1868	if (!cfqd->cfq_slice_idle)
1869		return false;
1870
1871	/* We never do for idle class queues. */
1872	if (prio == IDLE_WORKLOAD)
1873		return false;
1874
1875	/* We do for queues that were marked with idle window flag. */
1876	if (cfq_cfqq_idle_window(cfqq) &&
1877	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1878		return true;
1879
1880	/*
1881	 * Otherwise, we do only if they are the last ones
1882	 * in their service tree.
1883	 */
1884	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1885		return 1;
1886	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1887			service_tree->count);
1888	return 0;
1889}
1890
1891static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1892{
1893	struct cfq_queue *cfqq = cfqd->active_queue;
1894	struct cfq_io_context *cic;
1895	unsigned long sl, group_idle = 0;
1896
1897	/*
1898	 * SSD device without seek penalty, disable idling. But only do so
1899	 * for devices that support queuing, otherwise we still have a problem
1900	 * with sync vs async workloads.
1901	 */
1902	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1903		return;
1904
1905	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1906	WARN_ON(cfq_cfqq_slice_new(cfqq));
1907
1908	/*
1909	 * idle is disabled, either manually or by past process history
1910	 */
1911	if (!cfq_should_idle(cfqd, cfqq)) {
1912		/* no queue idling. Check for group idling */
1913		if (cfqd->cfq_group_idle)
1914			group_idle = cfqd->cfq_group_idle;
1915		else
1916			return;
1917	}
1918
1919	/*
1920	 * still active requests from this queue, don't idle
1921	 */
1922	if (cfqq->dispatched)
1923		return;
1924
1925	/*
1926	 * task has exited, don't wait
1927	 */
1928	cic = cfqd->active_cic;
1929	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1930		return;
1931
1932	/*
1933	 * If our average think time is larger than the remaining time
1934	 * slice, then don't idle. This avoids overrunning the allotted
1935	 * time slice.
1936	 */
1937	if (sample_valid(cic->ttime_samples) &&
1938	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1939		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
1940				cic->ttime_mean);
1941		return;
1942	}
1943
1944	/* There are other queues in the group, don't do group idle */
1945	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
1946		return;
1947
1948	cfq_mark_cfqq_wait_request(cfqq);
1949
1950	if (group_idle)
1951		sl = cfqd->cfq_group_idle;
1952	else
1953		sl = cfqd->cfq_slice_idle;
1954
1955	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1956	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1957	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
1958			group_idle ? 1 : 0);
1959}
1960
1961/*
1962 * Move request from internal lists to the request queue dispatch list.
1963 */
1964static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1965{
1966	struct cfq_data *cfqd = q->elevator->elevator_data;
1967	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1968
1969	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1970
1971	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1972	cfq_remove_request(rq);
1973	cfqq->dispatched++;
1974	(RQ_CFQG(rq))->dispatched++;
1975	elv_dispatch_sort(q, rq);
1976
1977	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1978	cfqq->nr_sectors += blk_rq_sectors(rq);
1979	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1980					rq_data_dir(rq), rq_is_sync(rq));
1981}
1982
1983/*
1984 * return expired entry, or NULL to just start from scratch in rbtree
1985 */
1986static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1987{
1988	struct request *rq = NULL;
1989
1990	if (cfq_cfqq_fifo_expire(cfqq))
1991		return NULL;
1992
1993	cfq_mark_cfqq_fifo_expire(cfqq);
1994
1995	if (list_empty(&cfqq->fifo))
1996		return NULL;
1997
1998	rq = rq_entry_fifo(cfqq->fifo.next);
1999	if (time_before(jiffies, rq_fifo_time(rq)))
2000		rq = NULL;
2001
2002	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2003	return rq;
2004}
2005
2006static inline int
2007cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2008{
2009	const int base_rq = cfqd->cfq_slice_async_rq;
2010
2011	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2012
2013	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
2014}
2015
2016/*
2017 * Must be called with the queue_lock held.
2018 */
2019static int cfqq_process_refs(struct cfq_queue *cfqq)
2020{
2021	int process_refs, io_refs;
2022
2023	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2024	process_refs = atomic_read(&cfqq->ref) - io_refs;
2025	BUG_ON(process_refs < 0);
2026	return process_refs;
2027}
2028
2029static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2030{
2031	int process_refs, new_process_refs;
2032	struct cfq_queue *__cfqq;
2033
2034	/*
2035	 * If there are no process references on the new_cfqq, then it is
2036	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2037	 * chain may have dropped their last reference (not just their
2038	 * last process reference).
2039	 */
2040	if (!cfqq_process_refs(new_cfqq))
2041		return;
2042
2043	/* Avoid a circular list and skip interim queue merges */
2044	while ((__cfqq = new_cfqq->new_cfqq)) {
2045		if (__cfqq == cfqq)
2046			return;
2047		new_cfqq = __cfqq;
2048	}
2049
2050	process_refs = cfqq_process_refs(cfqq);
2051	new_process_refs = cfqq_process_refs(new_cfqq);
2052	/*
2053	 * If the process for the cfqq has gone away, there is no
2054	 * sense in merging the queues.
2055	 */
2056	if (process_refs == 0 || new_process_refs == 0)
2057		return;
2058
2059	/*
2060	 * Merge in the direction of the lesser amount of work.
2061	 */
2062	if (new_process_refs >= process_refs) {
2063		cfqq->new_cfqq = new_cfqq;
2064		atomic_add(process_refs, &new_cfqq->ref);
2065	} else {
2066		new_cfqq->new_cfqq = cfqq;
2067		atomic_add(new_process_refs, &cfqq->ref);
2068	}
2069}
2070
2071static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2072				struct cfq_group *cfqg, enum wl_prio_t prio)
2073{
2074	struct cfq_queue *queue;
2075	int i;
2076	bool key_valid = false;
2077	unsigned long lowest_key = 0;
2078	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2079
2080	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2081		/* select the one with lowest rb_key */
2082		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2083		if (queue &&
2084		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
2085			lowest_key = queue->rb_key;
2086			cur_best = i;
2087			key_valid = true;
2088		}
2089	}
2090
2091	return cur_best;
2092}
2093
2094static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2095{
2096	unsigned slice;
2097	unsigned count;
2098	struct cfq_rb_root *st;
2099	unsigned group_slice;
2100
2101	if (!cfqg) {
2102		cfqd->serving_prio = IDLE_WORKLOAD;
2103		cfqd->workload_expires = jiffies + 1;
2104		return;
2105	}
2106
2107	/* Choose next priority. RT > BE > IDLE */
2108	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2109		cfqd->serving_prio = RT_WORKLOAD;
2110	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2111		cfqd->serving_prio = BE_WORKLOAD;
2112	else {
2113		cfqd->serving_prio = IDLE_WORKLOAD;
2114		cfqd->workload_expires = jiffies + 1;
2115		return;
2116	}
2117
2118	/*
2119	 * For RT and BE, we have to choose also the type
2120	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2121	 * expiration time
2122	 */
2123	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2124	count = st->count;
2125
2126	/*
2127	 * check workload expiration, and that we still have other queues ready
2128	 */
2129	if (count && !time_after(jiffies, cfqd->workload_expires))
2130		return;
2131
2132	/* otherwise select new workload type */
2133	cfqd->serving_type =
2134		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2135	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2136	count = st->count;
2137
2138	/*
2139	 * the workload slice is computed as a fraction of target latency
2140	 * proportional to the number of queues in that workload, over
2141	 * all the queues in the same priority class
2142	 */
2143	group_slice = cfq_group_slice(cfqd, cfqg);
2144
2145	slice = group_slice * count /
2146		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2147		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2148
2149	if (cfqd->serving_type == ASYNC_WORKLOAD) {
2150		unsigned int tmp;
2151
2152		/*
2153		 * Async queues are currently system wide. Just taking
2154		 * proportion of queues with-in same group will lead to higher
2155		 * async ratio system wide as generally root group is going
2156		 * to have higher weight. A more accurate thing would be to
2157		 * calculate system wide asnc/sync ratio.
2158		 */
2159		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2160		tmp = tmp/cfqd->busy_queues;
2161		slice = min_t(unsigned, slice, tmp);
2162
2163		/* async workload slice is scaled down according to
2164		 * the sync/async slice ratio. */
2165		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2166	} else
2167		/* sync workload slice is at least 2 * cfq_slice_idle */
2168		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2169
2170	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2171	cfq_log(cfqd, "workload slice:%d", slice);
2172	cfqd->workload_expires = jiffies + slice;
2173	cfqd->noidle_tree_requires_idle = false;
2174}
2175
2176static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2177{
2178	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2179	struct cfq_group *cfqg;
2180
2181	if (RB_EMPTY_ROOT(&st->rb))
2182		return NULL;
2183	cfqg = cfq_rb_first_group(st);
2184	st->active = &cfqg->rb_node;
2185	update_min_vdisktime(st);
2186	return cfqg;
2187}
2188
2189static void cfq_choose_cfqg(struct cfq_data *cfqd)
2190{
2191	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2192
2193	cfqd->serving_group = cfqg;
2194
2195	/* Restore the workload type data */
2196	if (cfqg->saved_workload_slice) {
2197		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2198		cfqd->serving_type = cfqg->saved_workload;
2199		cfqd->serving_prio = cfqg->saved_serving_prio;
2200	} else
2201		cfqd->workload_expires = jiffies - 1;
2202
2203	choose_service_tree(cfqd, cfqg);
2204}
2205
2206/*
2207 * Select a queue for service. If we have a current active queue,
2208 * check whether to continue servicing it, or retrieve and set a new one.
2209 */
2210static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2211{
2212	struct cfq_queue *cfqq, *new_cfqq = NULL;
2213
2214	cfqq = cfqd->active_queue;
2215	if (!cfqq)
2216		goto new_queue;
2217
2218	if (!cfqd->rq_queued)
2219		return NULL;
2220
2221	/*
2222	 * We were waiting for group to get backlogged. Expire the queue
2223	 */
2224	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2225		goto expire;
2226
2227	/*
2228	 * The active queue has run out of time, expire it and select new.
2229	 */
2230	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2231		/*
2232		 * If slice had not expired at the completion of last request
2233		 * we might not have turned on wait_busy flag. Don't expire
2234		 * the queue yet. Allow the group to get backlogged.
2235		 *
2236		 * The very fact that we have used the slice, that means we
2237		 * have been idling all along on this queue and it should be
2238		 * ok to wait for this request to complete.
2239		 */
2240		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2241		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2242			cfqq = NULL;
2243			goto keep_queue;
2244		} else
2245			goto check_group_idle;
2246	}
2247
2248	/*
2249	 * The active queue has requests and isn't expired, allow it to
2250	 * dispatch.
2251	 */
2252	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2253		goto keep_queue;
2254
2255	/*
2256	 * If another queue has a request waiting within our mean seek
2257	 * distance, let it run.  The expire code will check for close
2258	 * cooperators and put the close queue at the front of the service
2259	 * tree.  If possible, merge the expiring queue with the new cfqq.
2260	 */
2261	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2262	if (new_cfqq) {
2263		if (!cfqq->new_cfqq)
2264			cfq_setup_merge(cfqq, new_cfqq);
2265		goto expire;
2266	}
2267
2268	/*
2269	 * No requests pending. If the active queue still has requests in
2270	 * flight or is idling for a new request, allow either of these
2271	 * conditions to happen (or time out) before selecting a new queue.
2272	 */
2273	if (timer_pending(&cfqd->idle_slice_timer)) {
2274		cfqq = NULL;
2275		goto keep_queue;
2276	}
2277
2278	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2279		cfqq = NULL;
2280		goto keep_queue;
2281	}
2282
2283	/*
2284	 * If group idle is enabled and there are requests dispatched from
2285	 * this group, wait for requests to complete.
2286	 */
2287check_group_idle:
2288	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
2289	    && cfqq->cfqg->dispatched) {
2290		cfqq = NULL;
2291		goto keep_queue;
2292	}
2293
2294expire:
2295	cfq_slice_expired(cfqd, 0);
2296new_queue:
2297	/*
2298	 * Current queue expired. Check if we have to switch to a new
2299	 * service tree
2300	 */
2301	if (!new_cfqq)
2302		cfq_choose_cfqg(cfqd);
2303
2304	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2305keep_queue:
2306	return cfqq;
2307}
2308
2309static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2310{
2311	int dispatched = 0;
2312
2313	while (cfqq->next_rq) {
2314		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2315		dispatched++;
2316	}
2317
2318	BUG_ON(!list_empty(&cfqq->fifo));
2319
2320	/* By default cfqq is not expired if it is empty. Do it explicitly */
2321	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2322	return dispatched;
2323}
2324
2325/*
2326 * Drain our current requests. Used for barriers and when switching
2327 * io schedulers on-the-fly.
2328 */
2329static int cfq_forced_dispatch(struct cfq_data *cfqd)
2330{
2331	struct cfq_queue *cfqq;
2332	int dispatched = 0;
2333
2334	/* Expire the timeslice of the current active queue first */
2335	cfq_slice_expired(cfqd, 0);
2336	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2337		__cfq_set_active_queue(cfqd, cfqq);
2338		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2339	}
2340
2341	BUG_ON(cfqd->busy_queues);
2342
2343	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2344	return dispatched;
2345}
2346
2347static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2348	struct cfq_queue *cfqq)
2349{
2350	/* the queue hasn't finished any request, can't estimate */
2351	if (cfq_cfqq_slice_new(cfqq))
2352		return 1;
2353	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2354		cfqq->slice_end))
2355		return 1;
2356
2357	return 0;
2358}
2359
2360static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2361{
2362	unsigned int max_dispatch;
2363
2364	/*
2365	 * Drain async requests before we start sync IO
2366	 */
2367	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2368		return false;
2369
2370	/*
2371	 * If this is an async queue and we have sync IO in flight, let it wait
2372	 */
2373	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2374		return false;
2375
2376	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2377	if (cfq_class_idle(cfqq))
2378		max_dispatch = 1;
2379
2380	/*
2381	 * Does this cfqq already have too much IO in flight?
2382	 */
2383	if (cfqq->dispatched >= max_dispatch) {
2384		/*
2385		 * idle queue must always only have a single IO in flight
2386		 */
2387		if (cfq_class_idle(cfqq))
2388			return false;
2389
2390		/*
2391		 * We have other queues, don't allow more IO from this one
2392		 */
2393		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
2394			return false;
2395
2396		/*
2397		 * Sole queue user, no limit
2398		 */
2399		if (cfqd->busy_queues == 1)
2400			max_dispatch = -1;
2401		else
2402			/*
2403			 * Normally we start throttling cfqq when cfq_quantum/2
2404			 * requests have been dispatched. But we can drive
2405			 * deeper queue depths at the beginning of slice
2406			 * subjected to upper limit of cfq_quantum.
2407			 * */
2408			max_dispatch = cfqd->cfq_quantum;
2409	}
2410
2411	/*
2412	 * Async queues must wait a bit before being allowed dispatch.
2413	 * We also ramp up the dispatch depth gradually for async IO,
2414	 * based on the last sync IO we serviced
2415	 */
2416	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2417		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2418		unsigned int depth;
2419
2420		depth = last_sync / cfqd->cfq_slice[1];
2421		if (!depth && !cfqq->dispatched)
2422			depth = 1;
2423		if (depth < max_dispatch)
2424			max_dispatch = depth;
2425	}
2426
2427	/*
2428	 * If we're below the current max, allow a dispatch
2429	 */
2430	return cfqq->dispatched < max_dispatch;
2431}
2432
2433/*
2434 * Dispatch a request from cfqq, moving them to the request queue
2435 * dispatch list.
2436 */
2437static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2438{
2439	struct request *rq;
2440
2441	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2442
2443	if (!cfq_may_dispatch(cfqd, cfqq))
2444		return false;
2445
2446	/*
2447	 * follow expired path, else get first next available
2448	 */
2449	rq = cfq_check_fifo(cfqq);
2450	if (!rq)
2451		rq = cfqq->next_rq;
2452
2453	/*
2454	 * insert request into driver dispatch list
2455	 */
2456	cfq_dispatch_insert(cfqd->queue, rq);
2457
2458	if (!cfqd->active_cic) {
2459		struct cfq_io_context *cic = RQ_CIC(rq);
2460
2461		atomic_long_inc(&cic->ioc->refcount);
2462		cfqd->active_cic = cic;
2463	}
2464
2465	return true;
2466}
2467
2468/*
2469 * Find the cfqq that we need to service and move a request from that to the
2470 * dispatch list
2471 */
2472static int cfq_dispatch_requests(struct request_queue *q, int force)
2473{
2474	struct cfq_data *cfqd = q->elevator->elevator_data;
2475	struct cfq_queue *cfqq;
2476
2477	if (!cfqd->busy_queues)
2478		return 0;
2479
2480	if (unlikely(force))
2481		return cfq_forced_dispatch(cfqd);
2482
2483	cfqq = cfq_select_queue(cfqd);
2484	if (!cfqq)
2485		return 0;
2486
2487	/*
2488	 * Dispatch a request from this cfqq, if it is allowed
2489	 */
2490	if (!cfq_dispatch_request(cfqd, cfqq))
2491		return 0;
2492
2493	cfqq->slice_dispatch++;
2494	cfq_clear_cfqq_must_dispatch(cfqq);
2495
2496	/*
2497	 * expire an async queue immediately if it has used up its slice. idle
2498	 * queue always expire after 1 dispatch round.
2499	 */
2500	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2501	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2502	    cfq_class_idle(cfqq))) {
2503		cfqq->slice_end = jiffies + 1;
2504		cfq_slice_expired(cfqd, 0);
2505	}
2506
2507	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2508	return 1;
2509}
2510
2511/*
2512 * task holds one reference to the queue, dropped when task exits. each rq
2513 * in-flight on this queue also holds a reference, dropped when rq is freed.
2514 *
2515 * Each cfq queue took a reference on the parent group. Drop it now.
2516 * queue lock must be held here.
2517 */
2518static void cfq_put_queue(struct cfq_queue *cfqq)
2519{
2520	struct cfq_data *cfqd = cfqq->cfqd;
2521	struct cfq_group *cfqg, *orig_cfqg;
2522
2523	BUG_ON(atomic_read(&cfqq->ref) <= 0);
2524
2525	if (!atomic_dec_and_test(&cfqq->ref))
2526		return;
2527
2528	cfq_log_cfqq(cfqd, cfqq, "put_queue");
2529	BUG_ON(rb_first(&cfqq->sort_list));
2530	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2531	cfqg = cfqq->cfqg;
2532	orig_cfqg = cfqq->orig_cfqg;
2533
2534	if (unlikely(cfqd->active_queue == cfqq)) {
2535		__cfq_slice_expired(cfqd, cfqq, 0);
2536		cfq_schedule_dispatch(cfqd);
2537	}
2538
2539	BUG_ON(cfq_cfqq_on_rr(cfqq));
2540	kmem_cache_free(cfq_pool, cfqq);
2541	cfq_put_cfqg(cfqg);
2542	if (orig_cfqg)
2543		cfq_put_cfqg(orig_cfqg);
2544}
2545
2546/*
2547 * Must always be called with the rcu_read_lock() held
2548 */
2549static void
2550__call_for_each_cic(struct io_context *ioc,
2551		    void (*func)(struct io_context *, struct cfq_io_context *))
2552{
2553	struct cfq_io_context *cic;
2554	struct hlist_node *n;
2555
2556	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2557		func(ioc, cic);
2558}
2559
2560/*
2561 * Call func for each cic attached to this ioc.
2562 */
2563static void
2564call_for_each_cic(struct io_context *ioc,
2565		  void (*func)(struct io_context *, struct cfq_io_context *))
2566{
2567	rcu_read_lock();
2568	__call_for_each_cic(ioc, func);
2569	rcu_read_unlock();
2570}
2571
2572static void cfq_cic_free_rcu(struct rcu_head *head)
2573{
2574	struct cfq_io_context *cic;
2575
2576	cic = container_of(head, struct cfq_io_context, rcu_head);
2577
2578	kmem_cache_free(cfq_ioc_pool, cic);
2579	elv_ioc_count_dec(cfq_ioc_count);
2580
2581	if (ioc_gone) {
2582		/*
2583		 * CFQ scheduler is exiting, grab exit lock and check
2584		 * the pending io context count. If it hits zero,
2585		 * complete ioc_gone and set it back to NULL
2586		 */
2587		spin_lock(&ioc_gone_lock);
2588		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2589			complete(ioc_gone);
2590			ioc_gone = NULL;
2591		}
2592		spin_unlock(&ioc_gone_lock);
2593	}
2594}
2595
2596static void cfq_cic_free(struct cfq_io_context *cic)
2597{
2598	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2599}
2600
2601static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2602{
2603	unsigned long flags;
2604	unsigned long dead_key = (unsigned long) cic->key;
2605
2606	BUG_ON(!(dead_key & CIC_DEAD_KEY));
2607
2608	spin_lock_irqsave(&ioc->lock, flags);
2609	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2610	hlist_del_rcu(&cic->cic_list);
2611	spin_unlock_irqrestore(&ioc->lock, flags);
2612
2613	cfq_cic_free(cic);
2614}
2615
2616/*
2617 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2618 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2619 * and ->trim() which is called with the task lock held
2620 */
2621static void cfq_free_io_context(struct io_context *ioc)
2622{
2623	/*
2624	 * ioc->refcount is zero here, or we are called from elv_unregister(),
2625	 * so no more cic's are allowed to be linked into this ioc.  So it
2626	 * should be ok to iterate over the known list, we will see all cic's
2627	 * since no new ones are added.
2628	 */
2629	__call_for_each_cic(ioc, cic_free_func);
2630}
2631
2632static void cfq_put_cooperator(struct cfq_queue *cfqq)
2633{
2634	struct cfq_queue *__cfqq, *next;
2635
2636	/*
2637	 * If this queue was scheduled to merge with another queue, be
2638	 * sure to drop the reference taken on that queue (and others in
2639	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2640	 */
2641	__cfqq = cfqq->new_cfqq;
2642	while (__cfqq) {
2643		if (__cfqq == cfqq) {
2644			WARN(1, "cfqq->new_cfqq loop detected\n");
2645			break;
2646		}
2647		next = __cfqq->new_cfqq;
2648		cfq_put_queue(__cfqq);
2649		__cfqq = next;
2650	}
2651}
2652
2653static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2654{
2655	if (unlikely(cfqq == cfqd->active_queue)) {
2656		__cfq_slice_expired(cfqd, cfqq, 0);
2657		cfq_schedule_dispatch(cfqd);
2658	}
2659
2660	cfq_put_cooperator(cfqq);
2661
2662	cfq_put_queue(cfqq);
2663}
2664
2665static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2666					 struct cfq_io_context *cic)
2667{
2668	struct io_context *ioc = cic->ioc;
2669
2670	list_del_init(&cic->queue_list);
2671
2672	/*
2673	 * Make sure dead mark is seen for dead queues
2674	 */
2675	smp_wmb();
2676	cic->key = cfqd_dead_key(cfqd);
2677
2678	if (ioc->ioc_data == cic)
2679		rcu_assign_pointer(ioc->ioc_data, NULL);
2680
2681	if (cic->cfqq[BLK_RW_ASYNC]) {
2682		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2683		cic->cfqq[BLK_RW_ASYNC] = NULL;
2684	}
2685
2686	if (cic->cfqq[BLK_RW_SYNC]) {
2687		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2688		cic->cfqq[BLK_RW_SYNC] = NULL;
2689	}
2690}
2691
2692static void cfq_exit_single_io_context(struct io_context *ioc,
2693				       struct cfq_io_context *cic)
2694{
2695	struct cfq_data *cfqd = cic_to_cfqd(cic);
2696
2697	if (cfqd) {
2698		struct request_queue *q = cfqd->queue;
2699		unsigned long flags;
2700
2701		spin_lock_irqsave(q->queue_lock, flags);
2702
2703		/*
2704		 * Ensure we get a fresh copy of the ->key to prevent
2705		 * race between exiting task and queue
2706		 */
2707		smp_read_barrier_depends();
2708		if (cic->key == cfqd)
2709			__cfq_exit_single_io_context(cfqd, cic);
2710
2711		spin_unlock_irqrestore(q->queue_lock, flags);
2712	}
2713}
2714
2715/*
2716 * The process that ioc belongs to has exited, we need to clean up
2717 * and put the internal structures we have that belongs to that process.
2718 */
2719static void cfq_exit_io_context(struct io_context *ioc)
2720{
2721	call_for_each_cic(ioc, cfq_exit_single_io_context);
2722}
2723
2724static struct cfq_io_context *
2725cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2726{
2727	struct cfq_io_context *cic;
2728
2729	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2730							cfqd->queue->node);
2731	if (cic) {
2732		cic->last_end_request = jiffies;
2733		INIT_LIST_HEAD(&cic->queue_list);
2734		INIT_HLIST_NODE(&cic->cic_list);
2735		cic->dtor = cfq_free_io_context;
2736		cic->exit = cfq_exit_io_context;
2737		elv_ioc_count_inc(cfq_ioc_count);
2738	}
2739
2740	return cic;
2741}
2742
2743static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2744{
2745	struct task_struct *tsk = current;
2746	int ioprio_class;
2747
2748	if (!cfq_cfqq_prio_changed(cfqq))
2749		return;
2750
2751	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2752	switch (ioprio_class) {
2753	default:
2754		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2755	case IOPRIO_CLASS_NONE:
2756		/*
2757		 * no prio set, inherit CPU scheduling settings
2758		 */
2759		cfqq->ioprio = task_nice_ioprio(tsk);
2760		cfqq->ioprio_class = task_nice_ioclass(tsk);
2761		break;
2762	case IOPRIO_CLASS_RT:
2763		cfqq->ioprio = task_ioprio(ioc);
2764		cfqq->ioprio_class = IOPRIO_CLASS_RT;
2765		break;
2766	case IOPRIO_CLASS_BE:
2767		cfqq->ioprio = task_ioprio(ioc);
2768		cfqq->ioprio_class = IOPRIO_CLASS_BE;
2769		break;
2770	case IOPRIO_CLASS_IDLE:
2771		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2772		cfqq->ioprio = 7;
2773		cfq_clear_cfqq_idle_window(cfqq);
2774		break;
2775	}
2776
2777	/*
2778	 * keep track of original prio settings in case we have to temporarily
2779	 * elevate the priority of this queue
2780	 */
2781	cfqq->org_ioprio = cfqq->ioprio;
2782	cfqq->org_ioprio_class = cfqq->ioprio_class;
2783	cfq_clear_cfqq_prio_changed(cfqq);
2784}
2785
2786static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2787{
2788	struct cfq_data *cfqd = cic_to_cfqd(cic);
2789	struct cfq_queue *cfqq;
2790	unsigned long flags;
2791
2792	if (unlikely(!cfqd))
2793		return;
2794
2795	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2796
2797	cfqq = cic->cfqq[BLK_RW_ASYNC];
2798	if (cfqq) {
2799		struct cfq_queue *new_cfqq;
2800		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2801						GFP_ATOMIC);
2802		if (new_cfqq) {
2803			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2804			cfq_put_queue(cfqq);
2805		}
2806	}
2807
2808	cfqq = cic->cfqq[BLK_RW_SYNC];
2809	if (cfqq)
2810		cfq_mark_cfqq_prio_changed(cfqq);
2811
2812	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2813}
2814
2815static void cfq_ioc_set_ioprio(struct io_context *ioc)
2816{
2817	call_for_each_cic(ioc, changed_ioprio);
2818	ioc->ioprio_changed = 0;
2819}
2820
2821static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2822			  pid_t pid, bool is_sync)
2823{
2824	RB_CLEAR_NODE(&cfqq->rb_node);
2825	RB_CLEAR_NODE(&cfqq->p_node);
2826	INIT_LIST_HEAD(&cfqq->fifo);
2827
2828	atomic_set(&cfqq->ref, 0);
2829	cfqq->cfqd = cfqd;
2830
2831	cfq_mark_cfqq_prio_changed(cfqq);
2832
2833	if (is_sync) {
2834		if (!cfq_class_idle(cfqq))
2835			cfq_mark_cfqq_idle_window(cfqq);
2836		cfq_mark_cfqq_sync(cfqq);
2837	}
2838	cfqq->pid = pid;
2839}
2840
2841#ifdef CONFIG_CFQ_GROUP_IOSCHED
2842static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2843{
2844	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2845	struct cfq_data *cfqd = cic_to_cfqd(cic);
2846	unsigned long flags;
2847	struct request_queue *q;
2848
2849	if (unlikely(!cfqd))
2850		return;
2851
2852	q = cfqd->queue;
2853
2854	spin_lock_irqsave(q->queue_lock, flags);
2855
2856	if (sync_cfqq) {
2857		/*
2858		 * Drop reference to sync queue. A new sync queue will be
2859		 * assigned in new group upon arrival of a fresh request.
2860		 */
2861		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2862		cic_set_cfqq(cic, NULL, 1);
2863		cfq_put_queue(sync_cfqq);
2864	}
2865
2866	spin_unlock_irqrestore(q->queue_lock, flags);
2867}
2868
2869static void cfq_ioc_set_cgroup(struct io_context *ioc)
2870{
2871	call_for_each_cic(ioc, changed_cgroup);
2872	ioc->cgroup_changed = 0;
2873}
2874#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2875
2876static struct cfq_queue *
2877cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2878		     struct io_context *ioc, gfp_t gfp_mask)
2879{
2880	struct cfq_queue *cfqq, *new_cfqq = NULL;
2881	struct cfq_io_context *cic;
2882	struct cfq_group *cfqg;
2883
2884retry:
2885	cfqg = cfq_get_cfqg(cfqd, 1);
2886	cic = cfq_cic_lookup(cfqd, ioc);
2887	/* cic always exists here */
2888	cfqq = cic_to_cfqq(cic, is_sync);
2889
2890	/*
2891	 * Always try a new alloc if we fell back to the OOM cfqq
2892	 * originally, since it should just be a temporary situation.
2893	 */
2894	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2895		cfqq = NULL;
2896		if (new_cfqq) {
2897			cfqq = new_cfqq;
2898			new_cfqq = NULL;
2899		} else if (gfp_mask & __GFP_WAIT) {
2900			spin_unlock_irq(cfqd->queue->queue_lock);
2901			new_cfqq = kmem_cache_alloc_node(cfq_pool,
2902					gfp_mask | __GFP_ZERO,
2903					cfqd->queue->node);
2904			spin_lock_irq(cfqd->queue->queue_lock);
2905			if (new_cfqq)
2906				goto retry;
2907		} else {
2908			cfqq = kmem_cache_alloc_node(cfq_pool,
2909					gfp_mask | __GFP_ZERO,
2910					cfqd->queue->node);
2911		}
2912
2913		if (cfqq) {
2914			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2915			cfq_init_prio_data(cfqq, ioc);
2916			cfq_link_cfqq_cfqg(cfqq, cfqg);
2917			cfq_log_cfqq(cfqd, cfqq, "alloced");
2918		} else
2919			cfqq = &cfqd->oom_cfqq;
2920	}
2921
2922	if (new_cfqq)
2923		kmem_cache_free(cfq_pool, new_cfqq);
2924
2925	return cfqq;
2926}
2927
2928static struct cfq_queue **
2929cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2930{
2931	switch (ioprio_class) {
2932	case IOPRIO_CLASS_RT:
2933		return &cfqd->async_cfqq[0][ioprio];
2934	case IOPRIO_CLASS_BE:
2935		return &cfqd->async_cfqq[1][ioprio];
2936	case IOPRIO_CLASS_IDLE:
2937		return &cfqd->async_idle_cfqq;
2938	default:
2939		BUG();
2940	}
2941}
2942
2943static struct cfq_queue *
2944cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2945	      gfp_t gfp_mask)
2946{
2947	const int ioprio = task_ioprio(ioc);
2948	const int ioprio_class = task_ioprio_class(ioc);
2949	struct cfq_queue **async_cfqq = NULL;
2950	struct cfq_queue *cfqq = NULL;
2951
2952	if (!is_sync) {
2953		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2954		cfqq = *async_cfqq;
2955	}
2956
2957	if (!cfqq)
2958		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2959
2960	/*
2961	 * pin the queue now that it's allocated, scheduler exit will prune it
2962	 */
2963	if (!is_sync && !(*async_cfqq)) {
2964		atomic_inc(&cfqq->ref);
2965		*async_cfqq = cfqq;
2966	}
2967
2968	atomic_inc(&cfqq->ref);
2969	return cfqq;
2970}
2971
2972/*
2973 * We drop cfq io contexts lazily, so we may find a dead one.
2974 */
2975static void
2976cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2977		  struct cfq_io_context *cic)
2978{
2979	unsigned long flags;
2980
2981	WARN_ON(!list_empty(&cic->queue_list));
2982	BUG_ON(cic->key != cfqd_dead_key(cfqd));
2983
2984	spin_lock_irqsave(&ioc->lock, flags);
2985
2986	BUG_ON(ioc->ioc_data == cic);
2987
2988	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
2989	hlist_del_rcu(&cic->cic_list);
2990	spin_unlock_irqrestore(&ioc->lock, flags);
2991
2992	cfq_cic_free(cic);
2993}
2994
2995static struct cfq_io_context *
2996cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2997{
2998	struct cfq_io_context *cic;
2999	unsigned long flags;
3000
3001	if (unlikely(!ioc))
3002		return NULL;
3003
3004	rcu_read_lock();
3005
3006	/*
3007	 * we maintain a last-hit cache, to avoid browsing over the tree
3008	 */
3009	cic = rcu_dereference(ioc->ioc_data);
3010	if (cic && cic->key == cfqd) {
3011		rcu_read_unlock();
3012		return cic;
3013	}
3014
3015	do {
3016		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3017		rcu_read_unlock();
3018		if (!cic)
3019			break;
3020		if (unlikely(cic->key != cfqd)) {
3021			cfq_drop_dead_cic(cfqd, ioc, cic);
3022			rcu_read_lock();
3023			continue;
3024		}
3025
3026		spin_lock_irqsave(&ioc->lock, flags);
3027		rcu_assign_pointer(ioc->ioc_data, cic);
3028		spin_unlock_irqrestore(&ioc->lock, flags);
3029		break;
3030	} while (1);
3031
3032	return cic;
3033}
3034
3035/*
3036 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3037 * the process specific cfq io context when entered from the block layer.
3038 * Also adds the cic to a per-cfqd list, used when this queue is removed.
3039 */
3040static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3041			struct cfq_io_context *cic, gfp_t gfp_mask)
3042{
3043	unsigned long flags;
3044	int ret;
3045
3046	ret = radix_tree_preload(gfp_mask);
3047	if (!ret) {
3048		cic->ioc = ioc;
3049		cic->key = cfqd;
3050
3051		spin_lock_irqsave(&ioc->lock, flags);
3052		ret = radix_tree_insert(&ioc->radix_root,
3053						cfqd->cic_index, cic);
3054		if (!ret)
3055			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3056		spin_unlock_irqrestore(&ioc->lock, flags);
3057
3058		radix_tree_preload_end();
3059
3060		if (!ret) {
3061			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3062			list_add(&cic->queue_list, &cfqd->cic_list);
3063			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3064		}
3065	}
3066
3067	if (ret)
3068		printk(KERN_ERR "cfq: cic link failed!\n");
3069
3070	return ret;
3071}
3072
3073/*
3074 * Setup general io context and cfq io context. There can be several cfq
3075 * io contexts per general io context, if this process is doing io to more
3076 * than one device managed by cfq.
3077 */
3078static struct cfq_io_context *
3079cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3080{
3081	struct io_context *ioc = NULL;
3082	struct cfq_io_context *cic;
3083
3084	might_sleep_if(gfp_mask & __GFP_WAIT);
3085
3086	ioc = get_io_context(gfp_mask, cfqd->queue->node);
3087	if (!ioc)
3088		return NULL;
3089
3090	cic = cfq_cic_lookup(cfqd, ioc);
3091	if (cic)
3092		goto out;
3093
3094	cic = cfq_alloc_io_context(cfqd, gfp_mask);
3095	if (cic == NULL)
3096		goto err;
3097
3098	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
3099		goto err_free;
3100
3101out:
3102	smp_read_barrier_depends();
3103	if (unlikely(ioc->ioprio_changed))
3104		cfq_ioc_set_ioprio(ioc);
3105
3106#ifdef CONFIG_CFQ_GROUP_IOSCHED
3107	if (unlikely(ioc->cgroup_changed))
3108		cfq_ioc_set_cgroup(ioc);
3109#endif
3110	return cic;
3111err_free:
3112	cfq_cic_free(cic);
3113err:
3114	put_io_context(ioc);
3115	return NULL;
3116}
3117
3118static void
3119cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
3120{
3121	unsigned long elapsed = jiffies - cic->last_end_request;
3122	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3123
3124	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
3125	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
3126	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
3127}
3128
3129static void
3130cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3131		       struct request *rq)
3132{
3133	sector_t sdist = 0;
3134	sector_t n_sec = blk_rq_sectors(rq);
3135	if (cfqq->last_request_pos) {
3136		if (cfqq->last_request_pos < blk_rq_pos(rq))
3137			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3138		else
3139			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3140	}
3141
3142	cfqq->seek_history <<= 1;
3143	if (blk_queue_nonrot(cfqd->queue))
3144		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3145	else
3146		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3147}
3148
3149/*
3150 * Disable idle window if the process thinks too long or seeks so much that
3151 * it doesn't matter
3152 */
3153static void
3154cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3155		       struct cfq_io_context *cic)
3156{
3157	int old_idle, enable_idle;
3158
3159	/*
3160	 * Don't idle for async or idle io prio class
3161	 */
3162	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3163		return;
3164
3165	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3166
3167	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3168		cfq_mark_cfqq_deep(cfqq);
3169
3170	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3171	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3172		enable_idle = 0;
3173	else if (sample_valid(cic->ttime_samples)) {
3174		if (cic->ttime_mean > cfqd->cfq_slice_idle)
3175			enable_idle = 0;
3176		else
3177			enable_idle = 1;
3178	}
3179
3180	if (old_idle != enable_idle) {
3181		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3182		if (enable_idle)
3183			cfq_mark_cfqq_idle_window(cfqq);
3184		else
3185			cfq_clear_cfqq_idle_window(cfqq);
3186	}
3187}
3188
3189/*
3190 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3191 * no or if we aren't sure, a 1 will cause a preempt.
3192 */
3193static bool
3194cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3195		   struct request *rq)
3196{
3197	struct cfq_queue *cfqq;
3198
3199	cfqq = cfqd->active_queue;
3200	if (!cfqq)
3201		return false;
3202
3203	if (cfq_class_idle(new_cfqq))
3204		return false;
3205
3206	if (cfq_class_idle(cfqq))
3207		return true;
3208
3209	/*
3210	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3211	 */
3212	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3213		return false;
3214
3215	/*
3216	 * if the new request is sync, but the currently running queue is
3217	 * not, let the sync request have priority.
3218	 */
3219	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3220		return true;
3221
3222	if (new_cfqq->cfqg != cfqq->cfqg)
3223		return false;
3224
3225	if (cfq_slice_used(cfqq))
3226		return true;
3227
3228	/* Allow preemption only if we are idling on sync-noidle tree */
3229	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3230	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3231	    new_cfqq->service_tree->count == 2 &&
3232	    RB_EMPTY_ROOT(&cfqq->sort_list))
3233		return true;
3234
3235	/*
3236	 * So both queues are sync. Let the new request get disk time if
3237	 * it's a metadata request and the current queue is doing regular IO.
3238	 */
3239	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
3240		return true;
3241
3242	/*
3243	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3244	 */
3245	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3246		return true;
3247
3248	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3249		return false;
3250
3251	/*
3252	 * if this request is as-good as one we would expect from the
3253	 * current cfqq, let it preempt
3254	 */
3255	if (cfq_rq_close(cfqd, cfqq, rq))
3256		return true;
3257
3258	return false;
3259}
3260
3261/*
3262 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3263 * let it have half of its nominal slice.
3264 */
3265static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3266{
3267	cfq_log_cfqq(cfqd, cfqq, "preempt");
3268	cfq_slice_expired(cfqd, 1);
3269
3270	/*
3271	 * Put the new queue at the front of the of the current list,
3272	 * so we know that it will be selected next.
3273	 */
3274	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3275
3276	cfq_service_tree_add(cfqd, cfqq, 1);
3277
3278	cfqq->slice_end = 0;
3279	cfq_mark_cfqq_slice_new(cfqq);
3280}
3281
3282/*
3283 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3284 * something we should do about it
3285 */
3286static void
3287cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3288		struct request *rq)
3289{
3290	struct cfq_io_context *cic = RQ_CIC(rq);
3291
3292	cfqd->rq_queued++;
3293	if (rq->cmd_flags & REQ_META)
3294		cfqq->meta_pending++;
3295
3296	cfq_update_io_thinktime(cfqd, cic);
3297	cfq_update_io_seektime(cfqd, cfqq, rq);
3298	cfq_update_idle_window(cfqd, cfqq, cic);
3299
3300	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3301
3302	if (cfqq == cfqd->active_queue) {
3303		/*
3304		 * Remember that we saw a request from this process, but
3305		 * don't start queuing just yet. Otherwise we risk seeing lots
3306		 * of tiny requests, because we disrupt the normal plugging
3307		 * and merging. If the request is already larger than a single
3308		 * page, let it rip immediately. For that case we assume that
3309		 * merging is already done. Ditto for a busy system that
3310		 * has other work pending, don't risk delaying until the
3311		 * idle timer unplug to continue working.
3312		 */
3313		if (cfq_cfqq_wait_request(cfqq)) {
3314			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3315			    cfqd->busy_queues > 1) {
3316				cfq_del_timer(cfqd, cfqq);
3317				cfq_clear_cfqq_wait_request(cfqq);
3318				__blk_run_queue(cfqd->queue);
3319			} else {
3320				cfq_blkiocg_update_idle_time_stats(
3321						&cfqq->cfqg->blkg);
3322				cfq_mark_cfqq_must_dispatch(cfqq);
3323			}
3324		}
3325	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3326		/*
3327		 * not the active queue - expire current slice if it is
3328		 * idle and has expired it's mean thinktime or this new queue
3329		 * has some old slice time left and is of higher priority or
3330		 * this new queue is RT and the current one is BE
3331		 */
3332		cfq_preempt_queue(cfqd, cfqq);
3333		__blk_run_queue(cfqd->queue);
3334	}
3335}
3336
3337static void cfq_insert_request(struct request_queue *q, struct request *rq)
3338{
3339	struct cfq_data *cfqd = q->elevator->elevator_data;
3340	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3341
3342	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3343	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3344
3345	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3346	list_add_tail(&rq->queuelist, &cfqq->fifo);
3347	cfq_add_rq_rb(rq);
3348	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3349			&cfqd->serving_group->blkg, rq_data_dir(rq),
3350			rq_is_sync(rq));
3351	cfq_rq_enqueued(cfqd, cfqq, rq);
3352}
3353
3354/*
3355 * Update hw_tag based on peak queue depth over 50 samples under
3356 * sufficient load.
3357 */
3358static void cfq_update_hw_tag(struct cfq_data *cfqd)
3359{
3360	struct cfq_queue *cfqq = cfqd->active_queue;
3361
3362	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3363		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3364
3365	if (cfqd->hw_tag == 1)
3366		return;
3367
3368	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3369	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3370		return;
3371
3372	/*
3373	 * If active queue hasn't enough requests and can idle, cfq might not
3374	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3375	 * case
3376	 */
3377	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3378	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3379	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3380		return;
3381
3382	if (cfqd->hw_tag_samples++ < 50)
3383		return;
3384
3385	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3386		cfqd->hw_tag = 1;
3387	else
3388		cfqd->hw_tag = 0;
3389}
3390
3391static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3392{
3393	struct cfq_io_context *cic = cfqd->active_cic;
3394
3395	/* If there are other queues in the group, don't wait */
3396	if (cfqq->cfqg->nr_cfqq > 1)
3397		return false;
3398
3399	if (cfq_slice_used(cfqq))
3400		return true;
3401
3402	/* if slice left is less than think time, wait busy */
3403	if (cic && sample_valid(cic->ttime_samples)
3404	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
3405		return true;
3406
3407	/*
3408	 * If think times is less than a jiffy than ttime_mean=0 and above
3409	 * will not be true. It might happen that slice has not expired yet
3410	 * but will expire soon (4-5 ns) during select_queue(). To cover the
3411	 * case where think time is less than a jiffy, mark the queue wait
3412	 * busy if only 1 jiffy is left in the slice.
3413	 */
3414	if (cfqq->slice_end - jiffies == 1)
3415		return true;
3416
3417	return false;
3418}
3419
3420static void cfq_completed_request(struct request_queue *q, struct request *rq)
3421{
3422	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3423	struct cfq_data *cfqd = cfqq->cfqd;
3424	const int sync = rq_is_sync(rq);
3425	unsigned long now;
3426
3427	now = jiffies;
3428	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3429		     !!(rq->cmd_flags & REQ_NOIDLE));
3430
3431	cfq_update_hw_tag(cfqd);
3432
3433	WARN_ON(!cfqd->rq_in_driver);
3434	WARN_ON(!cfqq->dispatched);
3435	cfqd->rq_in_driver--;
3436	cfqq->dispatched--;
3437	(RQ_CFQG(rq))->dispatched--;
3438	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3439			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3440			rq_data_dir(rq), rq_is_sync(rq));
3441
3442	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3443
3444	if (sync) {
3445		RQ_CIC(rq)->last_end_request = now;
3446		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3447			cfqd->last_delayed_sync = now;
3448	}
3449
3450	/*
3451	 * If this is the active queue, check if it needs to be expired,
3452	 * or if we want to idle in case it has no pending requests.
3453	 */
3454	if (cfqd->active_queue == cfqq) {
3455		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3456
3457		if (cfq_cfqq_slice_new(cfqq)) {
3458			cfq_set_prio_slice(cfqd, cfqq);
3459			cfq_clear_cfqq_slice_new(cfqq);
3460		}
3461
3462		/*
3463		 * Should we wait for next request to come in before we expire
3464		 * the queue.
3465		 */
3466		if (cfq_should_wait_busy(cfqd, cfqq)) {
3467			unsigned long extend_sl = cfqd->cfq_slice_idle;
3468			if (!cfqd->cfq_slice_idle)
3469				extend_sl = cfqd->cfq_group_idle;
3470			cfqq->slice_end = jiffies + extend_sl;
3471			cfq_mark_cfqq_wait_busy(cfqq);
3472			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3473		}
3474
3475		/*
3476		 * Idling is not enabled on:
3477		 * - expired queues
3478		 * - idle-priority queues
3479		 * - async queues
3480		 * - queues with still some requests queued
3481		 * - when there is a close cooperator
3482		 */
3483		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3484			cfq_slice_expired(cfqd, 1);
3485		else if (sync && cfqq_empty &&
3486			 !cfq_close_cooperator(cfqd, cfqq)) {
3487			cfqd->noidle_tree_requires_idle |=
3488				!(rq->cmd_flags & REQ_NOIDLE);
3489			/*
3490			 * Idling is enabled for SYNC_WORKLOAD.
3491			 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3492			 * only if we processed at least one !REQ_NOIDLE request
3493			 */
3494			if (cfqd->serving_type == SYNC_WORKLOAD
3495			    || cfqd->noidle_tree_requires_idle
3496			    || cfqq->cfqg->nr_cfqq == 1)
3497				cfq_arm_slice_timer(cfqd);
3498		}
3499	}
3500
3501	if (!cfqd->rq_in_driver)
3502		cfq_schedule_dispatch(cfqd);
3503}
3504
3505/*
3506 * we temporarily boost lower priority queues if they are holding fs exclusive
3507 * resources. they are boosted to normal prio (CLASS_BE/4)
3508 */
3509static void cfq_prio_boost(struct cfq_queue *cfqq)
3510{
3511	if (has_fs_excl()) {
3512		/*
3513		 * boost idle prio on transactions that would lock out other
3514		 * users of the filesystem
3515		 */
3516		if (cfq_class_idle(cfqq))
3517			cfqq->ioprio_class = IOPRIO_CLASS_BE;
3518		if (cfqq->ioprio > IOPRIO_NORM)
3519			cfqq->ioprio = IOPRIO_NORM;
3520	} else {
3521		/*
3522		 * unboost the queue (if needed)
3523		 */
3524		cfqq->ioprio_class = cfqq->org_ioprio_class;
3525		cfqq->ioprio = cfqq->org_ioprio;
3526	}
3527}
3528
3529static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3530{
3531	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3532		cfq_mark_cfqq_must_alloc_slice(cfqq);
3533		return ELV_MQUEUE_MUST;
3534	}
3535
3536	return ELV_MQUEUE_MAY;
3537}
3538
3539static int cfq_may_queue(struct request_queue *q, int rw)
3540{
3541	struct cfq_data *cfqd = q->elevator->elevator_data;
3542	struct task_struct *tsk = current;
3543	struct cfq_io_context *cic;
3544	struct cfq_queue *cfqq;
3545
3546	/*
3547	 * don't force setup of a queue from here, as a call to may_queue
3548	 * does not necessarily imply that a request actually will be queued.
3549	 * so just lookup a possibly existing queue, or return 'may queue'
3550	 * if that fails
3551	 */
3552	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3553	if (!cic)
3554		return ELV_MQUEUE_MAY;
3555
3556	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3557	if (cfqq) {
3558		cfq_init_prio_data(cfqq, cic->ioc);
3559		cfq_prio_boost(cfqq);
3560
3561		return __cfq_may_queue(cfqq);
3562	}
3563
3564	return ELV_MQUEUE_MAY;
3565}
3566
3567/*
3568 * queue lock held here
3569 */
3570static void cfq_put_request(struct request *rq)
3571{
3572	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3573
3574	if (cfqq) {
3575		const int rw = rq_data_dir(rq);
3576
3577		BUG_ON(!cfqq->allocated[rw]);
3578		cfqq->allocated[rw]--;
3579
3580		put_io_context(RQ_CIC(rq)->ioc);
3581
3582		rq->elevator_private = NULL;
3583		rq->elevator_private2 = NULL;
3584
3585		/* Put down rq reference on cfqg */
3586		cfq_put_cfqg(RQ_CFQG(rq));
3587		rq->elevator_private3 = NULL;
3588
3589		cfq_put_queue(cfqq);
3590	}
3591}
3592
3593static struct cfq_queue *
3594cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3595		struct cfq_queue *cfqq)
3596{
3597	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3598	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3599	cfq_mark_cfqq_coop(cfqq->new_cfqq);
3600	cfq_put_queue(cfqq);
3601	return cic_to_cfqq(cic, 1);
3602}
3603
3604/*
3605 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3606 * was the last process referring to said cfqq.
3607 */
3608static struct cfq_queue *
3609split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3610{
3611	if (cfqq_process_refs(cfqq) == 1) {
3612		cfqq->pid = current->pid;
3613		cfq_clear_cfqq_coop(cfqq);
3614		cfq_clear_cfqq_split_coop(cfqq);
3615		return cfqq;
3616	}
3617
3618	cic_set_cfqq(cic, NULL, 1);
3619
3620	cfq_put_cooperator(cfqq);
3621
3622	cfq_put_queue(cfqq);
3623	return NULL;
3624}
3625/*
3626 * Allocate cfq data structures associated with this request.
3627 */
3628static int
3629cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3630{
3631	struct cfq_data *cfqd = q->elevator->elevator_data;
3632	struct cfq_io_context *cic;
3633	const int rw = rq_data_dir(rq);
3634	const bool is_sync = rq_is_sync(rq);
3635	struct cfq_queue *cfqq;
3636	unsigned long flags;
3637
3638	might_sleep_if(gfp_mask & __GFP_WAIT);
3639
3640	cic = cfq_get_io_context(cfqd, gfp_mask);
3641
3642	spin_lock_irqsave(q->queue_lock, flags);
3643
3644	if (!cic)
3645		goto queue_fail;
3646
3647new_queue:
3648	cfqq = cic_to_cfqq(cic, is_sync);
3649	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3650		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3651		cic_set_cfqq(cic, cfqq, is_sync);
3652	} else {
3653		/*
3654		 * If the queue was seeky for too long, break it apart.
3655		 */
3656		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3657			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3658			cfqq = split_cfqq(cic, cfqq);
3659			if (!cfqq)
3660				goto new_queue;
3661		}
3662
3663		/*
3664		 * Check to see if this queue is scheduled to merge with
3665		 * another, closely cooperating queue.  The merging of
3666		 * queues happens here as it must be done in process context.
3667		 * The reference on new_cfqq was taken in merge_cfqqs.
3668		 */
3669		if (cfqq->new_cfqq)
3670			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3671	}
3672
3673	cfqq->allocated[rw]++;
3674	atomic_inc(&cfqq->ref);
3675
3676	spin_unlock_irqrestore(q->queue_lock, flags);
3677
3678	rq->elevator_private = cic;
3679	rq->elevator_private2 = cfqq;
3680	rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3681	return 0;
3682
3683queue_fail:
3684	if (cic)
3685		put_io_context(cic->ioc);
3686
3687	cfq_schedule_dispatch(cfqd);
3688	spin_unlock_irqrestore(q->queue_lock, flags);
3689	cfq_log(cfqd, "set_request fail");
3690	return 1;
3691}
3692
3693static void cfq_kick_queue(struct work_struct *work)
3694{
3695	struct cfq_data *cfqd =
3696		container_of(work, struct cfq_data, unplug_work);
3697	struct request_queue *q = cfqd->queue;
3698
3699	spin_lock_irq(q->queue_lock);
3700	__blk_run_queue(cfqd->queue);
3701	spin_unlock_irq(q->queue_lock);
3702}
3703
3704/*
3705 * Timer running if the active_queue is currently idling inside its time slice
3706 */
3707static void cfq_idle_slice_timer(unsigned long data)
3708{
3709	struct cfq_data *cfqd = (struct cfq_data *) data;
3710	struct cfq_queue *cfqq;
3711	unsigned long flags;
3712	int timed_out = 1;
3713
3714	cfq_log(cfqd, "idle timer fired");
3715
3716	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3717
3718	cfqq = cfqd->active_queue;
3719	if (cfqq) {
3720		timed_out = 0;
3721
3722		/*
3723		 * We saw a request before the queue expired, let it through
3724		 */
3725		if (cfq_cfqq_must_dispatch(cfqq))
3726			goto out_kick;
3727
3728		/*
3729		 * expired
3730		 */
3731		if (cfq_slice_used(cfqq))
3732			goto expire;
3733
3734		/*
3735		 * only expire and reinvoke request handler, if there are
3736		 * other queues with pending requests
3737		 */
3738		if (!cfqd->busy_queues)
3739			goto out_cont;
3740
3741		/*
3742		 * not expired and it has a request pending, let it dispatch
3743		 */
3744		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3745			goto out_kick;
3746
3747		/*
3748		 * Queue depth flag is reset only when the idle didn't succeed
3749		 */
3750		cfq_clear_cfqq_deep(cfqq);
3751	}
3752expire:
3753	cfq_slice_expired(cfqd, timed_out);
3754out_kick:
3755	cfq_schedule_dispatch(cfqd);
3756out_cont:
3757	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3758}
3759
3760static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3761{
3762	del_timer_sync(&cfqd->idle_slice_timer);
3763	cancel_work_sync(&cfqd->unplug_work);
3764}
3765
3766static void cfq_put_async_queues(struct cfq_data *cfqd)
3767{
3768	int i;
3769
3770	for (i = 0; i < IOPRIO_BE_NR; i++) {
3771		if (cfqd->async_cfqq[0][i])
3772			cfq_put_queue(cfqd->async_cfqq[0][i]);
3773		if (cfqd->async_cfqq[1][i])
3774			cfq_put_queue(cfqd->async_cfqq[1][i]);
3775	}
3776
3777	if (cfqd->async_idle_cfqq)
3778		cfq_put_queue(cfqd->async_idle_cfqq);
3779}
3780
3781static void cfq_cfqd_free(struct rcu_head *head)
3782{
3783	kfree(container_of(head, struct cfq_data, rcu));
3784}
3785
3786static void cfq_exit_queue(struct elevator_queue *e)
3787{
3788	struct cfq_data *cfqd = e->elevator_data;
3789	struct request_queue *q = cfqd->queue;
3790
3791	cfq_shutdown_timer_wq(cfqd);
3792
3793	spin_lock_irq(q->queue_lock);
3794
3795	if (cfqd->active_queue)
3796		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3797
3798	while (!list_empty(&cfqd->cic_list)) {
3799		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3800							struct cfq_io_context,
3801							queue_list);
3802
3803		__cfq_exit_single_io_context(cfqd, cic);
3804	}
3805
3806	cfq_put_async_queues(cfqd);
3807	cfq_release_cfq_groups(cfqd);
3808	cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3809
3810	spin_unlock_irq(q->queue_lock);
3811
3812	cfq_shutdown_timer_wq(cfqd);
3813
3814	spin_lock(&cic_index_lock);
3815	ida_remove(&cic_index_ida, cfqd->cic_index);
3816	spin_unlock(&cic_index_lock);
3817
3818	/* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3819	call_rcu(&cfqd->rcu, cfq_cfqd_free);
3820}
3821
3822static int cfq_alloc_cic_index(void)
3823{
3824	int index, error;
3825
3826	do {
3827		if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3828			return -ENOMEM;
3829
3830		spin_lock(&cic_index_lock);
3831		error = ida_get_new(&cic_index_ida, &index);
3832		spin_unlock(&cic_index_lock);
3833		if (error && error != -EAGAIN)
3834			return error;
3835	} while (error);
3836
3837	return index;
3838}
3839
3840static void *cfq_init_queue(struct request_queue *q)
3841{
3842	struct cfq_data *cfqd;
3843	int i, j;
3844	struct cfq_group *cfqg;
3845	struct cfq_rb_root *st;
3846
3847	i = cfq_alloc_cic_index();
3848	if (i < 0)
3849		return NULL;
3850
3851	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3852	if (!cfqd)
3853		return NULL;
3854
3855	cfqd->cic_index = i;
3856
3857	/* Init root service tree */
3858	cfqd->grp_service_tree = CFQ_RB_ROOT;
3859
3860	/* Init root group */
3861	cfqg = &cfqd->root_group;
3862	for_each_cfqg_st(cfqg, i, j, st)
3863		*st = CFQ_RB_ROOT;
3864	RB_CLEAR_NODE(&cfqg->rb_node);
3865
3866	/* Give preference to root group over other groups */
3867	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3868
3869#ifdef CONFIG_CFQ_GROUP_IOSCHED
3870	/*
3871	 * Take a reference to root group which we never drop. This is just
3872	 * to make sure that cfq_put_cfqg() does not try to kfree root group
3873	 */
3874	atomic_set(&cfqg->ref, 1);
3875	rcu_read_lock();
3876	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3877					(void *)cfqd, 0);
3878	rcu_read_unlock();
3879#endif
3880	/*
3881	 * Not strictly needed (since RB_ROOT just clears the node and we
3882	 * zeroed cfqd on alloc), but better be safe in case someone decides
3883	 * to add magic to the rb code
3884	 */
3885	for (i = 0; i < CFQ_PRIO_LISTS; i++)
3886		cfqd->prio_trees[i] = RB_ROOT;
3887
3888	/*
3889	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3890	 * Grab a permanent reference to it, so that the normal code flow
3891	 * will not attempt to free it.
3892	 */
3893	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3894	atomic_inc(&cfqd->oom_cfqq.ref);
3895	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3896
3897	INIT_LIST_HEAD(&cfqd->cic_list);
3898
3899	cfqd->queue = q;
3900
3901	init_timer(&cfqd->idle_slice_timer);
3902	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3903	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3904
3905	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3906
3907	cfqd->cfq_quantum = cfq_quantum;
3908	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3909	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3910	cfqd->cfq_back_max = cfq_back_max;
3911	cfqd->cfq_back_penalty = cfq_back_penalty;
3912	cfqd->cfq_slice[0] = cfq_slice_async;
3913	cfqd->cfq_slice[1] = cfq_slice_sync;
3914	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3915	cfqd->cfq_slice_idle = cfq_slice_idle;
3916	cfqd->cfq_group_idle = cfq_group_idle;
3917	cfqd->cfq_latency = 1;
3918	cfqd->cfq_group_isolation = 0;
3919	cfqd->hw_tag = -1;
3920	/*
3921	 * we optimistically start assuming sync ops weren't delayed in last
3922	 * second, in order to have larger depth for async operations.
3923	 */
3924	cfqd->last_delayed_sync = jiffies - HZ;
3925	return cfqd;
3926}
3927
3928static void cfq_slab_kill(void)
3929{
3930	/*
3931	 * Caller already ensured that pending RCU callbacks are completed,
3932	 * so we should have no busy allocations at this point.
3933	 */
3934	if (cfq_pool)
3935		kmem_cache_destroy(cfq_pool);
3936	if (cfq_ioc_pool)
3937		kmem_cache_destroy(cfq_ioc_pool);
3938}
3939
3940static int __init cfq_slab_setup(void)
3941{
3942	cfq_pool = KMEM_CACHE(cfq_queue, 0);
3943	if (!cfq_pool)
3944		goto fail;
3945
3946	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3947	if (!cfq_ioc_pool)
3948		goto fail;
3949
3950	return 0;
3951fail:
3952	cfq_slab_kill();
3953	return -ENOMEM;
3954}
3955
3956/*
3957 * sysfs parts below -->
3958 */
3959static ssize_t
3960cfq_var_show(unsigned int var, char *page)
3961{
3962	return sprintf(page, "%d\n", var);
3963}
3964
3965static ssize_t
3966cfq_var_store(unsigned int *var, const char *page, size_t count)
3967{
3968	char *p = (char *) page;
3969
3970	*var = simple_strtoul(p, &p, 10);
3971	return count;
3972}
3973
3974#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
3975static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
3976{									\
3977	struct cfq_data *cfqd = e->elevator_data;			\
3978	unsigned int __data = __VAR;					\
3979	if (__CONV)							\
3980		__data = jiffies_to_msecs(__data);			\
3981	return cfq_var_show(__data, (page));				\
3982}
3983SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3984SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3985SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3986SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3987SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3988SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3989SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
3990SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3991SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3992SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3993SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3994SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
3995#undef SHOW_FUNCTION
3996
3997#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
3998static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
3999{									\
4000	struct cfq_data *cfqd = e->elevator_data;			\
4001	unsigned int __data;						\
4002	int ret = cfq_var_store(&__data, (page), count);		\
4003	if (__data < (MIN))						\
4004		__data = (MIN);						\
4005	else if (__data > (MAX))					\
4006		__data = (MAX);						\
4007	if (__CONV)							\
4008		*(__PTR) = msecs_to_jiffies(__data);			\
4009	else								\
4010		*(__PTR) = __data;					\
4011	return ret;							\
4012}
4013STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4014STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4015		UINT_MAX, 1);
4016STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4017		UINT_MAX, 1);
4018STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4019STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4020		UINT_MAX, 0);
4021STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4022STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4023STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4024STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4025STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4026		UINT_MAX, 0);
4027STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4028STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
4029#undef STORE_FUNCTION
4030
4031#define CFQ_ATTR(name) \
4032	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4033
4034static struct elv_fs_entry cfq_attrs[] = {
4035	CFQ_ATTR(quantum),
4036	CFQ_ATTR(fifo_expire_sync),
4037	CFQ_ATTR(fifo_expire_async),
4038	CFQ_ATTR(back_seek_max),
4039	CFQ_ATTR(back_seek_penalty),
4040	CFQ_ATTR(slice_sync),
4041	CFQ_ATTR(slice_async),
4042	CFQ_ATTR(slice_async_rq),
4043	CFQ_ATTR(slice_idle),
4044	CFQ_ATTR(group_idle),
4045	CFQ_ATTR(low_latency),
4046	CFQ_ATTR(group_isolation),
4047	__ATTR_NULL
4048};
4049
4050static struct elevator_type iosched_cfq = {
4051	.ops = {
4052		.elevator_merge_fn = 		cfq_merge,
4053		.elevator_merged_fn =		cfq_merged_request,
4054		.elevator_merge_req_fn =	cfq_merged_requests,
4055		.elevator_allow_merge_fn =	cfq_allow_merge,
4056		.elevator_bio_merged_fn =	cfq_bio_merged,
4057		.elevator_dispatch_fn =		cfq_dispatch_requests,
4058		.elevator_add_req_fn =		cfq_insert_request,
4059		.elevator_activate_req_fn =	cfq_activate_request,
4060		.elevator_deactivate_req_fn =	cfq_deactivate_request,
4061		.elevator_queue_empty_fn =	cfq_queue_empty,
4062		.elevator_completed_req_fn =	cfq_completed_request,
4063		.elevator_former_req_fn =	elv_rb_former_request,
4064		.elevator_latter_req_fn =	elv_rb_latter_request,
4065		.elevator_set_req_fn =		cfq_set_request,
4066		.elevator_put_req_fn =		cfq_put_request,
4067		.elevator_may_queue_fn =	cfq_may_queue,
4068		.elevator_init_fn =		cfq_init_queue,
4069		.elevator_exit_fn =		cfq_exit_queue,
4070		.trim =				cfq_free_io_context,
4071	},
4072	.elevator_attrs =	cfq_attrs,
4073	.elevator_name =	"cfq",
4074	.elevator_owner =	THIS_MODULE,
4075};
4076
4077#ifdef CONFIG_CFQ_GROUP_IOSCHED
4078static struct blkio_policy_type blkio_policy_cfq = {
4079	.ops = {
4080		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
4081		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
4082	},
4083};
4084#else
4085static struct blkio_policy_type blkio_policy_cfq;
4086#endif
4087
4088static int __init cfq_init(void)
4089{
4090	/*
4091	 * could be 0 on HZ < 1000 setups
4092	 */
4093	if (!cfq_slice_async)
4094		cfq_slice_async = 1;
4095	if (!cfq_slice_idle)
4096		cfq_slice_idle = 1;
4097
4098#ifdef CONFIG_CFQ_GROUP_IOSCHED
4099	if (!cfq_group_idle)
4100		cfq_group_idle = 1;
4101#else
4102		cfq_group_idle = 0;
4103#endif
4104	if (cfq_slab_setup())
4105		return -ENOMEM;
4106
4107	elv_register(&iosched_cfq);
4108	blkio_policy_register(&blkio_policy_cfq);
4109
4110	return 0;
4111}
4112
4113static void __exit cfq_exit(void)
4114{
4115	DECLARE_COMPLETION_ONSTACK(all_gone);
4116	blkio_policy_unregister(&blkio_policy_cfq);
4117	elv_unregister(&iosched_cfq);
4118	ioc_gone = &all_gone;
4119	/* ioc_gone's update must be visible before reading ioc_count */
4120	smp_wmb();
4121
4122	/*
4123	 * this also protects us from entering cfq_slab_kill() with
4124	 * pending RCU callbacks
4125	 */
4126	if (elv_ioc_count_read(cfq_ioc_count))
4127		wait_for_completion(&all_gone);
4128	ida_destroy(&cic_index_ida);
4129	cfq_slab_kill();
4130}
4131
4132module_init(cfq_init);
4133module_exit(cfq_exit);
4134
4135MODULE_AUTHOR("Jens Axboe");
4136MODULE_LICENSE("GPL");
4137MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4138