cfq-iosched.c revision 41cad6ab2cb9ccb3b11546ad56b8b285e47c6279
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/jiffies.h>
14#include <linux/rbtree.h>
15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h>
17#include "blk.h"
18#include "blk-cgroup.h"
19
20/*
21 * tunables
22 */
23/* max queue in one round of service */
24static const int cfq_quantum = 8;
25static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
26/* maximum backwards seek, in KiB */
27static const int cfq_back_max = 16 * 1024;
28/* penalty of a backwards seek */
29static const int cfq_back_penalty = 2;
30static const int cfq_slice_sync = HZ / 10;
31static int cfq_slice_async = HZ / 25;
32static const int cfq_slice_async_rq = 2;
33static int cfq_slice_idle = HZ / 125;
34static int cfq_group_idle = HZ / 125;
35static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
36static const int cfq_hist_divisor = 4;
37
38/*
39 * offset from end of service tree
40 */
41#define CFQ_IDLE_DELAY		(HZ / 5)
42
43/*
44 * below this threshold, we consider thinktime immediate
45 */
46#define CFQ_MIN_TT		(2)
47
48#define CFQ_SLICE_SCALE		(5)
49#define CFQ_HW_QUEUE_MIN	(5)
50#define CFQ_SERVICE_SHIFT       12
51
52#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
53#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
54#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
55#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
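/*
 * Note on the thresholds above (all values are in 512-byte sectors):
 * seek_history is a 32-bit sliding window with one bit per recent request,
 * where a set bit marks a request that landed far from the previous one
 * (see the seek tracking further down in this file).  CFQQ_SEEKY() thus
 * flags a queue as seeky once more than 32/8 = 4 of its last 32 requests
 * were distant ones.
 */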
56
57#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
58#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
59#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
60
61static struct kmem_cache *cfq_pool;
62
63#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
64#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
65#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
66
67#define sample_valid(samples)	((samples) > 80)
68#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
69
70struct cfq_ttime {
71	unsigned long last_end_request;
72
73	unsigned long ttime_total;
74	unsigned long ttime_samples;
75	unsigned long ttime_mean;
76};
77
78/*
79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree
81 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
82 * move this into the elevator for the rq sorting as well.
83 */
84struct cfq_rb_root {
85	struct rb_root rb;
86	struct rb_node *left;
87	unsigned count;
88	u64 min_vdisktime;
89	struct cfq_ttime ttime;
90};
91#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
92			.ttime = {.last_end_request = jiffies,},}
93
94/*
95 * Per process-grouping structure
96 */
97struct cfq_queue {
98	/* reference count */
99	int ref;
100	/* various state flags, see below */
101	unsigned int flags;
102	/* parent cfq_data */
103	struct cfq_data *cfqd;
104	/* service_tree member */
105	struct rb_node rb_node;
106	/* service_tree key */
107	unsigned long rb_key;
108	/* prio tree member */
109	struct rb_node p_node;
110	/* prio tree root we belong to, if any */
111	struct rb_root *p_root;
112	/* sorted list of pending requests */
113	struct rb_root sort_list;
114	/* if fifo isn't expired, next request to serve */
115	struct request *next_rq;
116	/* requests queued in sort_list */
117	int queued[2];
118	/* currently allocated requests */
119	int allocated[2];
120	/* fifo list of requests in sort_list */
121	struct list_head fifo;
122
123	/* time when queue got scheduled in to dispatch first request. */
124	unsigned long dispatch_start;
125	unsigned int allocated_slice;
126	unsigned int slice_dispatch;
127	/* time when first request from queue completed and slice started. */
128	unsigned long slice_start;
129	unsigned long slice_end;
130	long slice_resid;
131
132	/* pending priority requests */
133	int prio_pending;
134	/* number of requests that are on the dispatch list or inside driver */
135	int dispatched;
136
137	/* io prio of this group */
138	unsigned short ioprio, org_ioprio;
139	unsigned short ioprio_class;
140
141	pid_t pid;
142
143	u32 seek_history;
144	sector_t last_request_pos;
145
146	struct cfq_rb_root *service_tree;
147	struct cfq_queue *new_cfqq;
148	struct cfq_group *cfqg;
149	/* Number of sectors dispatched from queue in single dispatch round */
150	unsigned long nr_sectors;
151};
152
153/*
154 * First index in the service_trees.
155 * IDLE is handled separately, so it has negative index
156 */
157enum wl_class_t {
158	BE_WORKLOAD = 0,
159	RT_WORKLOAD = 1,
160	IDLE_WORKLOAD = 2,
161	CFQ_PRIO_NR,
162};
163
164/*
165 * Second index in the service_trees.
166 */
167enum wl_type_t {
168	ASYNC_WORKLOAD = 0,
169	SYNC_NOIDLE_WORKLOAD = 1,
170	SYNC_WORKLOAD = 2
171};
172
173struct cfqg_stats {
174#ifdef CONFIG_CFQ_GROUP_IOSCHED
175	/* total bytes transferred */
176	struct blkg_rwstat		service_bytes;
177	/* total IOs serviced, post merge */
178	struct blkg_rwstat		serviced;
179	/* number of ios merged */
180	struct blkg_rwstat		merged;
181	/* total time spent on device in ns, may not be accurate w/ queueing */
182	struct blkg_rwstat		service_time;
183	/* total time spent waiting in scheduler queue in ns */
184	struct blkg_rwstat		wait_time;
185	/* number of IOs queued up */
186	struct blkg_rwstat		queued;
187	/* total sectors transferred */
188	struct blkg_stat		sectors;
189	/* total disk time and nr sectors dispatched by this group */
190	struct blkg_stat		time;
191#ifdef CONFIG_DEBUG_BLK_CGROUP
192	/* time not charged to this cgroup */
193	struct blkg_stat		unaccounted_time;
194	/* sum of number of ios queued across all samples */
195	struct blkg_stat		avg_queue_size_sum;
196	/* count of samples taken for average */
197	struct blkg_stat		avg_queue_size_samples;
198	/* how many times this group has been removed from service tree */
199	struct blkg_stat		dequeue;
200	/* total time spent waiting for it to be assigned a timeslice. */
201	struct blkg_stat		group_wait_time;
202	/* time spent idling for this blkcg_gq */
203	struct blkg_stat		idle_time;
204	/* total time with empty current active q with other requests queued */
205	struct blkg_stat		empty_time;
206	/* fields after this shouldn't be cleared on stat reset */
207	uint64_t			start_group_wait_time;
208	uint64_t			start_idle_time;
209	uint64_t			start_empty_time;
210	uint16_t			flags;
211#endif	/* CONFIG_DEBUG_BLK_CGROUP */
212#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
213};
214
215/* This is per cgroup per device grouping structure */
216struct cfq_group {
217	/* must be the first member */
218	struct blkg_policy_data pd;
219
220	/* group service_tree member */
221	struct rb_node rb_node;
222
223	/* group service_tree key */
224	u64 vdisktime;
225
226	/*
227	 * The number of active cfqgs and sum of their weights under this
228	 * cfqg.  This covers this cfqg's leaf_weight and all children's
229	 * weights, but does not cover weights of further descendants.
230	 *
231	 * If a cfqg is on the service tree, it's active.  An active cfqg
232	 * also activates its parent and contributes to the children_weight
233	 * of the parent.
234	 */
235	int nr_active;
236	unsigned int children_weight;
237
238	/*
239	 * vfraction is the fraction of vdisktime that the tasks in this
240	 * cfqg are entitled to.  This is determined by compounding the
241	 * ratios walking up from this cfqg to the root.
242	 *
243	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
244	 * vfractions on a service tree is approximately 1.  The sum may
245	 * deviate a bit due to rounding errors and fluctuations caused by
246	 * cfqgs entering and leaving the service tree.
247	 */
248	unsigned int vfraction;
249
250	/*
251	 * There are two weights - (internal) weight is the weight of this
252	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
253	 * this cfqg against the child cfqgs.  For the root cfqg, both
254	 * weights are kept in sync for backward compatibility.
255	 */
256	unsigned int weight;
257	unsigned int new_weight;
258	unsigned int dev_weight;
259
260	unsigned int leaf_weight;
261	unsigned int new_leaf_weight;
262	unsigned int dev_leaf_weight;
263
264	/* number of cfqq currently on this group */
265	int nr_cfqq;
266
267	/*
268	 * Per group busy queues average. Useful for workload slice calc. We
269	 * create the array for each prio class but at run time it is used
270	 * only for the RT and BE classes; the slot for the IDLE class remains
271	 * unused. This is primarily done to avoid confusion and a gcc warning.
272	 */
273	unsigned int busy_queues_avg[CFQ_PRIO_NR];
274	/*
275	 * rr lists of queues with requests. We maintain service trees for
276	 * RT and BE classes. These trees are subdivided in subclasses
277	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
278	 * class there is no subclassification and all the cfq queues go on
279	 * a single tree service_tree_idle.
280	 * Counts are embedded in the cfq_rb_root
281	 */
282	struct cfq_rb_root service_trees[2][3];
283	struct cfq_rb_root service_tree_idle;
284
285	unsigned long saved_wl_slice;
286	enum wl_type_t saved_wl_type;
287	enum wl_class_t saved_wl_class;
288
289	/* number of requests that are on the dispatch list or inside driver */
290	int dispatched;
291	struct cfq_ttime ttime;
292	struct cfqg_stats stats;
293};
294
295struct cfq_io_cq {
296	struct io_cq		icq;		/* must be the first member */
297	struct cfq_queue	*cfqq[2];
298	struct cfq_ttime	ttime;
299	int			ioprio;		/* the current ioprio */
300#ifdef CONFIG_CFQ_GROUP_IOSCHED
301	uint64_t		blkcg_id;	/* the current blkcg ID */
302#endif
303};
304
305/*
306 * Per block device queue structure
307 */
308struct cfq_data {
309	struct request_queue *queue;
310	/* Root service tree for cfq_groups */
311	struct cfq_rb_root grp_service_tree;
312	struct cfq_group *root_group;
313
314	/*
315	 * The priority currently being served
316	 */
317	enum wl_class_t serving_wl_class;
318	enum wl_type_t serving_wl_type;
319	unsigned long workload_expires;
320	struct cfq_group *serving_group;
321
322	/*
323	 * Each priority tree is sorted by next_request position.  These
324	 * trees are used when determining if two or more queues are
325	 * interleaving requests (see cfq_close_cooperator).
326	 */
327	struct rb_root prio_trees[CFQ_PRIO_LISTS];
328
329	unsigned int busy_queues;
330	unsigned int busy_sync_queues;
331
332	int rq_in_driver;
333	int rq_in_flight[2];
334
335	/*
336	 * queue-depth detection
337	 */
338	int rq_queued;
339	int hw_tag;
340	/*
341	 * hw_tag can be
342	 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
343	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
344	 *  0 => no NCQ
345	 */
346	int hw_tag_est_depth;
347	unsigned int hw_tag_samples;
348
349	/*
350	 * idle window management
351	 */
352	struct timer_list idle_slice_timer;
353	struct work_struct unplug_work;
354
355	struct cfq_queue *active_queue;
356	struct cfq_io_cq *active_cic;
357
358	/*
359	 * async queue for each priority case
360	 */
361	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
362	struct cfq_queue *async_idle_cfqq;
363
364	sector_t last_position;
365
366	/*
367	 * tunables, see top of file
368	 */
369	unsigned int cfq_quantum;
370	unsigned int cfq_fifo_expire[2];
371	unsigned int cfq_back_penalty;
372	unsigned int cfq_back_max;
373	unsigned int cfq_slice[2];
374	unsigned int cfq_slice_async_rq;
375	unsigned int cfq_slice_idle;
376	unsigned int cfq_group_idle;
377	unsigned int cfq_latency;
378	unsigned int cfq_target_latency;
379
380	/*
381	 * Fallback dummy cfqq for extreme OOM conditions
382	 */
383	struct cfq_queue oom_cfqq;
384
385	unsigned long last_delayed_sync;
386};
387
388static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
389
390static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
391					    enum wl_class_t class,
392					    enum wl_type_t type)
393{
394	if (!cfqg)
395		return NULL;
396
397	if (class == IDLE_WORKLOAD)
398		return &cfqg->service_tree_idle;
399
400	return &cfqg->service_trees[class][type];
401}
402
403enum cfqq_state_flags {
404	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
405	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
406	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
407	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
408	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
409	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
410	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
411	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
412	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
413	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
414	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
415	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
416	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
417};
418
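/*
 * Each CFQ_CFQQ_FNS(name) invocation below expands to three tiny helpers:
 * cfq_mark_cfqq_<name>() sets the corresponding bit in cfqq->flags,
 * cfq_clear_cfqq_<name>() clears it and cfq_cfqq_<name>() tests it.  For
 * example, CFQ_CFQQ_FNS(on_rr) yields cfq_mark_cfqq_on_rr(),
 * cfq_clear_cfqq_on_rr() and cfq_cfqq_on_rr().
 */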
419#define CFQ_CFQQ_FNS(name)						\
420static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
421{									\
422	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
423}									\
424static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
425{									\
426	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
427}									\
428static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
429{									\
430	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
431}
432
433CFQ_CFQQ_FNS(on_rr);
434CFQ_CFQQ_FNS(wait_request);
435CFQ_CFQQ_FNS(must_dispatch);
436CFQ_CFQQ_FNS(must_alloc_slice);
437CFQ_CFQQ_FNS(fifo_expire);
438CFQ_CFQQ_FNS(idle_window);
439CFQ_CFQQ_FNS(prio_changed);
440CFQ_CFQQ_FNS(slice_new);
441CFQ_CFQQ_FNS(sync);
442CFQ_CFQQ_FNS(coop);
443CFQ_CFQQ_FNS(split_coop);
444CFQ_CFQQ_FNS(deep);
445CFQ_CFQQ_FNS(wait_busy);
446#undef CFQ_CFQQ_FNS
447
448static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
449{
450	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
451}
452
453static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
454{
455	return pd_to_blkg(&cfqg->pd);
456}
457
458#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
459
460/* cfqg stats flags */
461enum cfqg_stats_flags {
462	CFQG_stats_waiting = 0,
463	CFQG_stats_idling,
464	CFQG_stats_empty,
465};
466
467#define CFQG_FLAG_FNS(name)						\
468static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
469{									\
470	stats->flags |= (1 << CFQG_stats_##name);			\
471}									\
472static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
473{									\
474	stats->flags &= ~(1 << CFQG_stats_##name);			\
475}									\
476static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
477{									\
478	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
479}									\
480
481CFQG_FLAG_FNS(waiting)
482CFQG_FLAG_FNS(idling)
483CFQG_FLAG_FNS(empty)
484#undef CFQG_FLAG_FNS
485
486/* This should be called with the queue_lock held. */
487static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
488{
489	unsigned long long now;
490
491	if (!cfqg_stats_waiting(stats))
492		return;
493
494	now = sched_clock();
495	if (time_after64(now, stats->start_group_wait_time))
496		blkg_stat_add(&stats->group_wait_time,
497			      now - stats->start_group_wait_time);
498	cfqg_stats_clear_waiting(stats);
499}
500
501/* This should be called with the queue_lock held. */
502static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
503						 struct cfq_group *curr_cfqg)
504{
505	struct cfqg_stats *stats = &cfqg->stats;
506
507	if (cfqg_stats_waiting(stats))
508		return;
509	if (cfqg == curr_cfqg)
510		return;
511	stats->start_group_wait_time = sched_clock();
512	cfqg_stats_mark_waiting(stats);
513}
514
515/* This should be called with the queue_lock held. */
516static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
517{
518	unsigned long long now;
519
520	if (!cfqg_stats_empty(stats))
521		return;
522
523	now = sched_clock();
524	if (time_after64(now, stats->start_empty_time))
525		blkg_stat_add(&stats->empty_time,
526			      now - stats->start_empty_time);
527	cfqg_stats_clear_empty(stats);
528}
529
530static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
531{
532	blkg_stat_add(&cfqg->stats.dequeue, 1);
533}
534
535static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
536{
537	struct cfqg_stats *stats = &cfqg->stats;
538
539	if (blkg_rwstat_sum(&stats->queued))
540		return;
541
542	/*
543	 * The group is already marked empty. This can happen if a cfqq got a new
544	 * request in the parent group and moved to this group while being added
545	 * to the service tree. Just ignore the event and move on.
546	 */
547	if (cfqg_stats_empty(stats))
548		return;
549
550	stats->start_empty_time = sched_clock();
551	cfqg_stats_mark_empty(stats);
552}
553
554static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
555{
556	struct cfqg_stats *stats = &cfqg->stats;
557
558	if (cfqg_stats_idling(stats)) {
559		unsigned long long now = sched_clock();
560
561		if (time_after64(now, stats->start_idle_time))
562			blkg_stat_add(&stats->idle_time,
563				      now - stats->start_idle_time);
564		cfqg_stats_clear_idling(stats);
565	}
566}
567
568static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
569{
570	struct cfqg_stats *stats = &cfqg->stats;
571
572	BUG_ON(cfqg_stats_idling(stats));
573
574	stats->start_idle_time = sched_clock();
575	cfqg_stats_mark_idling(stats);
576}
577
578static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
579{
580	struct cfqg_stats *stats = &cfqg->stats;
581
582	blkg_stat_add(&stats->avg_queue_size_sum,
583		      blkg_rwstat_sum(&stats->queued));
584	blkg_stat_add(&stats->avg_queue_size_samples, 1);
585	cfqg_stats_update_group_wait_time(stats);
586}
587
588#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
589
590static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
591static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
592static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
593static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
594static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
595static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
596static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
597
598#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
599
600#ifdef CONFIG_CFQ_GROUP_IOSCHED
601
602static struct blkcg_policy blkcg_policy_cfq;
603
604static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
605{
606	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
607}
608
609/*
610 * Determine the parent cfqg for weight calculation.  Currently, cfqg
611 * scheduling is flat and the root is the parent of everyone else.
612 */
613static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg)
614{
615	struct blkcg_gq *blkg = cfqg_to_blkg(cfqg);
616	struct cfq_group *root;
617
618	while (blkg->parent)
619		blkg = blkg->parent;
620	root = blkg_to_cfqg(blkg);
621
622	return root != cfqg ? root : NULL;
623}
624
625static inline void cfqg_get(struct cfq_group *cfqg)
626{
627	return blkg_get(cfqg_to_blkg(cfqg));
628}
629
630static inline void cfqg_put(struct cfq_group *cfqg)
631{
632	return blkg_put(cfqg_to_blkg(cfqg));
633}
634
635#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
636	char __pbuf[128];						\
637									\
638	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
639	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
640			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
641			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
642			  __pbuf, ##args);				\
643} while (0)
644
645#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
646	char __pbuf[128];						\
647									\
648	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
649	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
650} while (0)
651
652static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
653					    struct cfq_group *curr_cfqg, int rw)
654{
655	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
656	cfqg_stats_end_empty_time(&cfqg->stats);
657	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
658}
659
660static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
661			unsigned long time, unsigned long unaccounted_time)
662{
663	blkg_stat_add(&cfqg->stats.time, time);
664#ifdef CONFIG_DEBUG_BLK_CGROUP
665	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
666#endif
667}
668
669static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
670{
671	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
672}
673
674static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
675{
676	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
677}
678
679static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
680					      uint64_t bytes, int rw)
681{
682	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
683	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
684	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
685}
686
687static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
688			uint64_t start_time, uint64_t io_start_time, int rw)
689{
690	struct cfqg_stats *stats = &cfqg->stats;
691	unsigned long long now = sched_clock();
692
693	if (time_after64(now, io_start_time))
694		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
695	if (time_after64(io_start_time, start_time))
696		blkg_rwstat_add(&stats->wait_time, rw,
697				io_start_time - start_time);
698}
699
700static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
701{
702	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
703	struct cfqg_stats *stats = &cfqg->stats;
704
705	/* queued stats shouldn't be cleared */
706	blkg_rwstat_reset(&stats->service_bytes);
707	blkg_rwstat_reset(&stats->serviced);
708	blkg_rwstat_reset(&stats->merged);
709	blkg_rwstat_reset(&stats->service_time);
710	blkg_rwstat_reset(&stats->wait_time);
711	blkg_stat_reset(&stats->time);
712#ifdef CONFIG_DEBUG_BLK_CGROUP
713	blkg_stat_reset(&stats->unaccounted_time);
714	blkg_stat_reset(&stats->avg_queue_size_sum);
715	blkg_stat_reset(&stats->avg_queue_size_samples);
716	blkg_stat_reset(&stats->dequeue);
717	blkg_stat_reset(&stats->group_wait_time);
718	blkg_stat_reset(&stats->idle_time);
719	blkg_stat_reset(&stats->empty_time);
720#endif
721}
722
723#else	/* CONFIG_CFQ_GROUP_IOSCHED */
724
725static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; }
726static inline void cfqg_get(struct cfq_group *cfqg) { }
727static inline void cfqg_put(struct cfq_group *cfqg) { }
728
729#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
730	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
731			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
732			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
733				##args)
734#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
735
736static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
737			struct cfq_group *curr_cfqg, int rw) { }
738static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
739			unsigned long time, unsigned long unaccounted_time) { }
740static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
741static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
742static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
743					      uint64_t bytes, int rw) { }
744static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
745			uint64_t start_time, uint64_t io_start_time, int rw) { }
746
747#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
748
749#define cfq_log(cfqd, fmt, args...)	\
750	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
751
752/* Traverses through cfq group service trees */
753#define for_each_cfqg_st(cfqg, i, j, st) \
754	for (i = 0; i <= IDLE_WORKLOAD; i++) \
755		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
756			: &cfqg->service_tree_idle; \
757			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
758			(i == IDLE_WORKLOAD && j == 0); \
759			j++, st = i < IDLE_WORKLOAD ? \
760			&cfqg->service_trees[i][j]: NULL) \
761
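/*
 * for_each_cfqg_st() visits all seven service trees of a group: the RT and
 * BE classes each with their ASYNC, SYNC_NOIDLE and SYNC subtrees (2 x 3),
 * plus the single service_tree_idle used for the IDLE class.
 */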
762static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
763	struct cfq_ttime *ttime, bool group_idle)
764{
765	unsigned long slice;
766	if (!sample_valid(ttime->ttime_samples))
767		return false;
768	if (group_idle)
769		slice = cfqd->cfq_group_idle;
770	else
771		slice = cfqd->cfq_slice_idle;
772	return ttime->ttime_mean > slice;
773}
774
775static inline bool iops_mode(struct cfq_data *cfqd)
776{
777	/*
778	 * If we are not idling on queues and it is an NCQ drive, requests
779	 * execute in parallel and measuring time is not possible in most
780	 * cases unless we drive shallower queue depths, which then becomes
781	 * a performance bottleneck. In such cases switch to providing
782	 * fairness in terms of number of IOs instead.
783	 */
784	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
785		return true;
786	else
787		return false;
788}
789
790static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
791{
792	if (cfq_class_idle(cfqq))
793		return IDLE_WORKLOAD;
794	if (cfq_class_rt(cfqq))
795		return RT_WORKLOAD;
796	return BE_WORKLOAD;
797}
798
799
800static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
801{
802	if (!cfq_cfqq_sync(cfqq))
803		return ASYNC_WORKLOAD;
804	if (!cfq_cfqq_idle_window(cfqq))
805		return SYNC_NOIDLE_WORKLOAD;
806	return SYNC_WORKLOAD;
807}
808
809static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
810					struct cfq_data *cfqd,
811					struct cfq_group *cfqg)
812{
813	if (wl_class == IDLE_WORKLOAD)
814		return cfqg->service_tree_idle.count;
815
816	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
817		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
818		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
819}
820
821static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
822					struct cfq_group *cfqg)
823{
824	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
825		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
826}
827
828static void cfq_dispatch_insert(struct request_queue *, struct request *);
829static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
830				       struct cfq_io_cq *cic, struct bio *bio,
831				       gfp_t gfp_mask);
832
833static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
834{
835	/* cic->icq is the first member, %NULL will convert to %NULL */
836	return container_of(icq, struct cfq_io_cq, icq);
837}
838
839static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
840					       struct io_context *ioc)
841{
842	if (ioc)
843		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
844	return NULL;
845}
846
847static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
848{
849	return cic->cfqq[is_sync];
850}
851
852static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
853				bool is_sync)
854{
855	cic->cfqq[is_sync] = cfqq;
856}
857
858static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
859{
860	return cic->icq.q->elevator->elevator_data;
861}
862
863/*
864 * We regard a request as SYNC if it's either a read or has the SYNC bit
865 * set (in which case it could also be a direct WRITE).
866 */
867static inline bool cfq_bio_sync(struct bio *bio)
868{
869	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
870}
871
872/*
873 * scheduler run of queue, if there are requests pending and no one in the
874 * driver that will restart queueing
875 */
876static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
877{
878	if (cfqd->busy_queues) {
879		cfq_log(cfqd, "schedule dispatch");
880		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
881	}
882}
883
884/*
885 * Scale schedule slice based on io priority. Use the sync time slice only
886 * if a queue is marked sync and has sync io queued. A sync queue with async
887 * io only should not get the full sync slice length.
888 */
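/*
 * For illustration, with the default sync slice of HZ/10 (100ms) and
 * CFQ_SLICE_SCALE = 5: ioprio 0 gets 100 + 20 * (4 - 0) = 180ms, ioprio 4
 * gets the base 100ms, and ioprio 7 gets 100 + 20 * (4 - 7) = 40ms.
 */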
889static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
890				 unsigned short prio)
891{
892	const int base_slice = cfqd->cfq_slice[sync];
893
894	WARN_ON(prio >= IOPRIO_BE_NR);
895
896	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
897}
898
899static inline int
900cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
901{
902	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
903}
904
905/**
906 * cfqg_scale_charge - scale disk time charge according to cfqg weight
907 * @charge: disk time being charged
908 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
909 *
910 * Scale @charge according to @vfraction, which is in range (0, 1].  The
911 * scaling is inversely proportional.
912 *
913 * scaled = charge / vfraction
914 *
915 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
916 */
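/*
 * For example, a group entitled to the whole device (vfraction ==
 * 1 << CFQ_SERVICE_SHIFT) is charged exactly @charge converted to fixed
 * point, while a group entitled to half of the device is charged twice as
 * much vdisktime for the same amount of service.
 */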
917static inline u64 cfqg_scale_charge(unsigned long charge,
918				    unsigned int vfraction)
919{
920	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */
921
922	/* charge / vfraction */
923	c <<= CFQ_SERVICE_SHIFT;
924	do_div(c, vfraction);
925	return c;
926}
927
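/*
 * max_vdisktime()/min_vdisktime() compare via a signed delta so that the
 * result stays correct even if the u64 vdisktime values eventually wrap,
 * the same trick time_after() uses for jiffies.
 */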
928static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
929{
930	s64 delta = (s64)(vdisktime - min_vdisktime);
931	if (delta > 0)
932		min_vdisktime = vdisktime;
933
934	return min_vdisktime;
935}
936
937static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
938{
939	s64 delta = (s64)(vdisktime - min_vdisktime);
940	if (delta < 0)
941		min_vdisktime = vdisktime;
942
943	return min_vdisktime;
944}
945
946static void update_min_vdisktime(struct cfq_rb_root *st)
947{
948	struct cfq_group *cfqg;
949
950	if (st->left) {
951		cfqg = rb_entry_cfqg(st->left);
952		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
953						  cfqg->vdisktime);
954	}
955}
956
957/*
958 * get averaged number of queues of RT/BE priority.
959 * The average is updated with a formula that gives more weight to higher numbers,
960 * so it quickly follows sudden increases and decreases slowly.
961 */
962
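/*
 * With cfq_hist_divisor = 4 the update below is
 * avg = (3 * max(avg, busy) + min(avg, busy) + 2) / 4, so an average of 1
 * jumps to (3 * 5 + 1 + 2) / 4 = 4 as soon as five queues become busy, but
 * decays by only roughly a quarter per update once they go away.
 */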
963static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
964					struct cfq_group *cfqg, bool rt)
965{
966	unsigned min_q, max_q;
967	unsigned mult  = cfq_hist_divisor - 1;
968	unsigned round = cfq_hist_divisor / 2;
969	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
970
971	min_q = min(cfqg->busy_queues_avg[rt], busy);
972	max_q = max(cfqg->busy_queues_avg[rt], busy);
973	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
974		cfq_hist_divisor;
975	return cfqg->busy_queues_avg[rt];
976}
977
978static inline unsigned
979cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
980{
981	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
982}
983
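/*
 * Example of the low_latency scaling below: with 100ms sync slices, five
 * queues of the same class in a group would want 500ms in total.  If the
 * group's share of cfq_target_latency is only 300ms, each slice is scaled
 * down to roughly 100 * 300 / 500 = 60ms, but never below the low_slice
 * floor computed from cfq_slice_idle.
 */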
984static inline unsigned
985cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
986{
987	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
988	if (cfqd->cfq_latency) {
989		/*
990		 * interested queues (we consider only the ones with the same
991		 * priority class in the cfq group)
992		 */
993		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
994						cfq_class_rt(cfqq));
995		unsigned sync_slice = cfqd->cfq_slice[1];
996		unsigned expect_latency = sync_slice * iq;
997		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
998
999		if (expect_latency > group_slice) {
1000			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
1001			/* scale low_slice according to IO priority
1002			 * and sync vs async */
1003			unsigned low_slice =
1004				min(slice, base_low_slice * slice / sync_slice);
1005			/* the adapted slice value is scaled to fit all iqs
1006			 * into the target latency */
1007			slice = max(slice * group_slice / expect_latency,
1008				    low_slice);
1009		}
1010	}
1011	return slice;
1012}
1013
1014static inline void
1015cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1016{
1017	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1018
1019	cfqq->slice_start = jiffies;
1020	cfqq->slice_end = jiffies + slice;
1021	cfqq->allocated_slice = slice;
1022	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
1023}
1024
1025/*
1026 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1027 * isn't valid until the first request from the dispatch is activated
1028 * and the slice time set.
1029 */
1030static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1031{
1032	if (cfq_cfqq_slice_new(cfqq))
1033		return false;
1034	if (time_before(jiffies, cfqq->slice_end))
1035		return false;
1036
1037	return true;
1038}
1039
1040/*
1041 * Lifted from AS - choose which of rq1 and rq2 is best served now.
1042 * We choose the request that is closest to the head right now. Distance
1043 * behind the head is penalized and only allowed to a certain extent.
1044 */
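/*
 * Example: with the head at sector 1000 and cfq_back_penalty = 2, a request
 * at sector 1100 has distance 100 while a request at sector 980 (within
 * back_max) has distance (1000 - 980) * 2 = 40, so the short backward seek
 * still wins.  Requests further back than back_max are treated as wrapping
 * and only win if the competing request wraps as well.
 */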
1045static struct request *
1046cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1047{
1048	sector_t s1, s2, d1 = 0, d2 = 0;
1049	unsigned long back_max;
1050#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
1051#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
1052	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1053
1054	if (rq1 == NULL || rq1 == rq2)
1055		return rq2;
1056	if (rq2 == NULL)
1057		return rq1;
1058
1059	if (rq_is_sync(rq1) != rq_is_sync(rq2))
1060		return rq_is_sync(rq1) ? rq1 : rq2;
1061
1062	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1063		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1064
1065	s1 = blk_rq_pos(rq1);
1066	s2 = blk_rq_pos(rq2);
1067
1068	/*
1069	 * by definition, 1KiB is 2 sectors
1070	 */
1071	back_max = cfqd->cfq_back_max * 2;
1072
1073	/*
1074	 * Strict one way elevator _except_ in the case where we allow
1075	 * short backward seeks which are biased as twice the cost of a
1076	 * similar forward seek.
1077	 */
1078	if (s1 >= last)
1079		d1 = s1 - last;
1080	else if (s1 + back_max >= last)
1081		d1 = (last - s1) * cfqd->cfq_back_penalty;
1082	else
1083		wrap |= CFQ_RQ1_WRAP;
1084
1085	if (s2 >= last)
1086		d2 = s2 - last;
1087	else if (s2 + back_max >= last)
1088		d2 = (last - s2) * cfqd->cfq_back_penalty;
1089	else
1090		wrap |= CFQ_RQ2_WRAP;
1091
1092	/* Found required data */
1093
1094	/*
1095	 * By doing switch() on the bit mask "wrap" we avoid having to
1096	 * check two variables for all permutations: --> faster!
1097	 */
1098	switch (wrap) {
1099	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1100		if (d1 < d2)
1101			return rq1;
1102		else if (d2 < d1)
1103			return rq2;
1104		else {
1105			if (s1 >= s2)
1106				return rq1;
1107			else
1108				return rq2;
1109		}
1110
1111	case CFQ_RQ2_WRAP:
1112		return rq1;
1113	case CFQ_RQ1_WRAP:
1114		return rq2;
1115	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1116	default:
1117		/*
1118		 * Since both rqs are wrapped,
1119		 * start with the one that's further behind head
1120		 * (--> only *one* back seek required),
1121		 * since back seek takes more time than forward.
1122		 */
1123		if (s1 <= s2)
1124			return rq1;
1125		else
1126			return rq2;
1127	}
1128}
1129
1130/*
1131 * Below are the leftmost-cache rbtree helpers
1132 */
1133static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1134{
1135	/* Service tree is empty */
1136	if (!root->count)
1137		return NULL;
1138
1139	if (!root->left)
1140		root->left = rb_first(&root->rb);
1141
1142	if (root->left)
1143		return rb_entry(root->left, struct cfq_queue, rb_node);
1144
1145	return NULL;
1146}
1147
1148static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1149{
1150	if (!root->left)
1151		root->left = rb_first(&root->rb);
1152
1153	if (root->left)
1154		return rb_entry_cfqg(root->left);
1155
1156	return NULL;
1157}
1158
1159static void rb_erase_init(struct rb_node *n, struct rb_root *root)
1160{
1161	rb_erase(n, root);
1162	RB_CLEAR_NODE(n);
1163}
1164
1165static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1166{
1167	if (root->left == n)
1168		root->left = NULL;
1169	rb_erase_init(n, &root->rb);
1170	--root->count;
1171}
1172
1173/*
1174 * would be nice to take fifo expire time into account as well
1175 */
1176static struct request *
1177cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1178		  struct request *last)
1179{
1180	struct rb_node *rbnext = rb_next(&last->rb_node);
1181	struct rb_node *rbprev = rb_prev(&last->rb_node);
1182	struct request *next = NULL, *prev = NULL;
1183
1184	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1185
1186	if (rbprev)
1187		prev = rb_entry_rq(rbprev);
1188
1189	if (rbnext)
1190		next = rb_entry_rq(rbnext);
1191	else {
1192		rbnext = rb_first(&cfqq->sort_list);
1193		if (rbnext && rbnext != &last->rb_node)
1194			next = rb_entry_rq(rbnext);
1195	}
1196
1197	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1198}
1199
1200static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
1201				      struct cfq_queue *cfqq)
1202{
1203	/*
1204	 * just an approximation, should be ok.
1205	 */
1206	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1207		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1208}
1209
1210static inline s64
1211cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1212{
1213	return cfqg->vdisktime - st->min_vdisktime;
1214}
1215
1216static void
1217__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1218{
1219	struct rb_node **node = &st->rb.rb_node;
1220	struct rb_node *parent = NULL;
1221	struct cfq_group *__cfqg;
1222	s64 key = cfqg_key(st, cfqg);
1223	int left = 1;
1224
1225	while (*node != NULL) {
1226		parent = *node;
1227		__cfqg = rb_entry_cfqg(parent);
1228
1229		if (key < cfqg_key(st, __cfqg))
1230			node = &parent->rb_left;
1231		else {
1232			node = &parent->rb_right;
1233			left = 0;
1234		}
1235	}
1236
1237	if (left)
1238		st->left = &cfqg->rb_node;
1239
1240	rb_link_node(&cfqg->rb_node, parent, node);
1241	rb_insert_color(&cfqg->rb_node, &st->rb);
1242}
1243
1244static void
1245cfq_update_group_weight(struct cfq_group *cfqg)
1246{
1247	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1248
1249	if (cfqg->new_weight) {
1250		cfqg->weight = cfqg->new_weight;
1251		cfqg->new_weight = 0;
1252	}
1253
1254	if (cfqg->new_leaf_weight) {
1255		cfqg->leaf_weight = cfqg->new_leaf_weight;
1256		cfqg->new_leaf_weight = 0;
1257	}
1258}
1259
1260static void
1261cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1262{
1263	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;	/* start with 1 */
1264	struct cfq_group *pos = cfqg;
1265	struct cfq_group *parent;
1266	bool propagate;
1267
1268	/* add to the service tree */
1269	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1270
1271	cfq_update_group_weight(cfqg);
1272	__cfq_group_service_tree_add(st, cfqg);
1273
1274	/*
1275	 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1276	 * entitled to.  vfraction is calculated by walking the tree
1277	 * towards the root calculating the fraction it has at each level.
1278	 * The compounded ratio is how much vfraction @cfqg owns.
1279	 *
1280	 * Start with the proportion tasks in this cfqg has against active
1281	 * children cfqgs - its leaf_weight against children_weight.
1282	 */
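	/*
	 * Numeric example: if this cfqg's leaf_weight is 500 and its other
	 * active children also weigh 500 in total, the step below gives
	 * vfr = 500/1000 = 1/2.  If its own weight is then 200 out of a
	 * parent children_weight of 800, the loop below further scales vfr
	 * to 1/2 * 200/800 = 1/8 of the device.
	 */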
1283	propagate = !pos->nr_active++;
1284	pos->children_weight += pos->leaf_weight;
1285	vfr = vfr * pos->leaf_weight / pos->children_weight;
1286
1287	/*
1288	 * Compound ->weight walking up the tree.  Both activation and
1289	 * vfraction calculation are done in the same loop.  Propagation
1290	 * stops once an already activated node is met.  vfraction
1291	 * calculation should always continue to the root.
1292	 */
1293	while ((parent = cfqg_flat_parent(pos))) {
1294		if (propagate) {
1295			propagate = !parent->nr_active++;
1296			parent->children_weight += pos->weight;
1297		}
1298		vfr = vfr * pos->weight / parent->children_weight;
1299		pos = parent;
1300	}
1301
1302	cfqg->vfraction = max_t(unsigned, vfr, 1);
1303}
1304
1305static void
1306cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1307{
1308	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1309	struct cfq_group *__cfqg;
1310	struct rb_node *n;
1311
1312	cfqg->nr_cfqq++;
1313	if (!RB_EMPTY_NODE(&cfqg->rb_node))
1314		return;
1315
1316	/*
1317	 * Currently put the group at the end. Later implement something
1318	 * so that groups get lesser vtime based on their weights, so that
1319	 * a group does not lose everything if it was not continuously backlogged.
1320	 */
1321	n = rb_last(&st->rb);
1322	if (n) {
1323		__cfqg = rb_entry_cfqg(n);
1324		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
1325	} else
1326		cfqg->vdisktime = st->min_vdisktime;
1327	cfq_group_service_tree_add(st, cfqg);
1328}
1329
1330static void
1331cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1332{
1333	struct cfq_group *pos = cfqg;
1334	bool propagate;
1335
1336	/*
1337	 * Undo activation from cfq_group_service_tree_add().  Deactivate
1338	 * @cfqg and propagate deactivation upwards.
1339	 */
1340	propagate = !--pos->nr_active;
1341	pos->children_weight -= pos->leaf_weight;
1342
1343	while (propagate) {
1344		struct cfq_group *parent = cfqg_flat_parent(pos);
1345
1346		/* @pos has 0 nr_active at this point */
1347		WARN_ON_ONCE(pos->children_weight);
1348		pos->vfraction = 0;
1349
1350		if (!parent)
1351			break;
1352
1353		propagate = !--parent->nr_active;
1354		parent->children_weight -= pos->weight;
1355		pos = parent;
1356	}
1357
1358	/* remove from the service tree */
1359	if (!RB_EMPTY_NODE(&cfqg->rb_node))
1360		cfq_rb_erase(&cfqg->rb_node, st);
1361}
1362
1363static void
1364cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1365{
1366	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1367
1368	BUG_ON(cfqg->nr_cfqq < 1);
1369	cfqg->nr_cfqq--;
1370
1371	/* If there are other cfq queues under this group, don't delete it */
1372	if (cfqg->nr_cfqq)
1373		return;
1374
1375	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1376	cfq_group_service_tree_del(st, cfqg);
1377	cfqg->saved_wl_slice = 0;
1378	cfqg_stats_update_dequeue(cfqg);
1379}
1380
1381static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1382						unsigned int *unaccounted_time)
1383{
1384	unsigned int slice_used;
1385
1386	/*
1387	 * Queue got expired before even a single request completed or
1388	 * got expired immediately after first request completion.
1389	 */
1390	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
1391		/*
1392		 * Also charge the seek time incurred to the group, otherwise
1393		 * if there are multiple queues in the group, each can dispatch
1394		 * a single request on seeky media and cause lots of seek time
1395		 * and the group will never know it.
1396		 */
1397		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
1398					1);
1399	} else {
1400		slice_used = jiffies - cfqq->slice_start;
1401		if (slice_used > cfqq->allocated_slice) {
1402			*unaccounted_time = slice_used - cfqq->allocated_slice;
1403			slice_used = cfqq->allocated_slice;
1404		}
1405		if (time_after(cfqq->slice_start, cfqq->dispatch_start))
1406			*unaccounted_time += cfqq->slice_start -
1407					cfqq->dispatch_start;
1408	}
1409
1410	return slice_used;
1411}
1412
1413static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1414				struct cfq_queue *cfqq)
1415{
1416	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1417	unsigned int used_sl, charge, unaccounted_sl = 0;
1418	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1419			- cfqg->service_tree_idle.count;
1420	unsigned int vfr;
1421
1422	BUG_ON(nr_sync < 0);
1423	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1424
1425	if (iops_mode(cfqd))
1426		charge = cfqq->slice_dispatch;
1427	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1428		charge = cfqq->allocated_slice;
1429
1430	/*
1431	 * Can't update vdisktime while on service tree and cfqg->vfraction
1432	 * is valid only while on it.  Cache vfr, leave the service tree,
1433	 * update vdisktime and go back on.  The re-addition to the tree
1434	 * will also update the weights as necessary.
1435	 */
1436	vfr = cfqg->vfraction;
1437	cfq_group_service_tree_del(st, cfqg);
1438	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1439	cfq_group_service_tree_add(st, cfqg);
1440
1441	/* This group is being expired. Save the context */
1442	if (time_after(cfqd->workload_expires, jiffies)) {
1443		cfqg->saved_wl_slice = cfqd->workload_expires
1444						- jiffies;
1445		cfqg->saved_wl_type = cfqd->serving_wl_type;
1446		cfqg->saved_wl_class = cfqd->serving_wl_class;
1447	} else
1448		cfqg->saved_wl_slice = 0;
1449
1450	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1451					st->min_vdisktime);
1452	cfq_log_cfqq(cfqq->cfqd, cfqq,
1453		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1454		     used_sl, cfqq->slice_dispatch, charge,
1455		     iops_mode(cfqd), cfqq->nr_sectors);
1456	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1457	cfqg_stats_set_start_empty_time(cfqg);
1458}
1459
1460/**
1461 * cfq_init_cfqg_base - initialize base part of a cfq_group
1462 * @cfqg: cfq_group to initialize
1463 *
1464 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1465 * is enabled or not.
1466 */
1467static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1468{
1469	struct cfq_rb_root *st;
1470	int i, j;
1471
1472	for_each_cfqg_st(cfqg, i, j, st)
1473		*st = CFQ_RB_ROOT;
1474	RB_CLEAR_NODE(&cfqg->rb_node);
1475
1476	cfqg->ttime.last_end_request = jiffies;
1477}
1478
1479#ifdef CONFIG_CFQ_GROUP_IOSCHED
1480static void cfq_pd_init(struct blkcg_gq *blkg)
1481{
1482	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1483
1484	cfq_init_cfqg_base(cfqg);
1485	cfqg->weight = blkg->blkcg->cfq_weight;
1486	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
1487}
1488
1489/*
1490 * Search for the cfq group the current task belongs to. The request_queue
1491 * lock must be held.
1492 */
1493static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1494						struct blkcg *blkcg)
1495{
1496	struct request_queue *q = cfqd->queue;
1497	struct cfq_group *cfqg = NULL;
1498
1499	/* avoid lookup for the common case where there's no blkcg */
1500	if (blkcg == &blkcg_root) {
1501		cfqg = cfqd->root_group;
1502	} else {
1503		struct blkcg_gq *blkg;
1504
1505		blkg = blkg_lookup_create(blkcg, q);
1506		if (!IS_ERR(blkg))
1507			cfqg = blkg_to_cfqg(blkg);
1508	}
1509
1510	return cfqg;
1511}
1512
1513static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1514{
1515	/* Currently, all async queues are mapped to root group */
1516	if (!cfq_cfqq_sync(cfqq))
1517		cfqg = cfqq->cfqd->root_group;
1518
1519	cfqq->cfqg = cfqg;
1520	/* cfqq reference on cfqg */
1521	cfqg_get(cfqg);
1522}
1523
1524static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1525				     struct blkg_policy_data *pd, int off)
1526{
1527	struct cfq_group *cfqg = pd_to_cfqg(pd);
1528
1529	if (!cfqg->dev_weight)
1530		return 0;
1531	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1532}
1533
1534static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1535				    struct seq_file *sf)
1536{
1537	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1538			  cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1539			  false);
1540	return 0;
1541}
1542
1543static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1544					  struct blkg_policy_data *pd, int off)
1545{
1546	struct cfq_group *cfqg = pd_to_cfqg(pd);
1547
1548	if (!cfqg->dev_leaf_weight)
1549		return 0;
1550	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1551}
1552
1553static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
1554					 struct cftype *cft,
1555					 struct seq_file *sf)
1556{
1557	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1558			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
1559			  false);
1560	return 0;
1561}
1562
1563static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1564			    struct seq_file *sf)
1565{
1566	seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
1567	return 0;
1568}
1569
1570static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
1571				 struct seq_file *sf)
1572{
1573	seq_printf(sf, "%u\n",
1574		   cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
1575	return 0;
1576}
1577
1578static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1579				    const char *buf, bool is_leaf_weight)
1580{
1581	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1582	struct blkg_conf_ctx ctx;
1583	struct cfq_group *cfqg;
1584	int ret;
1585
1586	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1587	if (ret)
1588		return ret;
1589
1590	ret = -EINVAL;
1591	cfqg = blkg_to_cfqg(ctx.blkg);
1592	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1593		if (!is_leaf_weight) {
1594			cfqg->dev_weight = ctx.v;
1595			cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
1596		} else {
1597			cfqg->dev_leaf_weight = ctx.v;
1598			cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
1599		}
1600		ret = 0;
1601	}
1602
1603	blkg_conf_finish(&ctx);
1604	return ret;
1605}
1606
1607static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1608				  const char *buf)
1609{
1610	return __cfqg_set_weight_device(cgrp, cft, buf, false);
1611}
1612
1613static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
1614				       const char *buf)
1615{
1616	return __cfqg_set_weight_device(cgrp, cft, buf, true);
1617}
1618
1619static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
1620			    bool is_leaf_weight)
1621{
1622	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1623	struct blkcg_gq *blkg;
1624	struct hlist_node *n;
1625
1626	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1627		return -EINVAL;
1628
1629	spin_lock_irq(&blkcg->lock);
1630
1631	if (!is_leaf_weight)
1632		blkcg->cfq_weight = val;
1633	else
1634		blkcg->cfq_leaf_weight = val;
1635
1636	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1637		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1638
1639		if (!cfqg)
1640			continue;
1641
1642		if (!is_leaf_weight) {
1643			if (!cfqg->dev_weight)
1644				cfqg->new_weight = blkcg->cfq_weight;
1645		} else {
1646			if (!cfqg->dev_leaf_weight)
1647				cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
1648		}
1649	}
1650
1651	spin_unlock_irq(&blkcg->lock);
1652	return 0;
1653}
1654
1655static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1656{
1657	return __cfq_set_weight(cgrp, cft, val, false);
1658}
1659
1660static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1661{
1662	return __cfq_set_weight(cgrp, cft, val, true);
1663}
1664
1665static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1666			   struct seq_file *sf)
1667{
1668	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1669
1670	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
1671			  cft->private, false);
1672	return 0;
1673}
1674
1675static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1676			     struct seq_file *sf)
1677{
1678	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1679
1680	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1681			  cft->private, true);
1682	return 0;
1683}
1684
1685#ifdef CONFIG_DEBUG_BLK_CGROUP
1686static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1687				      struct blkg_policy_data *pd, int off)
1688{
1689	struct cfq_group *cfqg = pd_to_cfqg(pd);
1690	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1691	u64 v = 0;
1692
1693	if (samples) {
1694		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1695		do_div(v, samples);
1696	}
1697	__blkg_prfill_u64(sf, pd, v);
1698	return 0;
1699}
1700
1701/* print avg_queue_size */
1702static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1703				     struct seq_file *sf)
1704{
1705	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1706
1707	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1708			  &blkcg_policy_cfq, 0, false);
1709	return 0;
1710}
1711#endif	/* CONFIG_DEBUG_BLK_CGROUP */
1712
1713static struct cftype cfq_blkcg_files[] = {
1714	/* on root, weight is mapped to leaf_weight */
1715	{
1716		.name = "weight_device",
1717		.flags = CFTYPE_ONLY_ON_ROOT,
1718		.read_seq_string = cfqg_print_leaf_weight_device,
1719		.write_string = cfqg_set_leaf_weight_device,
1720		.max_write_len = 256,
1721	},
1722	{
1723		.name = "weight",
1724		.flags = CFTYPE_ONLY_ON_ROOT,
1725		.read_seq_string = cfq_print_leaf_weight,
1726		.write_u64 = cfq_set_leaf_weight,
1727	},
1728
1729	/* no such mapping necessary for !roots */
1730	{
1731		.name = "weight_device",
1732		.flags = CFTYPE_NOT_ON_ROOT,
1733		.read_seq_string = cfqg_print_weight_device,
1734		.write_string = cfqg_set_weight_device,
1735		.max_write_len = 256,
1736	},
1737	{
1738		.name = "weight",
1739		.flags = CFTYPE_NOT_ON_ROOT,
1740		.read_seq_string = cfq_print_weight,
1741		.write_u64 = cfq_set_weight,
1742	},
1743
1744	{
1745		.name = "leaf_weight_device",
1746		.read_seq_string = cfqg_print_leaf_weight_device,
1747		.write_string = cfqg_set_leaf_weight_device,
1748		.max_write_len = 256,
1749	},
1750	{
1751		.name = "leaf_weight",
1752		.read_seq_string = cfq_print_leaf_weight,
1753		.write_u64 = cfq_set_leaf_weight,
1754	},
1755
1756	{
1757		.name = "time",
1758		.private = offsetof(struct cfq_group, stats.time),
1759		.read_seq_string = cfqg_print_stat,
1760	},
1761	{
1762		.name = "sectors",
1763		.private = offsetof(struct cfq_group, stats.sectors),
1764		.read_seq_string = cfqg_print_stat,
1765	},
1766	{
1767		.name = "io_service_bytes",
1768		.private = offsetof(struct cfq_group, stats.service_bytes),
1769		.read_seq_string = cfqg_print_rwstat,
1770	},
1771	{
1772		.name = "io_serviced",
1773		.private = offsetof(struct cfq_group, stats.serviced),
1774		.read_seq_string = cfqg_print_rwstat,
1775	},
1776	{
1777		.name = "io_service_time",
1778		.private = offsetof(struct cfq_group, stats.service_time),
1779		.read_seq_string = cfqg_print_rwstat,
1780	},
1781	{
1782		.name = "io_wait_time",
1783		.private = offsetof(struct cfq_group, stats.wait_time),
1784		.read_seq_string = cfqg_print_rwstat,
1785	},
1786	{
1787		.name = "io_merged",
1788		.private = offsetof(struct cfq_group, stats.merged),
1789		.read_seq_string = cfqg_print_rwstat,
1790	},
1791	{
1792		.name = "io_queued",
1793		.private = offsetof(struct cfq_group, stats.queued),
1794		.read_seq_string = cfqg_print_rwstat,
1795	},
1796#ifdef CONFIG_DEBUG_BLK_CGROUP
1797	{
1798		.name = "avg_queue_size",
1799		.read_seq_string = cfqg_print_avg_queue_size,
1800	},
1801	{
1802		.name = "group_wait_time",
1803		.private = offsetof(struct cfq_group, stats.group_wait_time),
1804		.read_seq_string = cfqg_print_stat,
1805	},
1806	{
1807		.name = "idle_time",
1808		.private = offsetof(struct cfq_group, stats.idle_time),
1809		.read_seq_string = cfqg_print_stat,
1810	},
1811	{
1812		.name = "empty_time",
1813		.private = offsetof(struct cfq_group, stats.empty_time),
1814		.read_seq_string = cfqg_print_stat,
1815	},
1816	{
1817		.name = "dequeue",
1818		.private = offsetof(struct cfq_group, stats.dequeue),
1819		.read_seq_string = cfqg_print_stat,
1820	},
1821	{
1822		.name = "unaccounted_time",
1823		.private = offsetof(struct cfq_group, stats.unaccounted_time),
1824		.read_seq_string = cfqg_print_stat,
1825	},
1826#endif	/* CONFIG_DEBUG_BLK_CGROUP */
1827	{ }	/* terminate */
1828};
1829#else /* GROUP_IOSCHED */
1830static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1831						struct blkcg *blkcg)
1832{
1833	return cfqd->root_group;
1834}
1835
1836static inline void
1837cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1838	cfqq->cfqg = cfqg;
1839}
1840
1841#endif /* GROUP_IOSCHED */
1842
1843/*
1844 * The cfqd->service_trees holds all pending cfq_queue's that have
1845 * requests waiting to be processed. It is sorted in the order that
1846 * we will service the queues.
1847 */
1848static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1849				 bool add_front)
1850{
1851	struct rb_node **p, *parent;
1852	struct cfq_queue *__cfqq;
1853	unsigned long rb_key;
1854	struct cfq_rb_root *st;
1855	int left;
1856	int new_cfqq = 1;
1857
1858	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
1859	if (cfq_class_idle(cfqq)) {
1860		rb_key = CFQ_IDLE_DELAY;
1861		parent = rb_last(&st->rb);
1862		if (parent && parent != &cfqq->rb_node) {
1863			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1864			rb_key += __cfqq->rb_key;
1865		} else
1866			rb_key += jiffies;
1867	} else if (!add_front) {
1868		/*
1869		 * Get our rb key offset. Subtract any residual slice
1870		 * value carried from last service. A negative resid
1871		 * count indicates slice overrun, and this should position
1872		 * the next service time further away in the tree.
1873		 */
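		/*
		 * Illustrative numbers: a queue that overran its previous
		 * slice by 10 jiffies carries slice_resid == -10, so
		 * subtracting it below pushes rb_key 10 jiffies further
		 * into the future than cfq_slice_offset() alone would.
		 */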
1874		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1875		rb_key -= cfqq->slice_resid;
1876		cfqq->slice_resid = 0;
1877	} else {
1878		rb_key = -HZ;
1879		__cfqq = cfq_rb_first(st);
1880		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1881	}
1882
1883	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1884		new_cfqq = 0;
1885		/*
1886		 * same position, nothing more to do
1887		 */
1888		if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
1889			return;
1890
1891		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1892		cfqq->service_tree = NULL;
1893	}
1894
1895	left = 1;
1896	parent = NULL;
1897	cfqq->service_tree = st;
1898	p = &st->rb.rb_node;
1899	while (*p) {
1900		parent = *p;
1901		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1902
1903		/*
1904		 * sort by key, that represents service time.
1905		 */
1906		if (time_before(rb_key, __cfqq->rb_key))
1907			p = &parent->rb_left;
1908		else {
1909			p = &parent->rb_right;
1910			left = 0;
1911		}
1912	}
1913
1914	if (left)
1915		st->left = &cfqq->rb_node;
1916
1917	cfqq->rb_key = rb_key;
1918	rb_link_node(&cfqq->rb_node, parent, p);
1919	rb_insert_color(&cfqq->rb_node, &st->rb);
1920	st->count++;
1921	if (add_front || !new_cfqq)
1922		return;
1923	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1924}
1925
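/*
 * Walk a per-priority rbtree (keyed by the sector of each queue's next_rq)
 * looking for a cfqq whose next request starts at @sector.  Returns the
 * matching queue or NULL; *ret_parent (and *rb_link, if supplied) are set to
 * the last node visited, so the caller can insert here or inspect the
 * closest neighbour.
 */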
1926static struct cfq_queue *
1927cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1928		     sector_t sector, struct rb_node **ret_parent,
1929		     struct rb_node ***rb_link)
1930{
1931	struct rb_node **p, *parent;
1932	struct cfq_queue *cfqq = NULL;
1933
1934	parent = NULL;
1935	p = &root->rb_node;
1936	while (*p) {
1937		struct rb_node **n;
1938
1939		parent = *p;
1940		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1941
1942		/*
1943		 * Sort strictly based on sector.  Smallest to the left,
1944		 * largest to the right.
1945		 */
1946		if (sector > blk_rq_pos(cfqq->next_rq))
1947			n = &(*p)->rb_right;
1948		else if (sector < blk_rq_pos(cfqq->next_rq))
1949			n = &(*p)->rb_left;
1950		else
1951			break;
1952		p = n;
1953		cfqq = NULL;
1954	}
1955
1956	*ret_parent = parent;
1957	if (rb_link)
1958		*rb_link = p;
1959	return cfqq;
1960}
1961
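/*
 * (Re)position cfqq in the prio tree for its original ioprio, keyed on the
 * sector of its next pending request.  Idle-class queues and queues with no
 * next_rq are not indexed.  If another queue already occupies the same
 * sector, cfqq is left out and its p_root stays NULL.
 */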
1962static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1963{
1964	struct rb_node **p, *parent;
1965	struct cfq_queue *__cfqq;
1966
1967	if (cfqq->p_root) {
1968		rb_erase(&cfqq->p_node, cfqq->p_root);
1969		cfqq->p_root = NULL;
1970	}
1971
1972	if (cfq_class_idle(cfqq))
1973		return;
1974	if (!cfqq->next_rq)
1975		return;
1976
1977	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1978	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1979				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1980	if (!__cfqq) {
1981		rb_link_node(&cfqq->p_node, parent, p);
1982		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1983	} else
1984		cfqq->p_root = NULL;
1985}
1986
1987/*
1988 * Update cfqq's position in the service tree.
1989 */
1990static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1991{
1992	/*
1993	 * Resorting requires the cfqq to be on the RR list already.
1994	 */
1995	if (cfq_cfqq_on_rr(cfqq)) {
1996		cfq_service_tree_add(cfqd, cfqq, 0);
1997		cfq_prio_tree_add(cfqd, cfqq);
1998	}
1999}
2000
2001/*
2002 * add to busy list of queues for service, trying to be fair in ordering
2003 * the pending list according to last request service
2004 */
2005static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2006{
2007	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2008	BUG_ON(cfq_cfqq_on_rr(cfqq));
2009	cfq_mark_cfqq_on_rr(cfqq);
2010	cfqd->busy_queues++;
2011	if (cfq_cfqq_sync(cfqq))
2012		cfqd->busy_sync_queues++;
2013
2014	cfq_resort_rr_list(cfqd, cfqq);
2015}
2016
2017/*
2018 * Called when the cfqq no longer has requests pending, remove it from
2019 * the service tree.
2020 */
2021static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2022{
2023	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2024	BUG_ON(!cfq_cfqq_on_rr(cfqq));
2025	cfq_clear_cfqq_on_rr(cfqq);
2026
2027	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2028		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2029		cfqq->service_tree = NULL;
2030	}
2031	if (cfqq->p_root) {
2032		rb_erase(&cfqq->p_node, cfqq->p_root);
2033		cfqq->p_root = NULL;
2034	}
2035
2036	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2037	BUG_ON(!cfqd->busy_queues);
2038	cfqd->busy_queues--;
2039	if (cfq_cfqq_sync(cfqq))
2040		cfqd->busy_sync_queues--;
2041}
2042
2043/*
2044 * rb tree support functions
2045 */
2046static void cfq_del_rq_rb(struct request *rq)
2047{
2048	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2049	const int sync = rq_is_sync(rq);
2050
2051	BUG_ON(!cfqq->queued[sync]);
2052	cfqq->queued[sync]--;
2053
2054	elv_rb_del(&cfqq->sort_list, rq);
2055
2056	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2057		/*
2058		 * Queue will be deleted from service tree when we actually
2059		 * expire it later. Right now just remove it from prio tree
2060		 * as it is empty.
2061		 */
2062		if (cfqq->p_root) {
2063			rb_erase(&cfqq->p_node, cfqq->p_root);
2064			cfqq->p_root = NULL;
2065		}
2066	}
2067}
2068
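/*
 * Add rq to the queue's sorted request tree.  This also puts the cfqq on the
 * busy (RR) list if it wasn't there already, re-evaluates which request
 * should be served next and, if that changed, updates the prio tree position.
 */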
2069static void cfq_add_rq_rb(struct request *rq)
2070{
2071	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2072	struct cfq_data *cfqd = cfqq->cfqd;
2073	struct request *prev;
2074
2075	cfqq->queued[rq_is_sync(rq)]++;
2076
2077	elv_rb_add(&cfqq->sort_list, rq);
2078
2079	if (!cfq_cfqq_on_rr(cfqq))
2080		cfq_add_cfqq_rr(cfqd, cfqq);
2081
2082	/*
2083	 * check if this request is a better next-serve candidate
2084	 */
2085	prev = cfqq->next_rq;
2086	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2087
2088	/*
2089	 * adjust priority tree position, if ->next_rq changes
2090	 */
2091	if (prev != cfqq->next_rq)
2092		cfq_prio_tree_add(cfqd, cfqq);
2093
2094	BUG_ON(!cfqq->next_rq);
2095}
2096
2097static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2098{
2099	elv_rb_del(&cfqq->sort_list, rq);
2100	cfqq->queued[rq_is_sync(rq)]--;
2101	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2102	cfq_add_rq_rb(rq);
2103	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2104				 rq->cmd_flags);
2105}
2106
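/*
 * Find a potential front-merge candidate: look up the cfqq that the current
 * task would use for this bio and search its sort_list for a request that
 * starts exactly where the bio ends.
 */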
2107static struct request *
2108cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2109{
2110	struct task_struct *tsk = current;
2111	struct cfq_io_cq *cic;
2112	struct cfq_queue *cfqq;
2113
2114	cic = cfq_cic_lookup(cfqd, tsk->io_context);
2115	if (!cic)
2116		return NULL;
2117
2118	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2119	if (cfqq) {
2120		sector_t sector = bio->bi_sector + bio_sectors(bio);
2121
2122		return elv_rb_find(&cfqq->sort_list, sector);
2123	}
2124
2125	return NULL;
2126}
2127
2128static void cfq_activate_request(struct request_queue *q, struct request *rq)
2129{
2130	struct cfq_data *cfqd = q->elevator->elevator_data;
2131
2132	cfqd->rq_in_driver++;
2133	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2134						cfqd->rq_in_driver);
2135
2136	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2137}
2138
2139static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2140{
2141	struct cfq_data *cfqd = q->elevator->elevator_data;
2142
2143	WARN_ON(!cfqd->rq_in_driver);
2144	cfqd->rq_in_driver--;
2145	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2146						cfqd->rq_in_driver);
2147}
2148
2149static void cfq_remove_request(struct request *rq)
2150{
2151	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2152
2153	if (cfqq->next_rq == rq)
2154		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2155
2156	list_del_init(&rq->queuelist);
2157	cfq_del_rq_rb(rq);
2158
2159	cfqq->cfqd->rq_queued--;
2160	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2161	if (rq->cmd_flags & REQ_PRIO) {
2162		WARN_ON(!cfqq->prio_pending);
2163		cfqq->prio_pending--;
2164	}
2165}
2166
2167static int cfq_merge(struct request_queue *q, struct request **req,
2168		     struct bio *bio)
2169{
2170	struct cfq_data *cfqd = q->elevator->elevator_data;
2171	struct request *__rq;
2172
2173	__rq = cfq_find_rq_fmerge(cfqd, bio);
2174	if (__rq && elv_rq_merge_ok(__rq, bio)) {
2175		*req = __rq;
2176		return ELEVATOR_FRONT_MERGE;
2177	}
2178
2179	return ELEVATOR_NO_MERGE;
2180}
2181
2182static void cfq_merged_request(struct request_queue *q, struct request *req,
2183			       int type)
2184{
2185	if (type == ELEVATOR_FRONT_MERGE) {
2186		struct cfq_queue *cfqq = RQ_CFQQ(req);
2187
2188		cfq_reposition_rq_rb(cfqq, req);
2189	}
2190}
2191
2192static void cfq_bio_merged(struct request_queue *q, struct request *req,
2193				struct bio *bio)
2194{
2195	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
2196}
2197
2198static void
2199cfq_merged_requests(struct request_queue *q, struct request *rq,
2200		    struct request *next)
2201{
2202	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2203	struct cfq_data *cfqd = q->elevator->elevator_data;
2204
2205	/*
2206	 * reposition in fifo if next is older than rq
2207	 */
2208	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2209	    time_before(rq_fifo_time(next), rq_fifo_time(rq)) &&
2210	    cfqq == RQ_CFQQ(next)) {
2211		list_move(&rq->queuelist, &next->queuelist);
2212		rq_set_fifo_time(rq, rq_fifo_time(next));
2213	}
2214
2215	if (cfqq->next_rq == next)
2216		cfqq->next_rq = rq;
2217	cfq_remove_request(next);
2218	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2219
2220	cfqq = RQ_CFQQ(next);
2221	/*
2222	 * all requests of this queue are merged into other queues; delete it
2223	 * from the service tree. If it's the active_queue,
2224	 * cfq_dispatch_requests() will choose to expire it or keep idling
2225	 */
2226	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2227	    cfqq != cfqd->active_queue)
2228		cfq_del_cfqq_rr(cfqd, cfqq);
2229}
2230
2231static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2232			   struct bio *bio)
2233{
2234	struct cfq_data *cfqd = q->elevator->elevator_data;
2235	struct cfq_io_cq *cic;
2236	struct cfq_queue *cfqq;
2237
2238	/*
2239	 * Disallow merge of a sync bio into an async request.
2240	 */
2241	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
2242		return false;
2243
2244	/*
2245	 * Lookup the cfqq that this bio will be queued with and allow
2246	 * merge only if rq is queued there.
2247	 */
2248	cic = cfq_cic_lookup(cfqd, current->io_context);
2249	if (!cic)
2250		return false;
2251
2252	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2253	return cfqq == RQ_CFQQ(rq);
2254}
2255
2256static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2257{
2258	del_timer(&cfqd->idle_slice_timer);
2259	cfqg_stats_update_idle_time(cfqq->cfqg);
2260}
2261
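/*
 * Make cfqq the active queue.  Per-slice accounting (dispatch_start,
 * allocated_slice, slice_end, slice_dispatch, nr_sectors) is reset, the
 * stale per-slice flags are cleared, slice_new is set and the idle slice
 * timer is cancelled.
 */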
2262static void __cfq_set_active_queue(struct cfq_data *cfqd,
2263				   struct cfq_queue *cfqq)
2264{
2265	if (cfqq) {
2266		cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2267				cfqd->serving_wl_class, cfqd->serving_wl_type);
2268		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2269		cfqq->slice_start = 0;
2270		cfqq->dispatch_start = jiffies;
2271		cfqq->allocated_slice = 0;
2272		cfqq->slice_end = 0;
2273		cfqq->slice_dispatch = 0;
2274		cfqq->nr_sectors = 0;
2275
2276		cfq_clear_cfqq_wait_request(cfqq);
2277		cfq_clear_cfqq_must_dispatch(cfqq);
2278		cfq_clear_cfqq_must_alloc_slice(cfqq);
2279		cfq_clear_cfqq_fifo_expire(cfqq);
2280		cfq_mark_cfqq_slice_new(cfqq);
2281
2282		cfq_del_timer(cfqd, cfqq);
2283	}
2284
2285	cfqd->active_queue = cfqq;
2286}
2287
2288/*
2289 * current cfqq expired its slice (or was too idle), select new one
2290 */
2291static void
2292__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2293		    bool timed_out)
2294{
2295	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2296
2297	if (cfq_cfqq_wait_request(cfqq))
2298		cfq_del_timer(cfqd, cfqq);
2299
2300	cfq_clear_cfqq_wait_request(cfqq);
2301	cfq_clear_cfqq_wait_busy(cfqq);
2302
2303	/*
2304	 * If this cfqq is shared between multiple processes, check to
2305	 * make sure that those processes are still issuing I/Os within
2306	 * the mean seek distance.  If not, it may be time to break the
2307	 * queues apart again.
2308	 */
2309	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2310		cfq_mark_cfqq_split_coop(cfqq);
2311
2312	/*
2313	 * store what was left of this slice, if the queue idled/timed out
2314	 */
2315	if (timed_out) {
2316		if (cfq_cfqq_slice_new(cfqq))
2317			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2318		else
2319			cfqq->slice_resid = cfqq->slice_end - jiffies;
2320		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
2321	}
2322
2323	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2324
2325	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2326		cfq_del_cfqq_rr(cfqd, cfqq);
2327
2328	cfq_resort_rr_list(cfqd, cfqq);
2329
2330	if (cfqq == cfqd->active_queue)
2331		cfqd->active_queue = NULL;
2332
2333	if (cfqd->active_cic) {
2334		put_io_context(cfqd->active_cic->icq.ioc);
2335		cfqd->active_cic = NULL;
2336	}
2337}
2338
2339static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2340{
2341	struct cfq_queue *cfqq = cfqd->active_queue;
2342
2343	if (cfqq)
2344		__cfq_slice_expired(cfqd, cfqq, timed_out);
2345}
2346
2347/*
2348 * Get next queue for service. Unless we have a queue preemption,
2349 * we'll simply select the first cfqq in the service tree.
2350 */
2351static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2352{
2353	struct cfq_rb_root *st = st_for(cfqd->serving_group,
2354			cfqd->serving_wl_class, cfqd->serving_wl_type);
2355
2356	if (!cfqd->rq_queued)
2357		return NULL;
2358
2359	/* There is nothing to dispatch */
2360	if (!st)
2361		return NULL;
2362	if (RB_EMPTY_ROOT(&st->rb))
2363		return NULL;
2364	return cfq_rb_first(st);
2365}
2366
2367static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2368{
2369	struct cfq_group *cfqg;
2370	struct cfq_queue *cfqq;
2371	int i, j;
2372	struct cfq_rb_root *st;
2373
2374	if (!cfqd->rq_queued)
2375		return NULL;
2376
2377	cfqg = cfq_get_next_cfqg(cfqd);
2378	if (!cfqg)
2379		return NULL;
2380
2381	for_each_cfqg_st(cfqg, i, j, st)
2382		if ((cfqq = cfq_rb_first(st)) != NULL)
2383			return cfqq;
2384	return NULL;
2385}
2386
2387/*
2388 * Get and set a new active queue for service.
2389 */
2390static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2391					      struct cfq_queue *cfqq)
2392{
2393	if (!cfqq)
2394		cfqq = cfq_get_next_queue(cfqd);
2395
2396	__cfq_set_active_queue(cfqd, cfqq);
2397	return cfqq;
2398}
2399
2400static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2401					  struct request *rq)
2402{
2403	if (blk_rq_pos(rq) >= cfqd->last_position)
2404		return blk_rq_pos(rq) - cfqd->last_position;
2405	else
2406		return cfqd->last_position - blk_rq_pos(rq);
2407}
2408
2409static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2410			       struct request *rq)
2411{
2412	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2413}
2414
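/*
 * Find the queue (at the same priority level) whose next request is closest
 * to cfqd->last_position: prefer an exact match, otherwise fall back to the
 * parent node and then its neighbour in the direction of the target sector,
 * accepting a queue only if its next request lies within CFQQ_CLOSE_THR of
 * the last dispatched position.
 */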
2415static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2416				    struct cfq_queue *cur_cfqq)
2417{
2418	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2419	struct rb_node *parent, *node;
2420	struct cfq_queue *__cfqq;
2421	sector_t sector = cfqd->last_position;
2422
2423	if (RB_EMPTY_ROOT(root))
2424		return NULL;
2425
2426	/*
2427	 * First, if we find a request starting at the end of the last
2428	 * request, choose it.
2429	 */
2430	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2431	if (__cfqq)
2432		return __cfqq;
2433
2434	/*
2435	 * If the exact sector wasn't found, the parent of the NULL leaf
2436	 * will contain the closest sector.
2437	 */
2438	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
2439	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2440		return __cfqq;
2441
2442	if (blk_rq_pos(__cfqq->next_rq) < sector)
2443		node = rb_next(&__cfqq->p_node);
2444	else
2445		node = rb_prev(&__cfqq->p_node);
2446	if (!node)
2447		return NULL;
2448
2449	__cfqq = rb_entry(node, struct cfq_queue, p_node);
2450	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2451		return __cfqq;
2452
2453	return NULL;
2454}
2455
2456/*
2457 * cfqd - obvious
2458 * cur_cfqq - passed in so that we don't decide that the current queue is
2459 * 	      closely cooperating with itself.
2460 *
2461 * So, basically we're assuming that cur_cfqq has dispatched at least
2462 * one request, and that cfqd->last_position reflects a position on the disk
2463 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2464 * assumption.
2465 */
2466static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2467					      struct cfq_queue *cur_cfqq)
2468{
2469	struct cfq_queue *cfqq;
2470
2471	if (cfq_class_idle(cur_cfqq))
2472		return NULL;
2473	if (!cfq_cfqq_sync(cur_cfqq))
2474		return NULL;
2475	if (CFQQ_SEEKY(cur_cfqq))
2476		return NULL;
2477
2478	/*
2479	 * Don't search priority tree if it's the only queue in the group.
2480	 */
2481	if (cur_cfqq->cfqg->nr_cfqq == 1)
2482		return NULL;
2483
2484	/*
2485	 * We should notice if some of the queues are cooperating, e.g.
2486	 * working closely on the same area of the disk. In that case,
2487	 * we can group them together and not waste time idling.
2488	 */
2489	cfqq = cfqq_close(cfqd, cur_cfqq);
2490	if (!cfqq)
2491		return NULL;
2492
2493	/* If new queue belongs to different cfq_group, don't choose it */
2494	if (cur_cfqq->cfqg != cfqq->cfqg)
2495		return NULL;
2496
2497	/*
2498	 * It only makes sense to merge sync queues.
2499	 */
2500	if (!cfq_cfqq_sync(cfqq))
2501		return NULL;
2502	if (CFQQ_SEEKY(cfqq))
2503		return NULL;
2504
2505	/*
2506	 * Do not merge queues of different priority classes
2507	 */
2508	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2509		return NULL;
2510
2511	return cfqq;
2512}
2513
2514/*
2515 * Determine whether we should enforce idle window for this queue.
2516 */
2517
2518static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2519{
2520	enum wl_class_t wl_class = cfqq_class(cfqq);
2521	struct cfq_rb_root *st = cfqq->service_tree;
2522
2523	BUG_ON(!st);
2524	BUG_ON(!st->count);
2525
2526	if (!cfqd->cfq_slice_idle)
2527		return false;
2528
2529	/* We never idle for idle class queues. */
2530	if (wl_class == IDLE_WORKLOAD)
2531		return false;
2532
2533	/* We do idle for queues that were marked with the idle window flag. */
2534	if (cfq_cfqq_idle_window(cfqq) &&
2535	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2536		return true;
2537
2538	/*
2539	 * Otherwise, we only idle if this is the only queue
2540	 * left in its service tree.
2541	 */
2542	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2543	   !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2544		return true;
2545	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2546	return false;
2547}
2548
2549static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2550{
2551	struct cfq_queue *cfqq = cfqd->active_queue;
2552	struct cfq_io_cq *cic;
2553	unsigned long sl, group_idle = 0;
2554
2555	/*
2556	 * SSD device without seek penalty, disable idling. But only do so
2557	 * for devices that support queuing, otherwise we still have a problem
2558	 * with sync vs async workloads.
2559	 */
2560	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2561		return;
2562
2563	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2564	WARN_ON(cfq_cfqq_slice_new(cfqq));
2565
2566	/*
2567	 * idle is disabled, either manually or by past process history
2568	 */
2569	if (!cfq_should_idle(cfqd, cfqq)) {
2570		/* no queue idling. Check for group idling */
2571		if (cfqd->cfq_group_idle)
2572			group_idle = cfqd->cfq_group_idle;
2573		else
2574			return;
2575	}
2576
2577	/*
2578	 * still active requests from this queue, don't idle
2579	 */
2580	if (cfqq->dispatched)
2581		return;
2582
2583	/*
2584	 * task has exited, don't wait
2585	 */
2586	cic = cfqd->active_cic;
2587	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2588		return;
2589
2590	/*
2591	 * If our average think time is larger than the remaining time
2592	 * slice, then don't idle. This avoids overrunning the allotted
2593	 * time slice.
2594	 */
2595	if (sample_valid(cic->ttime.ttime_samples) &&
2596	    (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2597		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2598			     cic->ttime.ttime_mean);
2599		return;
2600	}
2601
2602	/* There are other queues in the group, don't do group idle */
2603	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2604		return;
2605
2606	cfq_mark_cfqq_wait_request(cfqq);
2607
2608	if (group_idle)
2609		sl = cfqd->cfq_group_idle;
2610	else
2611		sl = cfqd->cfq_slice_idle;
2612
2613	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2614	cfqg_stats_set_start_idle_time(cfqq->cfqg);
2615	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2616			group_idle ? 1 : 0);
2617}
2618
2619/*
2620 * Move request from internal lists to the request queue dispatch list.
2621 */
2622static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2623{
2624	struct cfq_data *cfqd = q->elevator->elevator_data;
2625	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2626
2627	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2628
2629	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2630	cfq_remove_request(rq);
2631	cfqq->dispatched++;
2632	(RQ_CFQG(rq))->dispatched++;
2633	elv_dispatch_sort(q, rq);
2634
2635	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2636	cfqq->nr_sectors += blk_rq_sectors(rq);
2637	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2638}
2639
2640/*
2641 * return expired entry, or NULL to just start from scratch in rbtree
2642 */
2643static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2644{
2645	struct request *rq = NULL;
2646
2647	if (cfq_cfqq_fifo_expire(cfqq))
2648		return NULL;
2649
2650	cfq_mark_cfqq_fifo_expire(cfqq);
2651
2652	if (list_empty(&cfqq->fifo))
2653		return NULL;
2654
2655	rq = rq_entry_fifo(cfqq->fifo.next);
2656	if (time_before(jiffies, rq_fifo_time(rq)))
2657		rq = NULL;
2658
2659	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2660	return rq;
2661}
2662
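/*
 * Scale the number of requests an async queue may dispatch per slice by its
 * priority.  For example, with the default cfq_slice_async_rq of 2, an
 * ioprio 0 (highest) BE queue gets 2 * 2 * 8 = 32 requests per slice, while
 * an ioprio 7 queue gets 2 * 2 * 1 = 4.
 */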
2663static inline int
2664cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2665{
2666	const int base_rq = cfqd->cfq_slice_async_rq;
2667
2668	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2669
2670	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2671}
2672
2673/*
2674 * Must be called with the queue_lock held.
2675 */
2676static int cfqq_process_refs(struct cfq_queue *cfqq)
2677{
2678	int process_refs, io_refs;
2679
2680	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2681	process_refs = cfqq->ref - io_refs;
2682	BUG_ON(process_refs < 0);
2683	return process_refs;
2684}
2685
2686static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2687{
2688	int process_refs, new_process_refs;
2689	struct cfq_queue *__cfqq;
2690
2691	/*
2692	 * If there are no process references on the new_cfqq, then it is
2693	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2694	 * chain may have dropped their last reference (not just their
2695	 * last process reference).
2696	 */
2697	if (!cfqq_process_refs(new_cfqq))
2698		return;
2699
2700	/* Avoid a circular list and skip interim queue merges */
2701	while ((__cfqq = new_cfqq->new_cfqq)) {
2702		if (__cfqq == cfqq)
2703			return;
2704		new_cfqq = __cfqq;
2705	}
2706
2707	process_refs = cfqq_process_refs(cfqq);
2708	new_process_refs = cfqq_process_refs(new_cfqq);
2709	/*
2710	 * If the process for the cfqq has gone away, there is no
2711	 * sense in merging the queues.
2712	 */
2713	if (process_refs == 0 || new_process_refs == 0)
2714		return;
2715
2716	/*
2717	 * Merge in the direction of the lesser amount of work.
2718	 */
2719	if (new_process_refs >= process_refs) {
2720		cfqq->new_cfqq = new_cfqq;
2721		new_cfqq->ref += process_refs;
2722	} else {
2723		new_cfqq->new_cfqq = cfqq;
2724		cfqq->ref += new_process_refs;
2725	}
2726}
2727
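/*
 * Within the given workload class, pick the workload type (ASYNC,
 * SYNC_NOIDLE or SYNC) whose first queued cfqq has the smallest rb_key,
 * i.e. the one that has been waiting longest.  Defaults to SYNC_NOIDLE
 * when nothing is queued.
 */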
2728static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
2729			struct cfq_group *cfqg, enum wl_class_t wl_class)
2730{
2731	struct cfq_queue *queue;
2732	int i;
2733	bool key_valid = false;
2734	unsigned long lowest_key = 0;
2735	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2736
2737	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2738		/* select the one with lowest rb_key */
2739		queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2740		if (queue &&
2741		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
2742			lowest_key = queue->rb_key;
2743			cur_best = i;
2744			key_valid = true;
2745		}
2746	}
2747
2748	return cur_best;
2749}
2750
2751static void
2752choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2753{
2754	unsigned slice;
2755	unsigned count;
2756	struct cfq_rb_root *st;
2757	unsigned group_slice;
2758	enum wl_class_t original_class = cfqd->serving_wl_class;
2759
2760	/* Choose next priority. RT > BE > IDLE */
2761	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2762		cfqd->serving_wl_class = RT_WORKLOAD;
2763	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2764		cfqd->serving_wl_class = BE_WORKLOAD;
2765	else {
2766		cfqd->serving_wl_class = IDLE_WORKLOAD;
2767		cfqd->workload_expires = jiffies + 1;
2768		return;
2769	}
2770
2771	if (original_class != cfqd->serving_wl_class)
2772		goto new_workload;
2773
2774	/*
2775	 * For RT and BE, we also have to choose the type
2776	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2777	 * expiration time
2778	 */
2779	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2780	count = st->count;
2781
2782	/*
2783	 * check workload expiration, and that we still have other queues ready
2784	 */
2785	if (count && !time_after(jiffies, cfqd->workload_expires))
2786		return;
2787
2788new_workload:
2789	/* otherwise select new workload type */
2790	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
2791					cfqd->serving_wl_class);
2792	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2793	count = st->count;
2794
2795	/*
2796	 * the workload slice is computed as a fraction of target latency
2797	 * proportional to the number of queues in that workload, over
2798	 * all the queues in the same priority class
2799	 */
2800	group_slice = cfq_group_slice(cfqd, cfqg);
2801
2802	slice = group_slice * count /
2803		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
2804		      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
2805					cfqg));
2806
2807	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
2808		unsigned int tmp;
2809
2810		/*
2811		 * Async queues are currently system wide. Just taking the
2812		 * proportion of queues within the same group will lead to a
2813		 * higher async ratio system wide, as the root group generally
2814		 * has a higher weight. A more accurate approach would be to
2815		 * calculate a system wide async/sync ratio.
2816		 */
2817		tmp = cfqd->cfq_target_latency *
2818			cfqg_busy_async_queues(cfqd, cfqg);
2819		tmp = tmp/cfqd->busy_queues;
2820		slice = min_t(unsigned, slice, tmp);
2821
2822		/* async workload slice is scaled down according to
2823		 * the sync/async slice ratio. */
2824		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2825	} else
2826		/* sync workload slice is at least 2 * cfq_slice_idle */
2827		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2828
2829	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2830	cfq_log(cfqd, "workload slice:%d", slice);
2831	cfqd->workload_expires = jiffies + slice;
2832}
2833
2834static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2835{
2836	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2837	struct cfq_group *cfqg;
2838
2839	if (RB_EMPTY_ROOT(&st->rb))
2840		return NULL;
2841	cfqg = cfq_rb_first_group(st);
2842	update_min_vdisktime(st);
2843	return cfqg;
2844}
2845
2846static void cfq_choose_cfqg(struct cfq_data *cfqd)
2847{
2848	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2849
2850	cfqd->serving_group = cfqg;
2851
2852	/* Restore the workload type data */
2853	if (cfqg->saved_wl_slice) {
2854		cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
2855		cfqd->serving_wl_type = cfqg->saved_wl_type;
2856		cfqd->serving_wl_class = cfqg->saved_wl_class;
2857	} else
2858		cfqd->workload_expires = jiffies - 1;
2859
2860	choose_wl_class_and_type(cfqd, cfqg);
2861}
2862
2863/*
2864 * Select a queue for service. If we have a current active queue,
2865 * check whether to continue servicing it, or retrieve and set a new one.
2866 */
2867static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2868{
2869	struct cfq_queue *cfqq, *new_cfqq = NULL;
2870
2871	cfqq = cfqd->active_queue;
2872	if (!cfqq)
2873		goto new_queue;
2874
2875	if (!cfqd->rq_queued)
2876		return NULL;
2877
2878	/*
2879	 * We were waiting for group to get backlogged. Expire the queue
2880	 */
2881	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2882		goto expire;
2883
2884	/*
2885	 * The active queue has run out of time, expire it and select new.
2886	 */
2887	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2888		/*
2889		 * If the slice had not expired at the completion of the last
2890		 * request, we might not have turned on the wait_busy flag. Don't
2891		 * expire the queue yet. Allow the group to get backlogged.
2892		 *
2893		 * The very fact that we have used up the slice means we
2894		 * have been idling all along on this queue and it should be
2895		 * ok to wait for this request to complete.
2896		 */
2897		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2898		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2899			cfqq = NULL;
2900			goto keep_queue;
2901		} else
2902			goto check_group_idle;
2903	}
2904
2905	/*
2906	 * The active queue has requests and isn't expired, allow it to
2907	 * dispatch.
2908	 */
2909	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2910		goto keep_queue;
2911
2912	/*
2913	 * If another queue has a request waiting within our mean seek
2914	 * distance, let it run.  The expire code will check for close
2915	 * cooperators and put the close queue at the front of the service
2916	 * tree.  If possible, merge the expiring queue with the new cfqq.
2917	 */
2918	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2919	if (new_cfqq) {
2920		if (!cfqq->new_cfqq)
2921			cfq_setup_merge(cfqq, new_cfqq);
2922		goto expire;
2923	}
2924
2925	/*
2926	 * No requests pending. If the active queue still has requests in
2927	 * flight or is idling for a new request, allow either of these
2928	 * conditions to happen (or time out) before selecting a new queue.
2929	 */
2930	if (timer_pending(&cfqd->idle_slice_timer)) {
2931		cfqq = NULL;
2932		goto keep_queue;
2933	}
2934
2935	/*
2936	 * This is a deep seek queue, but the device is much faster than
2937	 * the queue can deliver, don't idle
2938	 */
2939	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2940	    (cfq_cfqq_slice_new(cfqq) ||
2941	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2942		cfq_clear_cfqq_deep(cfqq);
2943		cfq_clear_cfqq_idle_window(cfqq);
2944	}
2945
2946	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2947		cfqq = NULL;
2948		goto keep_queue;
2949	}
2950
2951	/*
2952	 * If group idle is enabled and there are requests dispatched from
2953	 * this group, wait for requests to complete.
2954	 */
2955check_group_idle:
2956	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
2957	    cfqq->cfqg->dispatched &&
2958	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
2959		cfqq = NULL;
2960		goto keep_queue;
2961	}
2962
2963expire:
2964	cfq_slice_expired(cfqd, 0);
2965new_queue:
2966	/*
2967	 * Current queue expired. Check if we have to switch to a new
2968	 * service tree
2969	 */
2970	if (!new_cfqq)
2971		cfq_choose_cfqg(cfqd);
2972
2973	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2974keep_queue:
2975	return cfqq;
2976}
2977
2978static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2979{
2980	int dispatched = 0;
2981
2982	while (cfqq->next_rq) {
2983		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2984		dispatched++;
2985	}
2986
2987	BUG_ON(!list_empty(&cfqq->fifo));
2988
2989	/* By default cfqq is not expired if it is empty. Do it explicitly */
2990	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2991	return dispatched;
2992}
2993
2994/*
2995 * Drain our current requests. Used for barriers and when switching
2996 * io schedulers on-the-fly.
2997 */
2998static int cfq_forced_dispatch(struct cfq_data *cfqd)
2999{
3000	struct cfq_queue *cfqq;
3001	int dispatched = 0;
3002
3003	/* Expire the timeslice of the current active queue first */
3004	cfq_slice_expired(cfqd, 0);
3005	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3006		__cfq_set_active_queue(cfqd, cfqq);
3007		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3008	}
3009
3010	BUG_ON(cfqd->busy_queues);
3011
3012	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3013	return dispatched;
3014}
3015
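/*
 * Rough estimate of whether the remaining slice will be eaten up by the
 * requests this queue has already dispatched, assuming each of them costs
 * about one cfq_slice_idle to complete.  A brand new slice cannot be
 * estimated and is treated as "used soon".
 */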
3016static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3017	struct cfq_queue *cfqq)
3018{
3019	/* the queue hasn't finished any request, can't estimate */
3020	if (cfq_cfqq_slice_new(cfqq))
3021		return true;
3022	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
3023		cfqq->slice_end))
3024		return true;
3025
3026	return false;
3027}
3028
3029static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3030{
3031	unsigned int max_dispatch;
3032
3033	/*
3034	 * Drain async requests before we start sync IO
3035	 */
3036	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3037		return false;
3038
3039	/*
3040	 * If this is an async queue and we have sync IO in flight, let it wait
3041	 */
3042	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3043		return false;
3044
3045	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3046	if (cfq_class_idle(cfqq))
3047		max_dispatch = 1;
3048
3049	/*
3050	 * Does this cfqq already have too much IO in flight?
3051	 */
3052	if (cfqq->dispatched >= max_dispatch) {
3053		bool promote_sync = false;
3054		/*
3055		 * idle queue must always only have a single IO in flight
3056		 */
3057		if (cfq_class_idle(cfqq))
3058			return false;
3059
3060		/*
3061		 * If there is only one sync queue,
3062		 * we can ignore the async queues here and give the sync
3063		 * queue no dispatch limit. The reason is that a sync queue can
3064		 * preempt an async queue, so limiting the sync queue doesn't
3065		 * make sense. This is useful for the aiostress test.
3066		 */
3067		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3068			promote_sync = true;
3069
3070		/*
3071		 * We have other queues, don't allow more IO from this one
3072		 */
3073		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3074				!promote_sync)
3075			return false;
3076
3077		/*
3078		 * Sole queue user, no limit
3079		 */
3080		if (cfqd->busy_queues == 1 || promote_sync)
3081			max_dispatch = -1;
3082		else
3083			/*
3084			 * Normally we start throttling cfqq when cfq_quantum/2
3085			 * requests have been dispatched. But we can drive
3086			 * deeper queue depths at the beginning of the slice,
3087			 * subject to the upper limit of cfq_quantum.
3088			 */
3089			max_dispatch = cfqd->cfq_quantum;
3090	}
3091
3092	/*
3093	 * Async queues must wait a bit before being allowed dispatch.
3094	 * We also ramp up the dispatch depth gradually for async IO,
3095	 * based on the last sync IO we serviced
3096	 */
3097	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3098		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
3099		unsigned int depth;
3100
3101		depth = last_sync / cfqd->cfq_slice[1];
3102		if (!depth && !cfqq->dispatched)
3103			depth = 1;
3104		if (depth < max_dispatch)
3105			max_dispatch = depth;
3106	}
3107
3108	/*
3109	 * If we're below the current max, allow a dispatch
3110	 */
3111	return cfqq->dispatched < max_dispatch;
3112}
3113
3114/*
3115 * Dispatch a request from cfqq, moving them to the request queue
3116 * dispatch list.
3117 */
3118static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3119{
3120	struct request *rq;
3121
3122	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3123
3124	if (!cfq_may_dispatch(cfqd, cfqq))
3125		return false;
3126
3127	/*
3128	 * follow expired path, else get first next available
3129	 */
3130	rq = cfq_check_fifo(cfqq);
3131	if (!rq)
3132		rq = cfqq->next_rq;
3133
3134	/*
3135	 * insert request into driver dispatch list
3136	 */
3137	cfq_dispatch_insert(cfqd->queue, rq);
3138
3139	if (!cfqd->active_cic) {
3140		struct cfq_io_cq *cic = RQ_CIC(rq);
3141
3142		atomic_long_inc(&cic->icq.ioc->refcount);
3143		cfqd->active_cic = cic;
3144	}
3145
3146	return true;
3147}
3148
3149/*
3150 * Find the cfqq that we need to service and move a request from that to the
3151 * dispatch list
3152 */
3153static int cfq_dispatch_requests(struct request_queue *q, int force)
3154{
3155	struct cfq_data *cfqd = q->elevator->elevator_data;
3156	struct cfq_queue *cfqq;
3157
3158	if (!cfqd->busy_queues)
3159		return 0;
3160
3161	if (unlikely(force))
3162		return cfq_forced_dispatch(cfqd);
3163
3164	cfqq = cfq_select_queue(cfqd);
3165	if (!cfqq)
3166		return 0;
3167
3168	/*
3169	 * Dispatch a request from this cfqq, if it is allowed
3170	 */
3171	if (!cfq_dispatch_request(cfqd, cfqq))
3172		return 0;
3173
3174	cfqq->slice_dispatch++;
3175	cfq_clear_cfqq_must_dispatch(cfqq);
3176
3177	/*
3178	 * expire an async queue immediately if it has used up its slice. An
3179	 * idle queue always expires after 1 dispatch round.
3180	 */
3181	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3182	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3183	    cfq_class_idle(cfqq))) {
3184		cfqq->slice_end = jiffies + 1;
3185		cfq_slice_expired(cfqd, 0);
3186	}
3187
3188	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3189	return 1;
3190}
3191
3192/*
3193 * task holds one reference to the queue, dropped when task exits. each rq
3194 * in-flight on this queue also holds a reference, dropped when rq is freed.
3195 *
3196 * Each cfq queue took a reference on the parent group. Drop it now.
3197 * queue lock must be held here.
3198 */
3199static void cfq_put_queue(struct cfq_queue *cfqq)
3200{
3201	struct cfq_data *cfqd = cfqq->cfqd;
3202	struct cfq_group *cfqg;
3203
3204	BUG_ON(cfqq->ref <= 0);
3205
3206	cfqq->ref--;
3207	if (cfqq->ref)
3208		return;
3209
3210	cfq_log_cfqq(cfqd, cfqq, "put_queue");
3211	BUG_ON(rb_first(&cfqq->sort_list));
3212	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3213	cfqg = cfqq->cfqg;
3214
3215	if (unlikely(cfqd->active_queue == cfqq)) {
3216		__cfq_slice_expired(cfqd, cfqq, 0);
3217		cfq_schedule_dispatch(cfqd);
3218	}
3219
3220	BUG_ON(cfq_cfqq_on_rr(cfqq));
3221	kmem_cache_free(cfq_pool, cfqq);
3222	cfqg_put(cfqg);
3223}
3224
3225static void cfq_put_cooperator(struct cfq_queue *cfqq)
3226{
3227	struct cfq_queue *__cfqq, *next;
3228
3229	/*
3230	 * If this queue was scheduled to merge with another queue, be
3231	 * sure to drop the reference taken on that queue (and others in
3232	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3233	 */
3234	__cfqq = cfqq->new_cfqq;
3235	while (__cfqq) {
3236		if (__cfqq == cfqq) {
3237			WARN(1, "cfqq->new_cfqq loop detected\n");
3238			break;
3239		}
3240		next = __cfqq->new_cfqq;
3241		cfq_put_queue(__cfqq);
3242		__cfqq = next;
3243	}
3244}
3245
3246static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3247{
3248	if (unlikely(cfqq == cfqd->active_queue)) {
3249		__cfq_slice_expired(cfqd, cfqq, 0);
3250		cfq_schedule_dispatch(cfqd);
3251	}
3252
3253	cfq_put_cooperator(cfqq);
3254
3255	cfq_put_queue(cfqq);
3256}
3257
3258static void cfq_init_icq(struct io_cq *icq)
3259{
3260	struct cfq_io_cq *cic = icq_to_cic(icq);
3261
3262	cic->ttime.last_end_request = jiffies;
3263}
3264
3265static void cfq_exit_icq(struct io_cq *icq)
3266{
3267	struct cfq_io_cq *cic = icq_to_cic(icq);
3268	struct cfq_data *cfqd = cic_to_cfqd(cic);
3269
3270	if (cic->cfqq[BLK_RW_ASYNC]) {
3271		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
3272		cic->cfqq[BLK_RW_ASYNC] = NULL;
3273	}
3274
3275	if (cic->cfqq[BLK_RW_SYNC]) {
3276		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
3277		cic->cfqq[BLK_RW_SYNC] = NULL;
3278	}
3279}
3280
3281static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3282{
3283	struct task_struct *tsk = current;
3284	int ioprio_class;
3285
3286	if (!cfq_cfqq_prio_changed(cfqq))
3287		return;
3288
3289	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3290	switch (ioprio_class) {
3291	default:
3292		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3293	case IOPRIO_CLASS_NONE:
3294		/*
3295		 * no prio set, inherit CPU scheduling settings
3296		 */
3297		cfqq->ioprio = task_nice_ioprio(tsk);
3298		cfqq->ioprio_class = task_nice_ioclass(tsk);
3299		break;
3300	case IOPRIO_CLASS_RT:
3301		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3302		cfqq->ioprio_class = IOPRIO_CLASS_RT;
3303		break;
3304	case IOPRIO_CLASS_BE:
3305		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3306		cfqq->ioprio_class = IOPRIO_CLASS_BE;
3307		break;
3308	case IOPRIO_CLASS_IDLE:
3309		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3310		cfqq->ioprio = 7;
3311		cfq_clear_cfqq_idle_window(cfqq);
3312		break;
3313	}
3314
3315	/*
3316	 * keep track of original prio settings in case we have to temporarily
3317	 * elevate the priority of this queue
3318	 */
3319	cfqq->org_ioprio = cfqq->ioprio;
3320	cfq_clear_cfqq_prio_changed(cfqq);
3321}
3322
3323static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3324{
3325	int ioprio = cic->icq.ioc->ioprio;
3326	struct cfq_data *cfqd = cic_to_cfqd(cic);
3327	struct cfq_queue *cfqq;
3328
3329	/*
3330	 * Check whether ioprio has changed.  The condition may trigger
3331	 * spuriously on a newly created cic but there's no harm.
3332	 */
3333	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3334		return;
3335
3336	cfqq = cic->cfqq[BLK_RW_ASYNC];
3337	if (cfqq) {
3338		struct cfq_queue *new_cfqq;
3339		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
3340					 GFP_ATOMIC);
3341		if (new_cfqq) {
3342			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3343			cfq_put_queue(cfqq);
3344		}
3345	}
3346
3347	cfqq = cic->cfqq[BLK_RW_SYNC];
3348	if (cfqq)
3349		cfq_mark_cfqq_prio_changed(cfqq);
3350
3351	cic->ioprio = ioprio;
3352}
3353
3354static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3355			  pid_t pid, bool is_sync)
3356{
3357	RB_CLEAR_NODE(&cfqq->rb_node);
3358	RB_CLEAR_NODE(&cfqq->p_node);
3359	INIT_LIST_HEAD(&cfqq->fifo);
3360
3361	cfqq->ref = 0;
3362	cfqq->cfqd = cfqd;
3363
3364	cfq_mark_cfqq_prio_changed(cfqq);
3365
3366	if (is_sync) {
3367		if (!cfq_class_idle(cfqq))
3368			cfq_mark_cfqq_idle_window(cfqq);
3369		cfq_mark_cfqq_sync(cfqq);
3370	}
3371	cfqq->pid = pid;
3372}
3373
3374#ifdef CONFIG_CFQ_GROUP_IOSCHED
3375static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3376{
3377	struct cfq_data *cfqd = cic_to_cfqd(cic);
3378	struct cfq_queue *sync_cfqq;
3379	uint64_t id;
3380
3381	rcu_read_lock();
3382	id = bio_blkcg(bio)->id;
3383	rcu_read_unlock();
3384
3385	/*
3386	 * Check whether blkcg has changed.  The condition may trigger
3387	 * spuriously on a newly created cic but there's no harm.
3388	 */
3389	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3390		return;
3391
3392	sync_cfqq = cic_to_cfqq(cic, 1);
3393	if (sync_cfqq) {
3394		/*
3395		 * Drop reference to sync queue. A new sync queue will be
3396		 * assigned in new group upon arrival of a fresh request.
3397		 */
3398		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
3399		cic_set_cfqq(cic, NULL, 1);
3400		cfq_put_queue(sync_cfqq);
3401	}
3402
3403	cic->blkcg_id = id;
3404}
3405#else
3406static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3407#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3408
3409static struct cfq_queue *
3410cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3411		     struct bio *bio, gfp_t gfp_mask)
3412{
3413	struct blkcg *blkcg;
3414	struct cfq_queue *cfqq, *new_cfqq = NULL;
3415	struct cfq_group *cfqg;
3416
3417retry:
3418	rcu_read_lock();
3419
3420	blkcg = bio_blkcg(bio);
3421	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3422	cfqq = cic_to_cfqq(cic, is_sync);
3423
3424	/*
3425	 * Always try a new alloc if we fell back to the OOM cfqq
3426	 * originally, since it should just be a temporary situation.
3427	 */
3428	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3429		cfqq = NULL;
3430		if (new_cfqq) {
3431			cfqq = new_cfqq;
3432			new_cfqq = NULL;
3433		} else if (gfp_mask & __GFP_WAIT) {
3434			rcu_read_unlock();
3435			spin_unlock_irq(cfqd->queue->queue_lock);
3436			new_cfqq = kmem_cache_alloc_node(cfq_pool,
3437					gfp_mask | __GFP_ZERO,
3438					cfqd->queue->node);
3439			spin_lock_irq(cfqd->queue->queue_lock);
3440			if (new_cfqq)
3441				goto retry;
3442		} else {
3443			cfqq = kmem_cache_alloc_node(cfq_pool,
3444					gfp_mask | __GFP_ZERO,
3445					cfqd->queue->node);
3446		}
3447
3448		if (cfqq) {
3449			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3450			cfq_init_prio_data(cfqq, cic);
3451			cfq_link_cfqq_cfqg(cfqq, cfqg);
3452			cfq_log_cfqq(cfqd, cfqq, "alloced");
3453		} else
3454			cfqq = &cfqd->oom_cfqq;
3455	}
3456
3457	if (new_cfqq)
3458		kmem_cache_free(cfq_pool, new_cfqq);
3459
3460	rcu_read_unlock();
3461	return cfqq;
3462}
3463
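/*
 * Return the slot holding the shared, per-device async queue for this
 * (class, priority) pair: one per RT and BE priority level, plus a single
 * idle queue.  IOPRIO_CLASS_NONE falls back to the normal BE level.
 */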
3464static struct cfq_queue **
3465cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3466{
3467	switch (ioprio_class) {
3468	case IOPRIO_CLASS_RT:
3469		return &cfqd->async_cfqq[0][ioprio];
3470	case IOPRIO_CLASS_NONE:
3471		ioprio = IOPRIO_NORM;
3472		/* fall through */
3473	case IOPRIO_CLASS_BE:
3474		return &cfqd->async_cfqq[1][ioprio];
3475	case IOPRIO_CLASS_IDLE:
3476		return &cfqd->async_idle_cfqq;
3477	default:
3478		BUG();
3479	}
3480}
3481
3482static struct cfq_queue *
3483cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3484	      struct bio *bio, gfp_t gfp_mask)
3485{
3486	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3487	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3488	struct cfq_queue **async_cfqq = NULL;
3489	struct cfq_queue *cfqq = NULL;
3490
3491	if (!is_sync) {
3492		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
3493		cfqq = *async_cfqq;
3494	}
3495
3496	if (!cfqq)
3497		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
3498
3499	/*
3500	 * pin the queue now that it's allocated, scheduler exit will prune it
3501	 */
3502	if (!is_sync && !(*async_cfqq)) {
3503		cfqq->ref++;
3504		*async_cfqq = cfqq;
3505	}
3506
3507	cfqq->ref++;
3508	return cfqq;
3509}
3510
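/*
 * Think time is tracked as an exponentially weighted moving average: each
 * new sample (the gap between the last completion and this request, clamped
 * to 2 * slice_idle) is folded in with a weight of 1/8, and ttime_mean is
 * the resulting running average.
 */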
3511static void
3512__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3513{
3514	unsigned long elapsed = jiffies - ttime->last_end_request;
3515	elapsed = min(elapsed, 2UL * slice_idle);
3516
3517	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3518	ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
3519	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
3520}
3521
3522static void
3523cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3524			struct cfq_io_cq *cic)
3525{
3526	if (cfq_cfqq_sync(cfqq)) {
3527		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3528		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3529			cfqd->cfq_slice_idle);
3530	}
3531#ifdef CONFIG_CFQ_GROUP_IOSCHED
3532	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3533#endif
3534}
3535
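/*
 * Maintain the per-queue seek history as a 32-bit shift register: each new
 * request shifts in one bit.  On rotational storage the bit is set when the
 * seek distance from the previous request exceeds CFQQ_SEEK_THR; on
 * non-rotational devices it is set for small requests (below
 * CFQQ_SECT_THR_NONROT).  CFQQ_SEEKY() then flags the queue as seeky once
 * more than 1/8 of the recorded samples have the bit set.
 */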
3536static void
3537cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3538		       struct request *rq)
3539{
3540	sector_t sdist = 0;
3541	sector_t n_sec = blk_rq_sectors(rq);
3542	if (cfqq->last_request_pos) {
3543		if (cfqq->last_request_pos < blk_rq_pos(rq))
3544			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3545		else
3546			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3547	}
3548
3549	cfqq->seek_history <<= 1;
3550	if (blk_queue_nonrot(cfqd->queue))
3551		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3552	else
3553		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3554}
3555
3556/*
3557 * Disable idle window if the process thinks too long or seeks so much that
3558 * it doesn't matter
3559 */
3560static void
3561cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3562		       struct cfq_io_cq *cic)
3563{
3564	int old_idle, enable_idle;
3565
3566	/*
3567	 * Don't idle for async or idle io prio class
3568	 */
3569	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3570		return;
3571
3572	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3573
3574	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3575		cfq_mark_cfqq_deep(cfqq);
3576
3577	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3578		enable_idle = 0;
3579	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3580		 !cfqd->cfq_slice_idle ||
3581		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3582		enable_idle = 0;
3583	else if (sample_valid(cic->ttime.ttime_samples)) {
3584		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3585			enable_idle = 0;
3586		else
3587			enable_idle = 1;
3588	}
3589
3590	if (old_idle != enable_idle) {
3591		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3592		if (enable_idle)
3593			cfq_mark_cfqq_idle_window(cfqq);
3594		else
3595			cfq_clear_cfqq_idle_window(cfqq);
3596	}
3597}
3598
3599/*
3600 * Check if new_cfqq should preempt the currently active queue. Return false
3601 * if not (or if we aren't sure); returning true will cause a preemption.
3602 */
3603static bool
3604cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3605		   struct request *rq)
3606{
3607	struct cfq_queue *cfqq;
3608
3609	cfqq = cfqd->active_queue;
3610	if (!cfqq)
3611		return false;
3612
3613	if (cfq_class_idle(new_cfqq))
3614		return false;
3615
3616	if (cfq_class_idle(cfqq))
3617		return true;
3618
3619	/*
3620	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3621	 */
3622	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3623		return false;
3624
3625	/*
3626	 * if the new request is sync, but the currently running queue is
3627	 * not, let the sync request have priority.
3628	 */
3629	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3630		return true;
3631
3632	if (new_cfqq->cfqg != cfqq->cfqg)
3633		return false;
3634
3635	if (cfq_slice_used(cfqq))
3636		return true;
3637
3638	/* Allow preemption only if we are idling on sync-noidle tree */
3639	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3640	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3641	    new_cfqq->service_tree->count == 2 &&
3642	    RB_EMPTY_ROOT(&cfqq->sort_list))
3643		return true;
3644
3645	/*
3646	 * So both queues are sync. Let the new request get disk time if
3647	 * it's a metadata request and the current queue is doing regular IO.
3648	 */
3649	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3650		return true;
3651
3652	/*
3653	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3654	 */
3655	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3656		return true;
3657
3658	/* An idle queue should not be idle now for some reason */
3659	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3660		return true;
3661
3662	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3663		return false;
3664
3665	/*
3666	 * if this request is as-good as one we would expect from the
3667	 * current cfqq, let it preempt
3668	 */
3669	if (cfq_rq_close(cfqd, cfqq, rq))
3670		return true;
3671
3672	return false;
3673}
3674
3675/*
3676 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3677 * let it have half of its nominal slice.
3678 */
3679static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3680{
3681	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3682
3683	cfq_log_cfqq(cfqd, cfqq, "preempt");
3684	cfq_slice_expired(cfqd, 1);
3685
3686	/*
3687	 * the workload type has changed; don't save the slice, otherwise the
3688	 * preemption won't take effect
3689	 */
3690	if (old_type != cfqq_type(cfqq))
3691		cfqq->cfqg->saved_wl_slice = 0;
3692
3693	/*
3694	 * Put the new queue at the front of the current list,
3695	 * so we know that it will be selected next.
3696	 */
3697	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3698
3699	cfq_service_tree_add(cfqd, cfqq, 1);
3700
3701	cfqq->slice_end = 0;
3702	cfq_mark_cfqq_slice_new(cfqq);
3703}
3704
3705/*
3706 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3707 * something we should do about it
3708 */
3709static void
3710cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3711		struct request *rq)
3712{
3713	struct cfq_io_cq *cic = RQ_CIC(rq);
3714
3715	cfqd->rq_queued++;
3716	if (rq->cmd_flags & REQ_PRIO)
3717		cfqq->prio_pending++;
3718
3719	cfq_update_io_thinktime(cfqd, cfqq, cic);
3720	cfq_update_io_seektime(cfqd, cfqq, rq);
3721	cfq_update_idle_window(cfqd, cfqq, cic);
3722
3723	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3724
3725	if (cfqq == cfqd->active_queue) {
3726		/*
3727		 * Remember that we saw a request from this process, but
3728		 * don't start queuing just yet. Otherwise we risk seeing lots
3729		 * of tiny requests, because we disrupt the normal plugging
3730		 * and merging. If the request is already larger than a single
3731		 * page, let it rip immediately. For that case we assume that
3732		 * merging is already done. Ditto for a busy system that
3733		 * has other work pending, don't risk delaying until the
3734		 * idle timer unplug to continue working.
3735		 */
3736		if (cfq_cfqq_wait_request(cfqq)) {
3737			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3738			    cfqd->busy_queues > 1) {
3739				cfq_del_timer(cfqd, cfqq);
3740				cfq_clear_cfqq_wait_request(cfqq);
3741				__blk_run_queue(cfqd->queue);
3742			} else {
3743				cfqg_stats_update_idle_time(cfqq->cfqg);
3744				cfq_mark_cfqq_must_dispatch(cfqq);
3745			}
3746		}
3747	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3748		/*
3749		 * not the active queue - expire current slice if it is
3750		 * idle and has expired its mean thinktime or this new queue
3751		 * has some old slice time left and is of higher priority or
3752		 * this new queue is RT and the current one is BE
3753		 */
3754		cfq_preempt_queue(cfqd, cfqq);
3755		__blk_run_queue(cfqd->queue);
3756	}
3757}
3758
3759static void cfq_insert_request(struct request_queue *q, struct request *rq)
3760{
3761	struct cfq_data *cfqd = q->elevator->elevator_data;
3762	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3763
3764	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3765	cfq_init_prio_data(cfqq, RQ_CIC(rq));
3766
3767	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3768	list_add_tail(&rq->queuelist, &cfqq->fifo);
3769	cfq_add_rq_rb(rq);
3770	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
3771				 rq->cmd_flags);
3772	cfq_rq_enqueued(cfqd, cfqq, rq);
3773}
3774
3775/*
3776 * Update hw_tag based on peak queue depth over 50 samples under
3777 * sufficient load.
3778 */
3779static void cfq_update_hw_tag(struct cfq_data *cfqd)
3780{
3781	struct cfq_queue *cfqq = cfqd->active_queue;
3782
3783	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3784		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3785
3786	if (cfqd->hw_tag == 1)
3787		return;
3788
3789	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3790	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3791		return;
3792
3793	/*
3794	 * If the active queue doesn't have enough requests and can idle, cfq
3795	 * might not dispatch sufficient requests to the hardware. Don't zero
3796	 * hw_tag in this case.
3797	 */
3798	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3799	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3800	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3801		return;
3802
3803	if (cfqd->hw_tag_samples++ < 50)
3804		return;
3805
3806	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3807		cfqd->hw_tag = 1;
3808	else
3809		cfqd->hw_tag = 0;
3810}
3811
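/*
 * Decide whether the active queue, now out of requests, should be kept
 * "wait busy" for a short while rather than being expired immediately.
 */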
3812static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3813{
3814	struct cfq_io_cq *cic = cfqd->active_cic;
3815
3816	/* If the queue already has requests, don't wait */
3817	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3818		return false;
3819
3820	/* If there are other queues in the group, don't wait */
3821	if (cfqq->cfqg->nr_cfqq > 1)
3822		return false;
3823
3824	/* the only queue in the group, but think time is big */
3825	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
3826		return false;
3827
3828	if (cfq_slice_used(cfqq))
3829		return true;
3830
3831	/* if slice left is less than think time, wait busy */
3832	if (cic && sample_valid(cic->ttime.ttime_samples)
3833	    && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3834		return true;
3835
3836	/*
3837	 * If the think time is less than a jiffy, then ttime_mean=0 and the check
3838	 * above will not be true. It might happen that the slice has not expired yet
3839	 * but will expire soon (4-5 ns) during select_queue(). To cover the
3840	 * case where think time is less than a jiffy, mark the queue wait
3841	 * busy if only 1 jiffy is left in the slice.
3842	 */
3843	if (cfqq->slice_end - jiffies == 1)
3844		return true;
3845
3846	return false;
3847}
3848
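/*
 * A request has completed: update dispatch accounting and think time
 * statistics, then decide whether to expire or idle the active queue.
 */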
3849static void cfq_completed_request(struct request_queue *q, struct request *rq)
3850{
3851	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3852	struct cfq_data *cfqd = cfqq->cfqd;
3853	const int sync = rq_is_sync(rq);
3854	unsigned long now;
3855
3856	now = jiffies;
3857	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3858		     !!(rq->cmd_flags & REQ_NOIDLE));
3859
3860	cfq_update_hw_tag(cfqd);
3861
3862	WARN_ON(!cfqd->rq_in_driver);
3863	WARN_ON(!cfqq->dispatched);
3864	cfqd->rq_in_driver--;
3865	cfqq->dispatched--;
3866	(RQ_CFQG(rq))->dispatched--;
3867	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
3868				     rq_io_start_time_ns(rq), rq->cmd_flags);
3869
3870	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3871
3872	if (sync) {
3873		struct cfq_rb_root *st;
3874
3875		RQ_CIC(rq)->ttime.last_end_request = now;
3876
3877		if (cfq_cfqq_on_rr(cfqq))
3878			st = cfqq->service_tree;
3879		else
3880			st = st_for(cfqq->cfqg, cfqq_class(cfqq),
3881					cfqq_type(cfqq));
3882
3883		st->ttime.last_end_request = now;
3884		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3885			cfqd->last_delayed_sync = now;
3886	}
3887
3888#ifdef CONFIG_CFQ_GROUP_IOSCHED
3889	cfqq->cfqg->ttime.last_end_request = now;
3890#endif
3891
3892	/*
3893	 * If this is the active queue, check if it needs to be expired,
3894	 * or if we want to idle in case it has no pending requests.
3895	 */
3896	if (cfqd->active_queue == cfqq) {
3897		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3898
3899		if (cfq_cfqq_slice_new(cfqq)) {
3900			cfq_set_prio_slice(cfqd, cfqq);
3901			cfq_clear_cfqq_slice_new(cfqq);
3902		}
3903
3904		/*
3905		 * Should we wait for next request to come in before we expire
3906		 * the queue.
3907		 */
3908		if (cfq_should_wait_busy(cfqd, cfqq)) {
3909			unsigned long extend_sl = cfqd->cfq_slice_idle;
3910			if (!cfqd->cfq_slice_idle)
3911				extend_sl = cfqd->cfq_group_idle;
3912			cfqq->slice_end = jiffies + extend_sl;
3913			cfq_mark_cfqq_wait_busy(cfqq);
3914			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3915		}
3916
3917		/*
3918		 * Idling is not enabled on:
3919		 * - expired queues
3920		 * - idle-priority queues
3921		 * - async queues
3922		 * - queues that still have requests queued
3923		 * - queues with a close cooperator
3924		 */
3925		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3926			cfq_slice_expired(cfqd, 1);
3927		else if (sync && cfqq_empty &&
3928			 !cfq_close_cooperator(cfqd, cfqq)) {
3929			cfq_arm_slice_timer(cfqd);
3930		}
3931	}
3932
3933	if (!cfqd->rq_in_driver)
3934		cfq_schedule_dispatch(cfqd);
3935}
3936
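/*
 * If the queue is idling while waiting for a request and hasn't claimed
 * its must-allocate slot yet, force this request through (ELV_MQUEUE_MUST);
 * otherwise just allow it (ELV_MQUEUE_MAY).
 */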
3937static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3938{
3939	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3940		cfq_mark_cfqq_must_alloc_slice(cfqq);
3941		return ELV_MQUEUE_MUST;
3942	}
3943
3944	return ELV_MQUEUE_MAY;
3945}
3946
3947static int cfq_may_queue(struct request_queue *q, int rw)
3948{
3949	struct cfq_data *cfqd = q->elevator->elevator_data;
3950	struct task_struct *tsk = current;
3951	struct cfq_io_cq *cic;
3952	struct cfq_queue *cfqq;
3953
3954	/*
3955	 * Don't force setup of a queue from here, as a call to may_queue
3956	 * does not necessarily imply that a request will actually be queued.
3957	 * So just look up a possibly existing queue, or return 'may queue'
3958	 * if that fails.
3959	 */
3960	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3961	if (!cic)
3962		return ELV_MQUEUE_MAY;
3963
3964	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3965	if (cfqq) {
3966		cfq_init_prio_data(cfqq, cic);
3967
3968		return __cfq_may_queue(cfqq);
3969	}
3970
3971	return ELV_MQUEUE_MAY;
3972}
3973
3974/*
3975 * queue lock held here
3976 */
3977static void cfq_put_request(struct request *rq)
3978{
3979	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3980
3981	if (cfqq) {
3982		const int rw = rq_data_dir(rq);
3983
3984		BUG_ON(!cfqq->allocated[rw]);
3985		cfqq->allocated[rw]--;
3986
3987		/* Put down rq reference on cfqg */
3988		cfqg_put(RQ_CFQG(rq));
3989		rq->elv.priv[0] = NULL;
3990		rq->elv.priv[1] = NULL;
3991
3992		cfq_put_queue(cfqq);
3993	}
3994}
3995
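/*
 * Switch the cic over to the queue we were scheduled to merge with, mark
 * the new queue as a cooperator and drop our reference on the old one.
 */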
3996static struct cfq_queue *
3997cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3998		struct cfq_queue *cfqq)
3999{
4000	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4001	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4002	cfq_mark_cfqq_coop(cfqq->new_cfqq);
4003	cfq_put_queue(cfqq);
4004	return cic_to_cfqq(cic, 1);
4005}
4006
4007/*
4008 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4009 * was the last process referring to said cfqq.
4010 */
4011static struct cfq_queue *
4012split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4013{
4014	if (cfqq_process_refs(cfqq) == 1) {
4015		cfqq->pid = current->pid;
4016		cfq_clear_cfqq_coop(cfqq);
4017		cfq_clear_cfqq_split_coop(cfqq);
4018		return cfqq;
4019	}
4020
4021	cic_set_cfqq(cic, NULL, 1);
4022
4023	cfq_put_cooperator(cfqq);
4024
4025	cfq_put_queue(cfqq);
4026	return NULL;
4027}
4028/*
4029 * Allocate cfq data structures associated with this request.
4030 */
4031static int
4032cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4033		gfp_t gfp_mask)
4034{
4035	struct cfq_data *cfqd = q->elevator->elevator_data;
4036	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4037	const int rw = rq_data_dir(rq);
4038	const bool is_sync = rq_is_sync(rq);
4039	struct cfq_queue *cfqq;
4040
4041	might_sleep_if(gfp_mask & __GFP_WAIT);
4042
4043	spin_lock_irq(q->queue_lock);
4044
4045	check_ioprio_changed(cic, bio);
4046	check_blkcg_changed(cic, bio);
4047new_queue:
4048	cfqq = cic_to_cfqq(cic, is_sync);
4049	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4050		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
4051		cic_set_cfqq(cic, cfqq, is_sync);
4052	} else {
4053		/*
4054		 * If the queue was seeky for too long, break it apart.
4055		 */
4056		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4057			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4058			cfqq = split_cfqq(cic, cfqq);
4059			if (!cfqq)
4060				goto new_queue;
4061		}
4062
4063		/*
4064		 * Check to see if this queue is scheduled to merge with
4065		 * another, closely cooperating queue.  The merging of
4066		 * queues happens here as it must be done in process context.
4067		 * The reference on new_cfqq was taken in merge_cfqqs.
4068		 */
4069		if (cfqq->new_cfqq)
4070			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4071	}
4072
4073	cfqq->allocated[rw]++;
4074
4075	cfqq->ref++;
4076	cfqg_get(cfqq->cfqg);
4077	rq->elv.priv[0] = cfqq;
4078	rq->elv.priv[1] = cfqq->cfqg;
4079	spin_unlock_irq(q->queue_lock);
4080	return 0;
4081}
4082
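/*
 * Unplug work handler: re-run the request queue from process context.
 */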
4083static void cfq_kick_queue(struct work_struct *work)
4084{
4085	struct cfq_data *cfqd =
4086		container_of(work, struct cfq_data, unplug_work);
4087	struct request_queue *q = cfqd->queue;
4088
4089	spin_lock_irq(q->queue_lock);
4090	__blk_run_queue(cfqd->queue);
4091	spin_unlock_irq(q->queue_lock);
4092}
4093
4094/*
4095 * Timer running if the active_queue is currently idling inside its time slice
4096 */
4097static void cfq_idle_slice_timer(unsigned long data)
4098{
4099	struct cfq_data *cfqd = (struct cfq_data *) data;
4100	struct cfq_queue *cfqq;
4101	unsigned long flags;
4102	int timed_out = 1;
4103
4104	cfq_log(cfqd, "idle timer fired");
4105
4106	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4107
4108	cfqq = cfqd->active_queue;
4109	if (cfqq) {
4110		timed_out = 0;
4111
4112		/*
4113		 * We saw a request before the queue expired, let it through
4114		 */
4115		if (cfq_cfqq_must_dispatch(cfqq))
4116			goto out_kick;
4117
4118		/*
4119		 * expired
4120		 */
4121		if (cfq_slice_used(cfqq))
4122			goto expire;
4123
4124		/*
4125		 * only expire and reinvoke the request handler if there are
4126		 * other queues with pending requests
4127		 */
4128		if (!cfqd->busy_queues)
4129			goto out_cont;
4130
4131		/*
4132		 * not expired and it has a request pending, let it dispatch
4133		 */
4134		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4135			goto out_kick;
4136
4137		/*
4138		 * The queue depth flag is reset only when idling didn't succeed
4139		 */
4140		cfq_clear_cfqq_deep(cfqq);
4141	}
4142expire:
4143	cfq_slice_expired(cfqd, timed_out);
4144out_kick:
4145	cfq_schedule_dispatch(cfqd);
4146out_cont:
4147	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4148}
4149
4150static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4151{
4152	del_timer_sync(&cfqd->idle_slice_timer);
4153	cancel_work_sync(&cfqd->unplug_work);
4154}
4155
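/*
 * Drop the references held on the shared async queues, including the
 * idle-class one.
 */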
4156static void cfq_put_async_queues(struct cfq_data *cfqd)
4157{
4158	int i;
4159
4160	for (i = 0; i < IOPRIO_BE_NR; i++) {
4161		if (cfqd->async_cfqq[0][i])
4162			cfq_put_queue(cfqd->async_cfqq[0][i]);
4163		if (cfqd->async_cfqq[1][i])
4164			cfq_put_queue(cfqd->async_cfqq[1][i]);
4165	}
4166
4167	if (cfqd->async_idle_cfqq)
4168		cfq_put_queue(cfqd->async_idle_cfqq);
4169}
4170
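/*
 * Tear down the elevator instance: stop the idle timer and unplug work,
 * expire any active queue, release the async queues and free cfqd.
 */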
4171static void cfq_exit_queue(struct elevator_queue *e)
4172{
4173	struct cfq_data *cfqd = e->elevator_data;
4174	struct request_queue *q = cfqd->queue;
4175
4176	cfq_shutdown_timer_wq(cfqd);
4177
4178	spin_lock_irq(q->queue_lock);
4179
4180	if (cfqd->active_queue)
4181		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4182
4183	cfq_put_async_queues(cfqd);
4184
4185	spin_unlock_irq(q->queue_lock);
4186
4187	cfq_shutdown_timer_wq(cfqd);
4188
4189#ifdef CONFIG_CFQ_GROUP_IOSCHED
4190	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4191#else
4192	kfree(cfqd->root_group);
4193#endif
4194	kfree(cfqd);
4195}
4196
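/*
 * Set up a new elevator instance: allocate cfqd, initialize the root
 * group, the fallback oom_cfqq, the idle timer and the default tunables.
 */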
4197static int cfq_init_queue(struct request_queue *q)
4198{
4199	struct cfq_data *cfqd;
4200	struct blkcg_gq *blkg __maybe_unused;
4201	int i, ret;
4202
4203	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4204	if (!cfqd)
4205		return -ENOMEM;
4206
4207	cfqd->queue = q;
4208	q->elevator->elevator_data = cfqd;
4209
4210	/* Init root service tree */
4211	cfqd->grp_service_tree = CFQ_RB_ROOT;
4212
4213	/* Init the root group and prefer it over other groups by default */
4214#ifdef CONFIG_CFQ_GROUP_IOSCHED
4215	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4216	if (ret)
4217		goto out_free;
4218
4219	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4220#else
4221	ret = -ENOMEM;
4222	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4223					GFP_KERNEL, cfqd->queue->node);
4224	if (!cfqd->root_group)
4225		goto out_free;
4226
4227	cfq_init_cfqg_base(cfqd->root_group);
4228#endif
4229	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4230	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
4231
4232	/*
4233	 * Not strictly needed (since RB_ROOT just clears the node and we
4234	 * zeroed cfqd on alloc), but better to be safe in case someone decides
4235	 * to add magic to the rb code
4236	 */
4237	for (i = 0; i < CFQ_PRIO_LISTS; i++)
4238		cfqd->prio_trees[i] = RB_ROOT;
4239
4240	/*
4241	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4242	 * Grab a permanent reference to it, so that the normal code flow
4243	 * will not attempt to free it.  oom_cfqq is linked to root_group
4244	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
4245	 * the reference from linking right away.
4246	 */
4247	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4248	cfqd->oom_cfqq.ref++;
4249
4250	spin_lock_irq(q->queue_lock);
4251	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4252	cfqg_put(cfqd->root_group);
4253	spin_unlock_irq(q->queue_lock);
4254
4255	init_timer(&cfqd->idle_slice_timer);
4256	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4257	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
4258
4259	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4260
4261	cfqd->cfq_quantum = cfq_quantum;
4262	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4263	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4264	cfqd->cfq_back_max = cfq_back_max;
4265	cfqd->cfq_back_penalty = cfq_back_penalty;
4266	cfqd->cfq_slice[0] = cfq_slice_async;
4267	cfqd->cfq_slice[1] = cfq_slice_sync;
4268	cfqd->cfq_target_latency = cfq_target_latency;
4269	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4270	cfqd->cfq_slice_idle = cfq_slice_idle;
4271	cfqd->cfq_group_idle = cfq_group_idle;
4272	cfqd->cfq_latency = 1;
4273	cfqd->hw_tag = -1;
4274	/*
4275	 * we optimistically start assuming sync ops weren't delayed in the last
4276	 * second, in order to have larger depth for async operations.
4277	 */
4278	cfqd->last_delayed_sync = jiffies - HZ;
4279	return 0;
4280
4281out_free:
4282	kfree(cfqd);
4283	return ret;
4284}
4285
4286/*
4287 * sysfs parts below -->
4288 */
4289static ssize_t
4290cfq_var_show(unsigned int var, char *page)
4291{
4292	return sprintf(page, "%d\n", var);
4293}
4294
4295static ssize_t
4296cfq_var_store(unsigned int *var, const char *page, size_t count)
4297{
4298	char *p = (char *) page;
4299
4300	*var = simple_strtoul(p, &p, 10);
4301	return count;
4302}
4303
4304#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
4305static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
4306{									\
4307	struct cfq_data *cfqd = e->elevator_data;			\
4308	unsigned int __data = __VAR;					\
4309	if (__CONV)							\
4310		__data = jiffies_to_msecs(__data);			\
4311	return cfq_var_show(__data, (page));				\
4312}
4313SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4314SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4315SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4316SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4317SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4318SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4319SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4320SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4321SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4322SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4323SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4324SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4325#undef SHOW_FUNCTION
4326
4327#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
4328static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
4329{									\
4330	struct cfq_data *cfqd = e->elevator_data;			\
4331	unsigned int __data;						\
4332	int ret = cfq_var_store(&__data, (page), count);		\
4333	if (__data < (MIN))						\
4334		__data = (MIN);						\
4335	else if (__data > (MAX))					\
4336		__data = (MAX);						\
4337	if (__CONV)							\
4338		*(__PTR) = msecs_to_jiffies(__data);			\
4339	else								\
4340		*(__PTR) = __data;					\
4341	return ret;							\
4342}
4343STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4344STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4345		UINT_MAX, 1);
4346STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4347		UINT_MAX, 1);
4348STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4349STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4350		UINT_MAX, 0);
4351STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4352STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4353STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4354STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4355STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4356		UINT_MAX, 0);
4357STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4358STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4359#undef STORE_FUNCTION
4360
4361#define CFQ_ATTR(name) \
4362	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4363
4364static struct elv_fs_entry cfq_attrs[] = {
4365	CFQ_ATTR(quantum),
4366	CFQ_ATTR(fifo_expire_sync),
4367	CFQ_ATTR(fifo_expire_async),
4368	CFQ_ATTR(back_seek_max),
4369	CFQ_ATTR(back_seek_penalty),
4370	CFQ_ATTR(slice_sync),
4371	CFQ_ATTR(slice_async),
4372	CFQ_ATTR(slice_async_rq),
4373	CFQ_ATTR(slice_idle),
4374	CFQ_ATTR(group_idle),
4375	CFQ_ATTR(low_latency),
4376	CFQ_ATTR(target_latency),
4377	__ATTR_NULL
4378};
4379
4380static struct elevator_type iosched_cfq = {
4381	.ops = {
4382		.elevator_merge_fn = 		cfq_merge,
4383		.elevator_merged_fn =		cfq_merged_request,
4384		.elevator_merge_req_fn =	cfq_merged_requests,
4385		.elevator_allow_merge_fn =	cfq_allow_merge,
4386		.elevator_bio_merged_fn =	cfq_bio_merged,
4387		.elevator_dispatch_fn =		cfq_dispatch_requests,
4388		.elevator_add_req_fn =		cfq_insert_request,
4389		.elevator_activate_req_fn =	cfq_activate_request,
4390		.elevator_deactivate_req_fn =	cfq_deactivate_request,
4391		.elevator_completed_req_fn =	cfq_completed_request,
4392		.elevator_former_req_fn =	elv_rb_former_request,
4393		.elevator_latter_req_fn =	elv_rb_latter_request,
4394		.elevator_init_icq_fn =		cfq_init_icq,
4395		.elevator_exit_icq_fn =		cfq_exit_icq,
4396		.elevator_set_req_fn =		cfq_set_request,
4397		.elevator_put_req_fn =		cfq_put_request,
4398		.elevator_may_queue_fn =	cfq_may_queue,
4399		.elevator_init_fn =		cfq_init_queue,
4400		.elevator_exit_fn =		cfq_exit_queue,
4401	},
4402	.icq_size	=	sizeof(struct cfq_io_cq),
4403	.icq_align	=	__alignof__(struct cfq_io_cq),
4404	.elevator_attrs =	cfq_attrs,
4405	.elevator_name	=	"cfq",
4406	.elevator_owner =	THIS_MODULE,
4407};
4408
4409#ifdef CONFIG_CFQ_GROUP_IOSCHED
4410static struct blkcg_policy blkcg_policy_cfq = {
4411	.pd_size		= sizeof(struct cfq_group),
4412	.cftypes		= cfq_blkcg_files,
4413
4414	.pd_init_fn		= cfq_pd_init,
4415	.pd_reset_stats_fn	= cfq_pd_reset_stats,
4416};
4417#endif
4418
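/*
 * Module init: sanitize jiffies-based tunables, register the blkcg policy
 * when group scheduling is enabled, create the cfq_queue slab cache and
 * register the elevator.
 */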
4419static int __init cfq_init(void)
4420{
4421	int ret;
4422
4423	/*
4424	 * could be 0 on HZ < 1000 setups
4425	 */
4426	if (!cfq_slice_async)
4427		cfq_slice_async = 1;
4428	if (!cfq_slice_idle)
4429		cfq_slice_idle = 1;
4430
4431#ifdef CONFIG_CFQ_GROUP_IOSCHED
4432	if (!cfq_group_idle)
4433		cfq_group_idle = 1;
4434
4435	ret = blkcg_policy_register(&blkcg_policy_cfq);
4436	if (ret)
4437		return ret;
4438#else
4439	cfq_group_idle = 0;
4440#endif
4441
4442	ret = -ENOMEM;
4443	cfq_pool = KMEM_CACHE(cfq_queue, 0);
4444	if (!cfq_pool)
4445		goto err_pol_unreg;
4446
4447	ret = elv_register(&iosched_cfq);
4448	if (ret)
4449		goto err_free_pool;
4450
4451	return 0;
4452
4453err_free_pool:
4454	kmem_cache_destroy(cfq_pool);
4455err_pol_unreg:
4456#ifdef CONFIG_CFQ_GROUP_IOSCHED
4457	blkcg_policy_unregister(&blkcg_policy_cfq);
4458#endif
4459	return ret;
4460}
4461
4462static void __exit cfq_exit(void)
4463{
4464#ifdef CONFIG_CFQ_GROUP_IOSCHED
4465	blkcg_policy_unregister(&blkcg_policy_cfq);
4466#endif
4467	elv_unregister(&iosched_cfq);
4468	kmem_cache_destroy(cfq_pool);
4469}
4470
4471module_init(cfq_init);
4472module_exit(cfq_exit);
4473
4474MODULE_AUTHOR("Jens Axboe");
4475MODULE_LICENSE("GPL");
4476MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4477