cfq-iosched.c revision 1d3650f713e7f6392b02fde450c5bae40291e65b
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/jiffies.h>
14#include <linux/rbtree.h>
15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h>
17#include "blk.h"
18#include "blk-cgroup.h"
19
20/*
21 * tunables
22 */
23/* max number of requests dispatched in one round of service */
24static const int cfq_quantum = 8;
25static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
26/* maximum backwards seek, in KiB */
27static const int cfq_back_max = 16 * 1024;
28/* penalty of a backwards seek */
29static const int cfq_back_penalty = 2;
30static const int cfq_slice_sync = HZ / 10;
31static int cfq_slice_async = HZ / 25;
32static const int cfq_slice_async_rq = 2;
33static int cfq_slice_idle = HZ / 125;
34static int cfq_group_idle = HZ / 125;
35static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
36static const int cfq_hist_divisor = 4;
37
38/*
39 * offset from end of service tree
40 */
41#define CFQ_IDLE_DELAY		(HZ / 5)
42
43/*
44 * below this threshold, we consider thinktime immediate
45 */
46#define CFQ_MIN_TT		(2)
47
48#define CFQ_SLICE_SCALE		(5)
49#define CFQ_HW_QUEUE_MIN	(5)
50#define CFQ_SERVICE_SHIFT       12
51
52#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
53#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
54#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
55#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
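
/*
 * Example: seek_history is effectively a 32-bit sliding window with one bit
 * per recent request, where a set bit records a seeky request (roughly, one
 * farther than CFQQ_SEEK_THR from the previous one).  hweight32() counts the
 * set bits, so CFQQ_SEEKY() is true once more than 32/8 = 4 of the last 32
 * requests were seeky: a history of 0x1f (five set bits) qualifies, 0x0f
 * does not.
 */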
56
57#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
58#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
59#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
60
61static struct kmem_cache *cfq_pool;
62
63#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
64#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
65#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
66
67#define sample_valid(samples)	((samples) > 80)
68#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
69
70struct cfq_ttime {
71	unsigned long last_end_request;
72
73	unsigned long ttime_total;
74	unsigned long ttime_samples;
75	unsigned long ttime_mean;
76};
77
78/*
79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree
81 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
82 * move this into the elevator for the rq sorting as well.
83 */
84struct cfq_rb_root {
85	struct rb_root rb;
86	struct rb_node *left;
87	unsigned count;
88	unsigned total_weight;
89	u64 min_vdisktime;
90	struct cfq_ttime ttime;
91};
92#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
93			.ttime = {.last_end_request = jiffies,},}
94
95/*
96 * Per process-grouping structure
97 */
98struct cfq_queue {
99	/* reference count */
100	int ref;
101	/* various state flags, see below */
102	unsigned int flags;
103	/* parent cfq_data */
104	struct cfq_data *cfqd;
105	/* service_tree member */
106	struct rb_node rb_node;
107	/* service_tree key */
108	unsigned long rb_key;
109	/* prio tree member */
110	struct rb_node p_node;
111	/* prio tree root we belong to, if any */
112	struct rb_root *p_root;
113	/* sorted list of pending requests */
114	struct rb_root sort_list;
115	/* if fifo isn't expired, next request to serve */
116	struct request *next_rq;
117	/* requests queued in sort_list */
118	int queued[2];
119	/* currently allocated requests */
120	int allocated[2];
121	/* fifo list of requests in sort_list */
122	struct list_head fifo;
123
124	/* time when queue got scheduled in to dispatch first request. */
125	unsigned long dispatch_start;
126	unsigned int allocated_slice;
127	unsigned int slice_dispatch;
128	/* time when first request from queue completed and slice started. */
129	unsigned long slice_start;
130	unsigned long slice_end;
131	long slice_resid;
132
133	/* pending priority requests */
134	int prio_pending;
135	/* number of requests that are on the dispatch list or inside driver */
136	int dispatched;
137
138	/* io prio of this queue */
139	unsigned short ioprio, org_ioprio;
140	unsigned short ioprio_class;
141
142	pid_t pid;
143
144	u32 seek_history;
145	sector_t last_request_pos;
146
147	struct cfq_rb_root *service_tree;
148	struct cfq_queue *new_cfqq;
149	struct cfq_group *cfqg;
150	/* Number of sectors dispatched from queue in single dispatch round */
151	unsigned long nr_sectors;
152};
153
154/*
155 * First index in the service_trees.
156 * IDLE is handled separately on the dedicated service_tree_idle tree.
157 */
158enum wl_class_t {
159	BE_WORKLOAD = 0,
160	RT_WORKLOAD = 1,
161	IDLE_WORKLOAD = 2,
162	CFQ_PRIO_NR,
163};
164
165/*
166 * Second index in the service_trees.
167 */
168enum wl_type_t {
169	ASYNC_WORKLOAD = 0,
170	SYNC_NOIDLE_WORKLOAD = 1,
171	SYNC_WORKLOAD = 2
172};
173
174struct cfqg_stats {
175#ifdef CONFIG_CFQ_GROUP_IOSCHED
176	/* total bytes transferred */
177	struct blkg_rwstat		service_bytes;
178	/* total IOs serviced, post merge */
179	struct blkg_rwstat		serviced;
180	/* number of ios merged */
181	struct blkg_rwstat		merged;
182	/* total time spent on device in ns, may not be accurate w/ queueing */
183	struct blkg_rwstat		service_time;
184	/* total time spent waiting in scheduler queue in ns */
185	struct blkg_rwstat		wait_time;
186	/* number of IOs queued up */
187	struct blkg_rwstat		queued;
188	/* total sectors transferred */
189	struct blkg_stat		sectors;
190	/* total disk time and nr sectors dispatched by this group */
191	struct blkg_stat		time;
192#ifdef CONFIG_DEBUG_BLK_CGROUP
193	/* time not charged to this cgroup */
194	struct blkg_stat		unaccounted_time;
195	/* sum of number of ios queued across all samples */
196	struct blkg_stat		avg_queue_size_sum;
197	/* count of samples taken for average */
198	struct blkg_stat		avg_queue_size_samples;
199	/* how many times this group has been removed from service tree */
200	struct blkg_stat		dequeue;
201	/* total time spent waiting for it to be assigned a timeslice. */
202	struct blkg_stat		group_wait_time;
203	/* time spent idling for this blkcg_gq */
204	struct blkg_stat		idle_time;
205	/* total time the current active queue was empty while other requests were queued */
206	struct blkg_stat		empty_time;
207	/* fields after this shouldn't be cleared on stat reset */
208	uint64_t			start_group_wait_time;
209	uint64_t			start_idle_time;
210	uint64_t			start_empty_time;
211	uint16_t			flags;
212#endif	/* CONFIG_DEBUG_BLK_CGROUP */
213#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
214};
215
216/* This is per cgroup per device grouping structure */
217struct cfq_group {
218	/* must be the first member */
219	struct blkg_policy_data pd;
220
221	/* group service_tree member */
222	struct rb_node rb_node;
223
224	/* group service_tree key */
225	u64 vdisktime;
226
227	/*
228	 * The number of active cfqgs and sum of their weights under this
229	 * cfqg.  This covers this cfqg's leaf_weight and all children's
230	 * weights, but does not cover weights of further descendants.
231	 *
232	 * If a cfqg is on the service tree, it's active.  An active cfqg
233	 * also activates its parent and contributes to the children_weight
234	 * of the parent.
235	 */
236	int nr_active;
237	unsigned int children_weight;
238
239	/*
240	 * vfraction is the fraction of vdisktime that the tasks in this
241	 * cfqg are entitled to.  This is determined by compounding the
242	 * ratios walking up from this cfqg to the root.
243	 *
244	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
245	 * vfractions on a service tree is approximately 1.  The sum may
246	 * deviate a bit due to rounding errors and fluctuations caused by
247	 * cfqgs entering and leaving the service tree.
248	 */
249	unsigned int vfraction;
250
251	/*
252	 * There are two weights - (internal) weight is the weight of this
253 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
254	 * this cfqg against the child cfqgs.  For the root cfqg, both
255	 * weights are kept in sync for backward compatibility.
256	 */
257	unsigned int weight;
258	unsigned int new_weight;
259	unsigned int dev_weight;
260
261	unsigned int leaf_weight;
262	unsigned int new_leaf_weight;
263	unsigned int dev_leaf_weight;
264
265	/* number of cfqq currently on this group */
266	int nr_cfqq;
267
268	/*
269	 * Per group busy queues average. Useful for workload slice calc. We
270	 * create the array for each prio class, but at run time it is used
271	 * only for the RT and BE classes; the slot for the IDLE class remains
272	 * unused. This is primarily done to avoid confusion and a gcc warning.
273	 */
274	unsigned int busy_queues_avg[CFQ_PRIO_NR];
275	/*
276	 * rr lists of queues with requests. We maintain service trees for
277	 * the RT and BE classes. These trees are subdivided into subclasses
278	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For the IDLE
279	 * class there is no subclassification and all the cfq queues go on
280	 * a single tree, service_tree_idle.
281	 * Counts are embedded in the cfq_rb_root.
282	 */
283	struct cfq_rb_root service_trees[2][3];
284	struct cfq_rb_root service_tree_idle;
285
286	unsigned long saved_wl_slice;
287	enum wl_type_t saved_wl_type;
288	enum wl_class_t saved_wl_class;
289
290	/* number of requests that are on the dispatch list or inside driver */
291	int dispatched;
292	struct cfq_ttime ttime;
293	struct cfqg_stats stats;
294};
295
296struct cfq_io_cq {
297	struct io_cq		icq;		/* must be the first member */
298	struct cfq_queue	*cfqq[2];
299	struct cfq_ttime	ttime;
300	int			ioprio;		/* the current ioprio */
301#ifdef CONFIG_CFQ_GROUP_IOSCHED
302	uint64_t		blkcg_id;	/* the current blkcg ID */
303#endif
304};
305
306/*
307 * Per block device queue structure
308 */
309struct cfq_data {
310	struct request_queue *queue;
311	/* Root service tree for cfq_groups */
312	struct cfq_rb_root grp_service_tree;
313	struct cfq_group *root_group;
314
315	/*
316	 * The priority currently being served
317	 */
318	enum wl_class_t serving_wl_class;
319	enum wl_type_t serving_wl_type;
320	unsigned long workload_expires;
321	struct cfq_group *serving_group;
322
323	/*
324	 * Each priority tree is sorted by next_request position.  These
325	 * trees are used when determining if two or more queues are
326	 * interleaving requests (see cfq_close_cooperator).
327	 */
328	struct rb_root prio_trees[CFQ_PRIO_LISTS];
329
330	unsigned int busy_queues;
331	unsigned int busy_sync_queues;
332
333	int rq_in_driver;
334	int rq_in_flight[2];
335
336	/*
337	 * queue-depth detection
338	 */
339	int rq_queued;
340	int hw_tag;
341	/*
342	 * hw_tag can be
343	 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
344	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
345	 *  0 => no NCQ
346	 */
347	int hw_tag_est_depth;
348	unsigned int hw_tag_samples;
349
350	/*
351	 * idle window management
352	 */
353	struct timer_list idle_slice_timer;
354	struct work_struct unplug_work;
355
356	struct cfq_queue *active_queue;
357	struct cfq_io_cq *active_cic;
358
359	/*
360	 * async queue for each priority case
361	 */
362	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
363	struct cfq_queue *async_idle_cfqq;
364
365	sector_t last_position;
366
367	/*
368	 * tunables, see top of file
369	 */
370	unsigned int cfq_quantum;
371	unsigned int cfq_fifo_expire[2];
372	unsigned int cfq_back_penalty;
373	unsigned int cfq_back_max;
374	unsigned int cfq_slice[2];
375	unsigned int cfq_slice_async_rq;
376	unsigned int cfq_slice_idle;
377	unsigned int cfq_group_idle;
378	unsigned int cfq_latency;
379	unsigned int cfq_target_latency;
380
381	/*
382	 * Fallback dummy cfqq for extreme OOM conditions
383	 */
384	struct cfq_queue oom_cfqq;
385
386	unsigned long last_delayed_sync;
387};
388
389static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
390
391static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
392					    enum wl_class_t class,
393					    enum wl_type_t type)
394{
395	if (!cfqg)
396		return NULL;
397
398	if (class == IDLE_WORKLOAD)
399		return &cfqg->service_tree_idle;
400
401	return &cfqg->service_trees[class][type];
402}
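
/*
 * For example, st_for(cfqg, RT_WORKLOAD, SYNC_WORKLOAD) yields
 * &cfqg->service_trees[RT_WORKLOAD][SYNC_WORKLOAD], while any request for
 * IDLE_WORKLOAD ignores @type and yields &cfqg->service_tree_idle.
 */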
403
404enum cfqq_state_flags {
405	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
406	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
407	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
408	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
409	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
410	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
411	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
412	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
413	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
414	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
415	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
416	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
417	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
418};
419
420#define CFQ_CFQQ_FNS(name)						\
421static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
422{									\
423	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
424}									\
425static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
426{									\
427	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
428}									\
429static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
430{									\
431	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
432}
433
434CFQ_CFQQ_FNS(on_rr);
435CFQ_CFQQ_FNS(wait_request);
436CFQ_CFQQ_FNS(must_dispatch);
437CFQ_CFQQ_FNS(must_alloc_slice);
438CFQ_CFQQ_FNS(fifo_expire);
439CFQ_CFQQ_FNS(idle_window);
440CFQ_CFQQ_FNS(prio_changed);
441CFQ_CFQQ_FNS(slice_new);
442CFQ_CFQQ_FNS(sync);
443CFQ_CFQQ_FNS(coop);
444CFQ_CFQQ_FNS(split_coop);
445CFQ_CFQQ_FNS(deep);
446CFQ_CFQQ_FNS(wait_busy);
447#undef CFQ_CFQQ_FNS
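
/*
 * For reference, each CFQ_CFQQ_FNS(name) invocation above expands to three
 * helpers; e.g. CFQ_CFQQ_FNS(on_rr) generates:
 *
 *	cfq_mark_cfqq_on_rr(cfqq);	set CFQ_CFQQ_FLAG_on_rr in cfqq->flags
 *	cfq_clear_cfqq_on_rr(cfqq);	clear the flag
 *	cfq_cfqq_on_rr(cfqq);		test the flag, returns 0 or 1
 */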
448
449static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
450{
451	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
452}
453
454static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
455{
456	return pd_to_blkg(&cfqg->pd);
457}
458
459#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
460
461/* cfqg stats flags */
462enum cfqg_stats_flags {
463	CFQG_stats_waiting = 0,
464	CFQG_stats_idling,
465	CFQG_stats_empty,
466};
467
468#define CFQG_FLAG_FNS(name)						\
469static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
470{									\
471	stats->flags |= (1 << CFQG_stats_##name);			\
472}									\
473static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
474{									\
475	stats->flags &= ~(1 << CFQG_stats_##name);			\
476}									\
477static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
478{									\
479	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
480}									\
481
482CFQG_FLAG_FNS(waiting)
483CFQG_FLAG_FNS(idling)
484CFQG_FLAG_FNS(empty)
485#undef CFQG_FLAG_FNS
486
487/* This should be called with the queue_lock held. */
488static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
489{
490	unsigned long long now;
491
492	if (!cfqg_stats_waiting(stats))
493		return;
494
495	now = sched_clock();
496	if (time_after64(now, stats->start_group_wait_time))
497		blkg_stat_add(&stats->group_wait_time,
498			      now - stats->start_group_wait_time);
499	cfqg_stats_clear_waiting(stats);
500}
501
502/* This should be called with the queue_lock held. */
503static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
504						 struct cfq_group *curr_cfqg)
505{
506	struct cfqg_stats *stats = &cfqg->stats;
507
508	if (cfqg_stats_waiting(stats))
509		return;
510	if (cfqg == curr_cfqg)
511		return;
512	stats->start_group_wait_time = sched_clock();
513	cfqg_stats_mark_waiting(stats);
514}
515
516/* This should be called with the queue_lock held. */
517static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
518{
519	unsigned long long now;
520
521	if (!cfqg_stats_empty(stats))
522		return;
523
524	now = sched_clock();
525	if (time_after64(now, stats->start_empty_time))
526		blkg_stat_add(&stats->empty_time,
527			      now - stats->start_empty_time);
528	cfqg_stats_clear_empty(stats);
529}
530
531static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
532{
533	blkg_stat_add(&cfqg->stats.dequeue, 1);
534}
535
536static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
537{
538	struct cfqg_stats *stats = &cfqg->stats;
539
540	if (blkg_rwstat_sum(&stats->queued))
541		return;
542
543	/*
544	 * group is already marked empty. This can happen if a cfqq got a new
545	 * request in the parent group and moved to this group while being added
546	 * to the service tree. Just ignore the event and move on.
547	 */
548	if (cfqg_stats_empty(stats))
549		return;
550
551	stats->start_empty_time = sched_clock();
552	cfqg_stats_mark_empty(stats);
553}
554
555static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
556{
557	struct cfqg_stats *stats = &cfqg->stats;
558
559	if (cfqg_stats_idling(stats)) {
560		unsigned long long now = sched_clock();
561
562		if (time_after64(now, stats->start_idle_time))
563			blkg_stat_add(&stats->idle_time,
564				      now - stats->start_idle_time);
565		cfqg_stats_clear_idling(stats);
566	}
567}
568
569static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
570{
571	struct cfqg_stats *stats = &cfqg->stats;
572
573	BUG_ON(cfqg_stats_idling(stats));
574
575	stats->start_idle_time = sched_clock();
576	cfqg_stats_mark_idling(stats);
577}
578
579static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
580{
581	struct cfqg_stats *stats = &cfqg->stats;
582
583	blkg_stat_add(&stats->avg_queue_size_sum,
584		      blkg_rwstat_sum(&stats->queued));
585	blkg_stat_add(&stats->avg_queue_size_samples, 1);
586	cfqg_stats_update_group_wait_time(stats);
587}
588
589#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
590
591static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
592static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
593static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
594static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
595static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
596static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
597static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
598
599#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
600
601#ifdef CONFIG_CFQ_GROUP_IOSCHED
602
603static struct blkcg_policy blkcg_policy_cfq;
604
605static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
606{
607	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
608}
609
610/*
611 * Determine the parent cfqg for weight calculation.  Currently, cfqg
612 * scheduling is flat and the root is the parent of everyone else.
613 */
614static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg)
615{
616	struct blkcg_gq *blkg = cfqg_to_blkg(cfqg);
617	struct cfq_group *root;
618
619	while (blkg->parent)
620		blkg = blkg->parent;
621	root = blkg_to_cfqg(blkg);
622
623	return root != cfqg ? root : NULL;
624}
625
626static inline void cfqg_get(struct cfq_group *cfqg)
627{
628	return blkg_get(cfqg_to_blkg(cfqg));
629}
630
631static inline void cfqg_put(struct cfq_group *cfqg)
632{
633	return blkg_put(cfqg_to_blkg(cfqg));
634}
635
636#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
637	char __pbuf[128];						\
638									\
639	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
640	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
641			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
642			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
643			  __pbuf, ##args);				\
644} while (0)
645
646#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
647	char __pbuf[128];						\
648									\
649	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
650	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
651} while (0)
652
653static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
654					    struct cfq_group *curr_cfqg, int rw)
655{
656	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
657	cfqg_stats_end_empty_time(&cfqg->stats);
658	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
659}
660
661static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
662			unsigned long time, unsigned long unaccounted_time)
663{
664	blkg_stat_add(&cfqg->stats.time, time);
665#ifdef CONFIG_DEBUG_BLK_CGROUP
666	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
667#endif
668}
669
670static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
671{
672	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
673}
674
675static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
676{
677	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
678}
679
680static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
681					      uint64_t bytes, int rw)
682{
683	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
684	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
685	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
686}
687
688static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
689			uint64_t start_time, uint64_t io_start_time, int rw)
690{
691	struct cfqg_stats *stats = &cfqg->stats;
692	unsigned long long now = sched_clock();
693
694	if (time_after64(now, io_start_time))
695		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
696	if (time_after64(io_start_time, start_time))
697		blkg_rwstat_add(&stats->wait_time, rw,
698				io_start_time - start_time);
699}
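
/*
 * Example: if a request was queued at start_time t0, issued to the driver at
 * io_start_time t1 and completes now at t2, the code above adds (t1 - t0) to
 * wait_time (time spent in the scheduler queue) and (t2 - t1) to service_time
 * (time spent on the device), each in the rw direction of the request.  The
 * time_after64() checks simply skip negative deltas.
 */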
700
701static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
702{
703	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
704	struct cfqg_stats *stats = &cfqg->stats;
705
706	/* queued stats shouldn't be cleared */
707	blkg_rwstat_reset(&stats->service_bytes);
708	blkg_rwstat_reset(&stats->serviced);
709	blkg_rwstat_reset(&stats->merged);
710	blkg_rwstat_reset(&stats->service_time);
711	blkg_rwstat_reset(&stats->wait_time);
712	blkg_stat_reset(&stats->time);
713#ifdef CONFIG_DEBUG_BLK_CGROUP
714	blkg_stat_reset(&stats->unaccounted_time);
715	blkg_stat_reset(&stats->avg_queue_size_sum);
716	blkg_stat_reset(&stats->avg_queue_size_samples);
717	blkg_stat_reset(&stats->dequeue);
718	blkg_stat_reset(&stats->group_wait_time);
719	blkg_stat_reset(&stats->idle_time);
720	blkg_stat_reset(&stats->empty_time);
721#endif
722}
723
724#else	/* CONFIG_CFQ_GROUP_IOSCHED */
725
726static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; }
727static inline void cfqg_get(struct cfq_group *cfqg) { }
728static inline void cfqg_put(struct cfq_group *cfqg) { }
729
730#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
731	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
732			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
733			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
734				##args)
735#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
736
737static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
738			struct cfq_group *curr_cfqg, int rw) { }
739static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
740			unsigned long time, unsigned long unaccounted_time) { }
741static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
742static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
743static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
744					      uint64_t bytes, int rw) { }
745static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
746			uint64_t start_time, uint64_t io_start_time, int rw) { }
747
748#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
749
750#define cfq_log(cfqd, fmt, args...)	\
751	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
752
753/* Traverses through cfq group service trees */
754#define for_each_cfqg_st(cfqg, i, j, st) \
755	for (i = 0; i <= IDLE_WORKLOAD; i++) \
756		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
757			: &cfqg->service_tree_idle; \
758			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
759			(i == IDLE_WORKLOAD && j == 0); \
760			j++, st = i < IDLE_WORKLOAD ? \
761			&cfqg->service_trees[i][j]: NULL) \
762
763static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
764	struct cfq_ttime *ttime, bool group_idle)
765{
766	unsigned long slice;
767	if (!sample_valid(ttime->ttime_samples))
768		return false;
769	if (group_idle)
770		slice = cfqd->cfq_group_idle;
771	else
772		slice = cfqd->cfq_slice_idle;
773	return ttime->ttime_mean > slice;
774}
775
776static inline bool iops_mode(struct cfq_data *cfqd)
777{
778	/*
779	 * If we are not idling on queues and the drive supports NCQ, requests
780	 * execute in parallel and measuring time is not meaningful in most
781	 * cases, unless we drive shallow queue depths, which then becomes a
782	 * performance bottleneck. In such cases, switch to providing fairness
783	 * in terms of the number of IOs.
784	 */
785	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
786		return true;
787	else
788		return false;
789}
790
791static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
792{
793	if (cfq_class_idle(cfqq))
794		return IDLE_WORKLOAD;
795	if (cfq_class_rt(cfqq))
796		return RT_WORKLOAD;
797	return BE_WORKLOAD;
798}
799
800
801static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
802{
803	if (!cfq_cfqq_sync(cfqq))
804		return ASYNC_WORKLOAD;
805	if (!cfq_cfqq_idle_window(cfqq))
806		return SYNC_NOIDLE_WORKLOAD;
807	return SYNC_WORKLOAD;
808}
809
810static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
811					struct cfq_data *cfqd,
812					struct cfq_group *cfqg)
813{
814	if (wl_class == IDLE_WORKLOAD)
815		return cfqg->service_tree_idle.count;
816
817	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
818		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
819		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
820}
821
822static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
823					struct cfq_group *cfqg)
824{
825	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
826		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
827}
828
829static void cfq_dispatch_insert(struct request_queue *, struct request *);
830static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
831				       struct cfq_io_cq *cic, struct bio *bio,
832				       gfp_t gfp_mask);
833
834static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
835{
836	/* cic->icq is the first member, %NULL will convert to %NULL */
837	return container_of(icq, struct cfq_io_cq, icq);
838}
839
840static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
841					       struct io_context *ioc)
842{
843	if (ioc)
844		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
845	return NULL;
846}
847
848static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
849{
850	return cic->cfqq[is_sync];
851}
852
853static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
854				bool is_sync)
855{
856	cic->cfqq[is_sync] = cfqq;
857}
858
859static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
860{
861	return cic->icq.q->elevator->elevator_data;
862}
863
864/*
865 * We regard a request as SYNC if it's either a read or has the SYNC bit
866 * set (in which case it could also be a direct WRITE).
867 */
868static inline bool cfq_bio_sync(struct bio *bio)
869{
870	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
871}
872
873/*
874 * Schedule a run of the queue if there are requests pending and nothing in
875 * the driver that will restart queueing.
876 */
877static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
878{
879	if (cfqd->busy_queues) {
880		cfq_log(cfqd, "schedule dispatch");
881		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
882	}
883}
884
885/*
886 * Scale schedule slice based on io priority. Use the sync time slice only
887 * if a queue is marked sync and has sync io queued. A sync queue with async
888 * io only should not get the full sync slice length.
889 */
890static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
891				 unsigned short prio)
892{
893	const int base_slice = cfqd->cfq_slice[sync];
894
895	WARN_ON(prio >= IOPRIO_BE_NR);
896
897	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
898}
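
/*
 * Worked example, assuming HZ=1000 and the default cfq_slice_sync of HZ/10:
 * base_slice is 100 jiffies and base_slice/CFQ_SLICE_SCALE is 20, so a sync
 * queue gets 100 + 20 * (4 - prio) jiffies: 180 for ioprio 0, 100 for the
 * default ioprio 4, and 40 for ioprio 7.
 */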
899
900static inline int
901cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
902{
903	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
904}
905
906/**
907 * cfqg_scale_charge - scale disk time charge according to cfqg weight
908 * @charge: disk time being charged
909 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
910 *
911 * Scale @charge according to @vfraction, which is in range (0, 1].  The
912 * scaling is inversely proportional.
913 *
914 * scaled = charge / vfraction
915 *
916 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
917 */
918static inline u64 cfqg_scale_charge(unsigned long charge,
919				    unsigned int vfraction)
920{
921	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */
922
923	/* charge / vfraction */
924	c <<= CFQ_SERVICE_SHIFT;
925	do_div(c, vfraction);
926	return c;
927}
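
/*
 * Worked example: with CFQ_SERVICE_SHIFT == 12, a group entitled to half of
 * the service has vfraction ~= (1 << 12) / 2 = 2048.  Charging it 100 jiffies
 * gives c = (100 << 24) / 2048 = 819200 = 200 << 12, i.e. its vdisktime
 * advances at twice the wall-clock rate, exactly as a half-share group
 * should be charged.
 */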
928
929static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
930{
931	s64 delta = (s64)(vdisktime - min_vdisktime);
932	if (delta > 0)
933		min_vdisktime = vdisktime;
934
935	return min_vdisktime;
936}
937
938static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
939{
940	s64 delta = (s64)(vdisktime - min_vdisktime);
941	if (delta < 0)
942		min_vdisktime = vdisktime;
943
944	return min_vdisktime;
945}
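
/*
 * Note that both helpers above compare u64 timestamps through a signed delta,
 * so they keep working across vdisktime wraparound: e.g. with
 * min_vdisktime = ULLONG_MAX - 10 and vdisktime = 5 the delta is +16, so
 * vdisktime is correctly treated as the later (larger) value even though it
 * is numerically smaller.
 */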
946
947static void update_min_vdisktime(struct cfq_rb_root *st)
948{
949	struct cfq_group *cfqg;
950
951	if (st->left) {
952		cfqg = rb_entry_cfqg(st->left);
953		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
954						  cfqg->vdisktime);
955	}
956}
957
958/*
959 * Get the averaged number of queues of RT/BE priority. The average is
960 * updated with a formula that gives more weight to higher numbers, so it
961 * follows sudden increases quickly and decays slowly.
962 */
963
964static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
965					struct cfq_group *cfqg, bool rt)
966{
967	unsigned min_q, max_q;
968	unsigned mult  = cfq_hist_divisor - 1;
969	unsigned round = cfq_hist_divisor / 2;
970	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
971
972	min_q = min(cfqg->busy_queues_avg[rt], busy);
973	max_q = max(cfqg->busy_queues_avg[rt], busy);
974	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
975		cfq_hist_divisor;
976	return cfqg->busy_queues_avg[rt];
977}
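
/*
 * Worked example with cfq_hist_divisor == 4 (mult == 3, round == 2): if the
 * stored average is 1 and 5 queues suddenly become busy, the new average is
 * (3 * 5 + 1 + 2) / 4 = 4, so it follows the increase almost immediately.
 * If the busy count then drops back to 1, the next update only falls to
 * (3 * 4 + 1 + 2) / 4 = 3, so the average comes down much more slowly.
 */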
978
979static inline unsigned
980cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
981{
982	struct cfq_rb_root *st = &cfqd->grp_service_tree;
983
984	return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
985}
986
987static inline unsigned
988cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
989{
990	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
991	if (cfqd->cfq_latency) {
992		/*
993		 * interested queues (we consider only the ones with the same
994		 * priority class in the cfq group)
995		 */
996		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
997						cfq_class_rt(cfqq));
998		unsigned sync_slice = cfqd->cfq_slice[1];
999		unsigned expect_latency = sync_slice * iq;
1000		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1001
1002		if (expect_latency > group_slice) {
1003			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
1004			/* scale low_slice according to IO priority
1005			 * and sync vs async */
1006			unsigned low_slice =
1007				min(slice, base_low_slice * slice / sync_slice);
1008			/* the adapted slice value is scaled to fit all iqs
1009			 * into the target latency */
1010			slice = max(slice * group_slice / expect_latency,
1011				    low_slice);
1012		}
1013	}
1014	return slice;
1015}
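
/*
 * Worked example, assuming HZ=1000, the default tunables and a single group
 * owning the whole target latency: with an averaged count of 4 busy queues of
 * the same priority class, expect_latency = 100ms * 4 = 400ms exceeds the
 * 300ms group slice, so a 100ms slice is scaled to
 * max(100 * 300 / 400, low_slice) = 75ms.  The low_slice floor (here
 * min(100, 2 * 8 * 100 / 100) = 16ms) only kicks in when many more queues
 * are competing.
 */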
1016
1017static inline void
1018cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1019{
1020	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1021
1022	cfqq->slice_start = jiffies;
1023	cfqq->slice_end = jiffies + slice;
1024	cfqq->allocated_slice = slice;
1025	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
1026}
1027
1028/*
1029 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1030 * isn't valid until the first request from the dispatch is activated
1031 * and the slice time set.
1032 */
1033static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1034{
1035	if (cfq_cfqq_slice_new(cfqq))
1036		return false;
1037	if (time_before(jiffies, cfqq->slice_end))
1038		return false;
1039
1040	return true;
1041}
1042
1043/*
1044 * Lifted from AS - choose which of rq1 and rq2 is best served now.
1045 * We choose the request that is closest to the head right now. Distance
1046 * behind the head is penalized and only allowed to a certain extent.
1047 */
1048static struct request *
1049cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1050{
1051	sector_t s1, s2, d1 = 0, d2 = 0;
1052	unsigned long back_max;
1053#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
1054#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
1055	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1056
1057	if (rq1 == NULL || rq1 == rq2)
1058		return rq2;
1059	if (rq2 == NULL)
1060		return rq1;
1061
1062	if (rq_is_sync(rq1) != rq_is_sync(rq2))
1063		return rq_is_sync(rq1) ? rq1 : rq2;
1064
1065	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1066		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1067
1068	s1 = blk_rq_pos(rq1);
1069	s2 = blk_rq_pos(rq2);
1070
1071	/*
1072	 * by definition, 1KiB is 2 sectors
1073	 */
1074	back_max = cfqd->cfq_back_max * 2;
1075
1076	/*
1077	 * Strict one way elevator _except_ in the case where we allow
1078	 * short backward seeks which are biased as twice the cost of a
1079	 * similar forward seek.
1080	 */
1081	if (s1 >= last)
1082		d1 = s1 - last;
1083	else if (s1 + back_max >= last)
1084		d1 = (last - s1) * cfqd->cfq_back_penalty;
1085	else
1086		wrap |= CFQ_RQ1_WRAP;
1087
1088	if (s2 >= last)
1089		d2 = s2 - last;
1090	else if (s2 + back_max >= last)
1091		d2 = (last - s2) * cfqd->cfq_back_penalty;
1092	else
1093		wrap |= CFQ_RQ2_WRAP;
1094
1095	/* Found required data */
1096
1097	/*
1098	 * By doing switch() on the bit mask "wrap" we avoid having to
1099	 * check two variables for all permutations: --> faster!
1100	 */
1101	switch (wrap) {
1102	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1103		if (d1 < d2)
1104			return rq1;
1105		else if (d2 < d1)
1106			return rq2;
1107		else {
1108			if (s1 >= s2)
1109				return rq1;
1110			else
1111				return rq2;
1112		}
1113
1114	case CFQ_RQ2_WRAP:
1115		return rq1;
1116	case CFQ_RQ1_WRAP:
1117		return rq2;
1118	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1119	default:
1120		/*
1121		 * Since both rqs are wrapped,
1122		 * start with the one that's further behind head
1123		 * (--> only *one* back seek required),
1124		 * since back seek takes more time than forward.
1125		 */
1126		if (s1 <= s2)
1127			return rq1;
1128		else
1129			return rq2;
1130	}
1131}
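
/*
 * Worked example of the backward-seek penalty above: with the head at sector
 * 1000, a request at sector 1200 has forward distance d = 200, while a
 * request at sector 900 lies 100 sectors behind the head and, with the
 * default cfq_back_penalty of 2, is charged d = 100 * 2 = 200.  The tie is
 * broken in favour of the higher sector, so the forward request at 1200 is
 * chosen; anything farther than cfq_back_max behind the head is treated as
 * wrapped and only picked as a last resort.
 */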
1132
1133/*
1134 * Below are the helpers for the cached-leftmost rbtree addon.
1135 */
1136static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1137{
1138	/* Service tree is empty */
1139	if (!root->count)
1140		return NULL;
1141
1142	if (!root->left)
1143		root->left = rb_first(&root->rb);
1144
1145	if (root->left)
1146		return rb_entry(root->left, struct cfq_queue, rb_node);
1147
1148	return NULL;
1149}
1150
1151static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1152{
1153	if (!root->left)
1154		root->left = rb_first(&root->rb);
1155
1156	if (root->left)
1157		return rb_entry_cfqg(root->left);
1158
1159	return NULL;
1160}
1161
1162static void rb_erase_init(struct rb_node *n, struct rb_root *root)
1163{
1164	rb_erase(n, root);
1165	RB_CLEAR_NODE(n);
1166}
1167
1168static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1169{
1170	if (root->left == n)
1171		root->left = NULL;
1172	rb_erase_init(n, &root->rb);
1173	--root->count;
1174}
1175
1176/*
1177 * would be nice to take fifo expire time into account as well
1178 */
1179static struct request *
1180cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1181		  struct request *last)
1182{
1183	struct rb_node *rbnext = rb_next(&last->rb_node);
1184	struct rb_node *rbprev = rb_prev(&last->rb_node);
1185	struct request *next = NULL, *prev = NULL;
1186
1187	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1188
1189	if (rbprev)
1190		prev = rb_entry_rq(rbprev);
1191
1192	if (rbnext)
1193		next = rb_entry_rq(rbnext);
1194	else {
1195		rbnext = rb_first(&cfqq->sort_list);
1196		if (rbnext && rbnext != &last->rb_node)
1197			next = rb_entry_rq(rbnext);
1198	}
1199
1200	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1201}
1202
1203static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
1204				      struct cfq_queue *cfqq)
1205{
1206	/*
1207	 * just an approximation, should be ok.
1208	 */
1209	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1210		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1211}
1212
1213static inline s64
1214cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1215{
1216	return cfqg->vdisktime - st->min_vdisktime;
1217}
1218
1219static void
1220__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1221{
1222	struct rb_node **node = &st->rb.rb_node;
1223	struct rb_node *parent = NULL;
1224	struct cfq_group *__cfqg;
1225	s64 key = cfqg_key(st, cfqg);
1226	int left = 1;
1227
1228	while (*node != NULL) {
1229		parent = *node;
1230		__cfqg = rb_entry_cfqg(parent);
1231
1232		if (key < cfqg_key(st, __cfqg))
1233			node = &parent->rb_left;
1234		else {
1235			node = &parent->rb_right;
1236			left = 0;
1237		}
1238	}
1239
1240	if (left)
1241		st->left = &cfqg->rb_node;
1242
1243	rb_link_node(&cfqg->rb_node, parent, node);
1244	rb_insert_color(&cfqg->rb_node, &st->rb);
1245}
1246
1247static void
1248cfq_update_group_weight(struct cfq_group *cfqg)
1249{
1250	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1251
1252	if (cfqg->new_weight) {
1253		cfqg->weight = cfqg->new_weight;
1254		cfqg->new_weight = 0;
1255	}
1256
1257	if (cfqg->new_leaf_weight) {
1258		cfqg->leaf_weight = cfqg->new_leaf_weight;
1259		cfqg->new_leaf_weight = 0;
1260	}
1261}
1262
1263static void
1264cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1265{
1266	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;	/* start with 1 */
1267	struct cfq_group *pos = cfqg;
1268	struct cfq_group *parent;
1269	bool propagate;
1270
1271	/* add to the service tree */
1272	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1273
1274	cfq_update_group_weight(cfqg);
1275	__cfq_group_service_tree_add(st, cfqg);
1276	st->total_weight += cfqg->weight;
1277
1278	/*
1279	 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1280	 * entitled to.  vfraction is calculated by walking the tree
1281	 * towards the root calculating the fraction it has at each level.
1282	 * The compounded ratio is how much vfraction @cfqg owns.
1283	 *
1284	 * Start with the proportion the tasks in this cfqg have against active
1285	 * children cfqgs - its leaf_weight against children_weight.
1286	 */
1287	propagate = !pos->nr_active++;
1288	pos->children_weight += pos->leaf_weight;
1289	vfr = vfr * pos->leaf_weight / pos->children_weight;
1290
1291	/*
1292	 * Compound ->weight walking up the tree.  Both activation and
1293	 * vfraction calculation are done in the same loop.  Propagation
1294	 * stops once an already activated node is met.  vfraction
1295	 * calculation should always continue to the root.
1296	 */
1297	while ((parent = cfqg_flat_parent(pos))) {
1298		if (propagate) {
1299			propagate = !parent->nr_active++;
1300			parent->children_weight += pos->weight;
1301		}
1302		vfr = vfr * pos->weight / parent->children_weight;
1303		pos = parent;
1304	}
1305
1306	cfqg->vfraction = max_t(unsigned, vfr, 1);
1307}
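
/*
 * Worked example of the vfraction calculation above, under the current flat
 * hierarchy: suppose groups A (weight 500) and B (weight 1000) are active
 * directly under the root, neither has child groups, and the root group has
 * no busy queues of its own.  For A the leaf step contributes a factor of 1
 * (its leaf_weight equals its own children_weight), and the step to the root
 * gives vfr = 4096 * 500 / 1500 ~= 1365, i.e. roughly 1/3 of the service;
 * B similarly ends up with ~2730, roughly 2/3.  The two vfractions sum to
 * ~4096, one in CFQ_SERVICE_SHIFT fixed point, as described in the comment
 * in struct cfq_group.
 */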
1308
1309static void
1310cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1311{
1312	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1313	struct cfq_group *__cfqg;
1314	struct rb_node *n;
1315
1316	cfqg->nr_cfqq++;
1317	if (!RB_EMPTY_NODE(&cfqg->rb_node))
1318		return;
1319
1320	/*
1321	 * Currently put the group at the end. Later, implement something
1322	 * so that groups get lesser vtime based on their weights, so that
1323	 * a group does not lose everything if it was not continuously backlogged.
1324	 */
1325	n = rb_last(&st->rb);
1326	if (n) {
1327		__cfqg = rb_entry_cfqg(n);
1328		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
1329	} else
1330		cfqg->vdisktime = st->min_vdisktime;
1331	cfq_group_service_tree_add(st, cfqg);
1332}
1333
1334static void
1335cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1336{
1337	struct cfq_group *pos = cfqg;
1338	bool propagate;
1339
1340	/*
1341	 * Undo activation from cfq_group_service_tree_add().  Deactivate
1342	 * @cfqg and propagate deactivation upwards.
1343	 */
1344	propagate = !--pos->nr_active;
1345	pos->children_weight -= pos->leaf_weight;
1346
1347	while (propagate) {
1348		struct cfq_group *parent = cfqg_flat_parent(pos);
1349
1350		/* @pos has 0 nr_active at this point */
1351		WARN_ON_ONCE(pos->children_weight);
1352		pos->vfraction = 0;
1353
1354		if (!parent)
1355			break;
1356
1357		propagate = !--parent->nr_active;
1358		parent->children_weight -= pos->weight;
1359		pos = parent;
1360	}
1361
1362	/* remove from the service tree */
1363	st->total_weight -= cfqg->weight;
1364	if (!RB_EMPTY_NODE(&cfqg->rb_node))
1365		cfq_rb_erase(&cfqg->rb_node, st);
1366}
1367
1368static void
1369cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1370{
1371	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1372
1373	BUG_ON(cfqg->nr_cfqq < 1);
1374	cfqg->nr_cfqq--;
1375
1376	/* If there are other cfq queues under this group, don't delete it */
1377	if (cfqg->nr_cfqq)
1378		return;
1379
1380	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1381	cfq_group_service_tree_del(st, cfqg);
1382	cfqg->saved_wl_slice = 0;
1383	cfqg_stats_update_dequeue(cfqg);
1384}
1385
1386static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1387						unsigned int *unaccounted_time)
1388{
1389	unsigned int slice_used;
1390
1391	/*
1392	 * Queue got expired before even a single request completed or
1393	 * got expired immediately after first request completion.
1394	 */
1395	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
1396		/*
1397		 * Also charge the seek time incurred to the group; otherwise,
1398		 * if there are multiple queues in the group, each can dispatch
1399		 * a single request on seeky media and cause lots of seek time
1400		 * that the group would never account for.
1401		 */
1402		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
1403					1);
1404	} else {
1405		slice_used = jiffies - cfqq->slice_start;
1406		if (slice_used > cfqq->allocated_slice) {
1407			*unaccounted_time = slice_used - cfqq->allocated_slice;
1408			slice_used = cfqq->allocated_slice;
1409		}
1410		if (time_after(cfqq->slice_start, cfqq->dispatch_start))
1411			*unaccounted_time += cfqq->slice_start -
1412					cfqq->dispatch_start;
1413	}
1414
1415	return slice_used;
1416}
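
/*
 * Example for the accounting above: if a queue was scheduled in at
 * dispatch_start = 1000, its first request completed at slice_start = 1010
 * with allocated_slice = 100, and it is expired at jiffies = 1150, then
 * slice_used is capped at the allocated 100, while the 40 jiffies of overrun
 * plus the 10 jiffies between dispatch_start and slice_start are reported as
 * unaccounted_time.
 */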
1417
1418static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1419				struct cfq_queue *cfqq)
1420{
1421	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1422	unsigned int used_sl, charge, unaccounted_sl = 0;
1423	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1424			- cfqg->service_tree_idle.count;
1425	unsigned int vfr;
1426
1427	BUG_ON(nr_sync < 0);
1428	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1429
1430	if (iops_mode(cfqd))
1431		charge = cfqq->slice_dispatch;
1432	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1433		charge = cfqq->allocated_slice;
1434
1435	/*
1436	 * Can't update vdisktime while on service tree and cfqg->vfraction
1437	 * is valid only while on it.  Cache vfr, leave the service tree,
1438	 * update vdisktime and go back on.  The re-addition to the tree
1439	 * will also update the weights as necessary.
1440	 */
1441	vfr = cfqg->vfraction;
1442	cfq_group_service_tree_del(st, cfqg);
1443	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1444	cfq_group_service_tree_add(st, cfqg);
1445
1446	/* This group is being expired. Save the context */
1447	if (time_after(cfqd->workload_expires, jiffies)) {
1448		cfqg->saved_wl_slice = cfqd->workload_expires
1449						- jiffies;
1450		cfqg->saved_wl_type = cfqd->serving_wl_type;
1451		cfqg->saved_wl_class = cfqd->serving_wl_class;
1452	} else
1453		cfqg->saved_wl_slice = 0;
1454
1455	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1456					st->min_vdisktime);
1457	cfq_log_cfqq(cfqq->cfqd, cfqq,
1458		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1459		     used_sl, cfqq->slice_dispatch, charge,
1460		     iops_mode(cfqd), cfqq->nr_sectors);
1461	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1462	cfqg_stats_set_start_empty_time(cfqg);
1463}
1464
1465/**
1466 * cfq_init_cfqg_base - initialize base part of a cfq_group
1467 * @cfqg: cfq_group to initialize
1468 *
1469 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1470 * is enabled or not.
1471 */
1472static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1473{
1474	struct cfq_rb_root *st;
1475	int i, j;
1476
1477	for_each_cfqg_st(cfqg, i, j, st)
1478		*st = CFQ_RB_ROOT;
1479	RB_CLEAR_NODE(&cfqg->rb_node);
1480
1481	cfqg->ttime.last_end_request = jiffies;
1482}
1483
1484#ifdef CONFIG_CFQ_GROUP_IOSCHED
1485static void cfq_pd_init(struct blkcg_gq *blkg)
1486{
1487	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1488
1489	cfq_init_cfqg_base(cfqg);
1490	cfqg->weight = blkg->blkcg->cfq_weight;
1491	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
1492}
1493
1494/*
1495 * Search for the cfq group the current task belongs to. The request_queue lock must
1496 * be held.
1497 */
1498static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1499						struct blkcg *blkcg)
1500{
1501	struct request_queue *q = cfqd->queue;
1502	struct cfq_group *cfqg = NULL;
1503
1504	/* avoid lookup for the common case where there's no blkcg */
1505	if (blkcg == &blkcg_root) {
1506		cfqg = cfqd->root_group;
1507	} else {
1508		struct blkcg_gq *blkg;
1509
1510		blkg = blkg_lookup_create(blkcg, q);
1511		if (!IS_ERR(blkg))
1512			cfqg = blkg_to_cfqg(blkg);
1513	}
1514
1515	return cfqg;
1516}
1517
1518static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1519{
1520	/* Currently, all async queues are mapped to root group */
1521	if (!cfq_cfqq_sync(cfqq))
1522		cfqg = cfqq->cfqd->root_group;
1523
1524	cfqq->cfqg = cfqg;
1525	/* cfqq reference on cfqg */
1526	cfqg_get(cfqg);
1527}
1528
1529static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1530				     struct blkg_policy_data *pd, int off)
1531{
1532	struct cfq_group *cfqg = pd_to_cfqg(pd);
1533
1534	if (!cfqg->dev_weight)
1535		return 0;
1536	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1537}
1538
1539static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1540				    struct seq_file *sf)
1541{
1542	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1543			  cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1544			  false);
1545	return 0;
1546}
1547
1548static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1549					  struct blkg_policy_data *pd, int off)
1550{
1551	struct cfq_group *cfqg = pd_to_cfqg(pd);
1552
1553	if (!cfqg->dev_leaf_weight)
1554		return 0;
1555	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1556}
1557
1558static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
1559					 struct cftype *cft,
1560					 struct seq_file *sf)
1561{
1562	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1563			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
1564			  false);
1565	return 0;
1566}
1567
1568static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1569			    struct seq_file *sf)
1570{
1571	seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
1572	return 0;
1573}
1574
1575static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
1576				 struct seq_file *sf)
1577{
1578	seq_printf(sf, "%u\n",
1579		   cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
1580	return 0;
1581}
1582
1583static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1584				    const char *buf, bool is_leaf_weight)
1585{
1586	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1587	struct blkg_conf_ctx ctx;
1588	struct cfq_group *cfqg;
1589	int ret;
1590
1591	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1592	if (ret)
1593		return ret;
1594
1595	ret = -EINVAL;
1596	cfqg = blkg_to_cfqg(ctx.blkg);
1597	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1598		if (!is_leaf_weight) {
1599			cfqg->dev_weight = ctx.v;
1600			cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
1601		} else {
1602			cfqg->dev_leaf_weight = ctx.v;
1603			cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
1604		}
1605		ret = 0;
1606	}
1607
1608	blkg_conf_finish(&ctx);
1609	return ret;
1610}
1611
1612static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1613				  const char *buf)
1614{
1615	return __cfqg_set_weight_device(cgrp, cft, buf, false);
1616}
1617
1618static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
1619				       const char *buf)
1620{
1621	return __cfqg_set_weight_device(cgrp, cft, buf, true);
1622}
1623
1624static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
1625			    bool is_leaf_weight)
1626{
1627	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1628	struct blkcg_gq *blkg;
1629	struct hlist_node *n;
1630
1631	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1632		return -EINVAL;
1633
1634	spin_lock_irq(&blkcg->lock);
1635
1636	if (!is_leaf_weight)
1637		blkcg->cfq_weight = val;
1638	else
1639		blkcg->cfq_leaf_weight = val;
1640
1641	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1642		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1643
1644		if (!cfqg)
1645			continue;
1646
1647		if (!is_leaf_weight) {
1648			if (!cfqg->dev_weight)
1649				cfqg->new_weight = blkcg->cfq_weight;
1650		} else {
1651			if (!cfqg->dev_leaf_weight)
1652				cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
1653		}
1654	}
1655
1656	spin_unlock_irq(&blkcg->lock);
1657	return 0;
1658}
1659
1660static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1661{
1662	return __cfq_set_weight(cgrp, cft, val, false);
1663}
1664
1665static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1666{
1667	return __cfq_set_weight(cgrp, cft, val, true);
1668}
1669
1670static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1671			   struct seq_file *sf)
1672{
1673	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1674
1675	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
1676			  cft->private, false);
1677	return 0;
1678}
1679
1680static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1681			     struct seq_file *sf)
1682{
1683	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1684
1685	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1686			  cft->private, true);
1687	return 0;
1688}
1689
1690#ifdef CONFIG_DEBUG_BLK_CGROUP
1691static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1692				      struct blkg_policy_data *pd, int off)
1693{
1694	struct cfq_group *cfqg = pd_to_cfqg(pd);
1695	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1696	u64 v = 0;
1697
1698	if (samples) {
1699		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1700		do_div(v, samples);
1701	}
1702	__blkg_prfill_u64(sf, pd, v);
1703	return 0;
1704}
1705
1706/* print avg_queue_size */
1707static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1708				     struct seq_file *sf)
1709{
1710	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1711
1712	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1713			  &blkcg_policy_cfq, 0, false);
1714	return 0;
1715}
1716#endif	/* CONFIG_DEBUG_BLK_CGROUP */
1717
1718static struct cftype cfq_blkcg_files[] = {
1719	/* on root, weight is mapped to leaf_weight */
1720	{
1721		.name = "weight_device",
1722		.flags = CFTYPE_ONLY_ON_ROOT,
1723		.read_seq_string = cfqg_print_leaf_weight_device,
1724		.write_string = cfqg_set_leaf_weight_device,
1725		.max_write_len = 256,
1726	},
1727	{
1728		.name = "weight",
1729		.flags = CFTYPE_ONLY_ON_ROOT,
1730		.read_seq_string = cfq_print_leaf_weight,
1731		.write_u64 = cfq_set_leaf_weight,
1732	},
1733
1734	/* no such mapping necessary for !roots */
1735	{
1736		.name = "weight_device",
1737		.flags = CFTYPE_NOT_ON_ROOT,
1738		.read_seq_string = cfqg_print_weight_device,
1739		.write_string = cfqg_set_weight_device,
1740		.max_write_len = 256,
1741	},
1742	{
1743		.name = "weight",
1744		.flags = CFTYPE_NOT_ON_ROOT,
1745		.read_seq_string = cfq_print_weight,
1746		.write_u64 = cfq_set_weight,
1747	},
1748
1749	{
1750		.name = "leaf_weight_device",
1751		.read_seq_string = cfqg_print_leaf_weight_device,
1752		.write_string = cfqg_set_leaf_weight_device,
1753		.max_write_len = 256,
1754	},
1755	{
1756		.name = "leaf_weight",
1757		.read_seq_string = cfq_print_leaf_weight,
1758		.write_u64 = cfq_set_leaf_weight,
1759	},
1760
1761	{
1762		.name = "time",
1763		.private = offsetof(struct cfq_group, stats.time),
1764		.read_seq_string = cfqg_print_stat,
1765	},
1766	{
1767		.name = "sectors",
1768		.private = offsetof(struct cfq_group, stats.sectors),
1769		.read_seq_string = cfqg_print_stat,
1770	},
1771	{
1772		.name = "io_service_bytes",
1773		.private = offsetof(struct cfq_group, stats.service_bytes),
1774		.read_seq_string = cfqg_print_rwstat,
1775	},
1776	{
1777		.name = "io_serviced",
1778		.private = offsetof(struct cfq_group, stats.serviced),
1779		.read_seq_string = cfqg_print_rwstat,
1780	},
1781	{
1782		.name = "io_service_time",
1783		.private = offsetof(struct cfq_group, stats.service_time),
1784		.read_seq_string = cfqg_print_rwstat,
1785	},
1786	{
1787		.name = "io_wait_time",
1788		.private = offsetof(struct cfq_group, stats.wait_time),
1789		.read_seq_string = cfqg_print_rwstat,
1790	},
1791	{
1792		.name = "io_merged",
1793		.private = offsetof(struct cfq_group, stats.merged),
1794		.read_seq_string = cfqg_print_rwstat,
1795	},
1796	{
1797		.name = "io_queued",
1798		.private = offsetof(struct cfq_group, stats.queued),
1799		.read_seq_string = cfqg_print_rwstat,
1800	},
1801#ifdef CONFIG_DEBUG_BLK_CGROUP
1802	{
1803		.name = "avg_queue_size",
1804		.read_seq_string = cfqg_print_avg_queue_size,
1805	},
1806	{
1807		.name = "group_wait_time",
1808		.private = offsetof(struct cfq_group, stats.group_wait_time),
1809		.read_seq_string = cfqg_print_stat,
1810	},
1811	{
1812		.name = "idle_time",
1813		.private = offsetof(struct cfq_group, stats.idle_time),
1814		.read_seq_string = cfqg_print_stat,
1815	},
1816	{
1817		.name = "empty_time",
1818		.private = offsetof(struct cfq_group, stats.empty_time),
1819		.read_seq_string = cfqg_print_stat,
1820	},
1821	{
1822		.name = "dequeue",
1823		.private = offsetof(struct cfq_group, stats.dequeue),
1824		.read_seq_string = cfqg_print_stat,
1825	},
1826	{
1827		.name = "unaccounted_time",
1828		.private = offsetof(struct cfq_group, stats.unaccounted_time),
1829		.read_seq_string = cfqg_print_stat,
1830	},
1831#endif	/* CONFIG_DEBUG_BLK_CGROUP */
1832	{ }	/* terminate */
1833};
1834#else /* GROUP_IOSCHED */
1835static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1836						struct blkcg *blkcg)
1837{
1838	return cfqd->root_group;
1839}
1840
1841static inline void
1842cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1843	cfqq->cfqg = cfqg;
1844}
1845
1846#endif /* GROUP_IOSCHED */
1847
1848/*
1849 * The service trees hold all pending cfq_queues that have
1850 * requests waiting to be processed. It is sorted in the order that
1851 * we will service the queues.
1852 */
1853static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1854				 bool add_front)
1855{
1856	struct rb_node **p, *parent;
1857	struct cfq_queue *__cfqq;
1858	unsigned long rb_key;
1859	struct cfq_rb_root *st;
1860	int left;
1861	int new_cfqq = 1;
1862
1863	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
1864	if (cfq_class_idle(cfqq)) {
1865		rb_key = CFQ_IDLE_DELAY;
1866		parent = rb_last(&st->rb);
1867		if (parent && parent != &cfqq->rb_node) {
1868			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1869			rb_key += __cfqq->rb_key;
1870		} else
1871			rb_key += jiffies;
1872	} else if (!add_front) {
1873		/*
1874		 * Get our rb key offset. Subtract any residual slice
1875		 * value carried from last service. A negative resid
1876		 * count indicates slice overrun, and this should position
1877		 * the next service time further away in the tree.
1878		 */
1879		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1880		rb_key -= cfqq->slice_resid;
1881		cfqq->slice_resid = 0;
1882	} else {
1883		rb_key = -HZ;
1884		__cfqq = cfq_rb_first(st);
1885		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1886	}
1887
1888	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1889		new_cfqq = 0;
1890		/*
1891		 * same position, nothing more to do
1892		 */
1893		if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
1894			return;
1895
1896		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1897		cfqq->service_tree = NULL;
1898	}
1899
1900	left = 1;
1901	parent = NULL;
1902	cfqq->service_tree = st;
1903	p = &st->rb.rb_node;
1904	while (*p) {
1905		parent = *p;
1906		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1907
1908		/*
1909		 * sort by key, which represents service time.
1910		 */
1911		if (time_before(rb_key, __cfqq->rb_key))
1912			p = &parent->rb_left;
1913		else {
1914			p = &parent->rb_right;
1915			left = 0;
1916		}
1917	}
1918
1919	if (left)
1920		st->left = &cfqq->rb_node;
1921
1922	cfqq->rb_key = rb_key;
1923	rb_link_node(&cfqq->rb_node, parent, p);
1924	rb_insert_color(&cfqq->rb_node, &st->rb);
1925	st->count++;
1926	if (add_front || !new_cfqq)
1927		return;
1928	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1929}
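
/*
 * Key ordering sketch for cfq_service_tree_add() above: a smaller rb_key
 * sorts to the left and is served earlier.  add_front inserts just ahead
 * of the current leftmost entry (its key minus HZ), idle class queues are
 * keyed past the last entry plus CFQ_IDLE_DELAY so they always trail, and
 * a negative slice_resid (slice overrun) makes the key larger, e.g. a
 * queue that overran its slice by 10ms re-enters roughly 10ms further to
 * the right than it otherwise would.
 */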
1930
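/*
 * Search @root (a per-priority tree of cfqqs keyed by the sector of their
 * next_rq) for a queue whose next request starts exactly at @sector.
 * Returns that queue or NULL; in either case *ret_parent is set to the
 * last node visited, which callers such as cfqq_close() use to find the
 * closest queue, and *rb_link (if given) to the link for a new insertion.
 */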
1931static struct cfq_queue *
1932cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1933		     sector_t sector, struct rb_node **ret_parent,
1934		     struct rb_node ***rb_link)
1935{
1936	struct rb_node **p, *parent;
1937	struct cfq_queue *cfqq = NULL;
1938
1939	parent = NULL;
1940	p = &root->rb_node;
1941	while (*p) {
1942		struct rb_node **n;
1943
1944		parent = *p;
1945		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1946
1947		/*
1948		 * Sort strictly based on sector.  Smallest to the left,
1949		 * largest to the right.
1950		 */
1951		if (sector > blk_rq_pos(cfqq->next_rq))
1952			n = &(*p)->rb_right;
1953		else if (sector < blk_rq_pos(cfqq->next_rq))
1954			n = &(*p)->rb_left;
1955		else
1956			break;
1957		p = n;
1958		cfqq = NULL;
1959	}
1960
1961	*ret_parent = parent;
1962	if (rb_link)
1963		*rb_link = p;
1964	return cfqq;
1965}
1966
1967static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1968{
1969	struct rb_node **p, *parent;
1970	struct cfq_queue *__cfqq;
1971
1972	if (cfqq->p_root) {
1973		rb_erase(&cfqq->p_node, cfqq->p_root);
1974		cfqq->p_root = NULL;
1975	}
1976
1977	if (cfq_class_idle(cfqq))
1978		return;
1979	if (!cfqq->next_rq)
1980		return;
1981
1982	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1983	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1984				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1985	if (!__cfqq) {
1986		rb_link_node(&cfqq->p_node, parent, p);
1987		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1988	} else
1989		cfqq->p_root = NULL;
1990}
1991
1992/*
1993 * Update cfqq's position in the service tree.
1994 */
1995static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1996{
1997	/*
1998	 * Resorting requires the cfqq to be on the RR list already.
1999	 */
2000	if (cfq_cfqq_on_rr(cfqq)) {
2001		cfq_service_tree_add(cfqd, cfqq, 0);
2002		cfq_prio_tree_add(cfqd, cfqq);
2003	}
2004}
2005
2006/*
2007 * add to busy list of queues for service, trying to be fair in ordering
2008 * the pending list according to last request service
2009 */
2010static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2011{
2012	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2013	BUG_ON(cfq_cfqq_on_rr(cfqq));
2014	cfq_mark_cfqq_on_rr(cfqq);
2015	cfqd->busy_queues++;
2016	if (cfq_cfqq_sync(cfqq))
2017		cfqd->busy_sync_queues++;
2018
2019	cfq_resort_rr_list(cfqd, cfqq);
2020}
2021
2022/*
2023 * Called when the cfqq no longer has requests pending, remove it from
2024 * the service tree.
2025 */
2026static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2027{
2028	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2029	BUG_ON(!cfq_cfqq_on_rr(cfqq));
2030	cfq_clear_cfqq_on_rr(cfqq);
2031
2032	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2033		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2034		cfqq->service_tree = NULL;
2035	}
2036	if (cfqq->p_root) {
2037		rb_erase(&cfqq->p_node, cfqq->p_root);
2038		cfqq->p_root = NULL;
2039	}
2040
2041	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2042	BUG_ON(!cfqd->busy_queues);
2043	cfqd->busy_queues--;
2044	if (cfq_cfqq_sync(cfqq))
2045		cfqd->busy_sync_queues--;
2046}
2047
2048/*
2049 * rb tree support functions
2050 */
2051static void cfq_del_rq_rb(struct request *rq)
2052{
2053	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2054	const int sync = rq_is_sync(rq);
2055
2056	BUG_ON(!cfqq->queued[sync]);
2057	cfqq->queued[sync]--;
2058
2059	elv_rb_del(&cfqq->sort_list, rq);
2060
2061	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2062		/*
2063		 * Queue will be deleted from service tree when we actually
2064		 * expire it later. Right now just remove it from prio tree
2065		 * as it is empty.
2066		 */
2067		if (cfqq->p_root) {
2068			rb_erase(&cfqq->p_node, cfqq->p_root);
2069			cfqq->p_root = NULL;
2070		}
2071	}
2072}
2073
2074static void cfq_add_rq_rb(struct request *rq)
2075{
2076	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2077	struct cfq_data *cfqd = cfqq->cfqd;
2078	struct request *prev;
2079
2080	cfqq->queued[rq_is_sync(rq)]++;
2081
2082	elv_rb_add(&cfqq->sort_list, rq);
2083
2084	if (!cfq_cfqq_on_rr(cfqq))
2085		cfq_add_cfqq_rr(cfqd, cfqq);
2086
2087	/*
2088	 * check if this request is a better next-serve candidate
2089	 */
2090	prev = cfqq->next_rq;
2091	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2092
2093	/*
2094	 * adjust priority tree position, if ->next_rq changes
2095	 */
2096	if (prev != cfqq->next_rq)
2097		cfq_prio_tree_add(cfqd, cfqq);
2098
2099	BUG_ON(!cfqq->next_rq);
2100}
2101
2102static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2103{
2104	elv_rb_del(&cfqq->sort_list, rq);
2105	cfqq->queued[rq_is_sync(rq)]--;
2106	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2107	cfq_add_rq_rb(rq);
2108	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2109				 rq->cmd_flags);
2110}
2111
2112static struct request *
2113cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2114{
2115	struct task_struct *tsk = current;
2116	struct cfq_io_cq *cic;
2117	struct cfq_queue *cfqq;
2118
2119	cic = cfq_cic_lookup(cfqd, tsk->io_context);
2120	if (!cic)
2121		return NULL;
2122
2123	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2124	if (cfqq) {
2125		sector_t sector = bio->bi_sector + bio_sectors(bio);
2126
2127		return elv_rb_find(&cfqq->sort_list, sector);
2128	}
2129
2130	return NULL;
2131}
2132
2133static void cfq_activate_request(struct request_queue *q, struct request *rq)
2134{
2135	struct cfq_data *cfqd = q->elevator->elevator_data;
2136
2137	cfqd->rq_in_driver++;
2138	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2139						cfqd->rq_in_driver);
2140
2141	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2142}
2143
2144static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2145{
2146	struct cfq_data *cfqd = q->elevator->elevator_data;
2147
2148	WARN_ON(!cfqd->rq_in_driver);
2149	cfqd->rq_in_driver--;
2150	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2151						cfqd->rq_in_driver);
2152}
2153
2154static void cfq_remove_request(struct request *rq)
2155{
2156	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2157
2158	if (cfqq->next_rq == rq)
2159		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2160
2161	list_del_init(&rq->queuelist);
2162	cfq_del_rq_rb(rq);
2163
2164	cfqq->cfqd->rq_queued--;
2165	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2166	if (rq->cmd_flags & REQ_PRIO) {
2167		WARN_ON(!cfqq->prio_pending);
2168		cfqq->prio_pending--;
2169	}
2170}
2171
2172static int cfq_merge(struct request_queue *q, struct request **req,
2173		     struct bio *bio)
2174{
2175	struct cfq_data *cfqd = q->elevator->elevator_data;
2176	struct request *__rq;
2177
2178	__rq = cfq_find_rq_fmerge(cfqd, bio);
2179	if (__rq && elv_rq_merge_ok(__rq, bio)) {
2180		*req = __rq;
2181		return ELEVATOR_FRONT_MERGE;
2182	}
2183
2184	return ELEVATOR_NO_MERGE;
2185}
2186
2187static void cfq_merged_request(struct request_queue *q, struct request *req,
2188			       int type)
2189{
2190	if (type == ELEVATOR_FRONT_MERGE) {
2191		struct cfq_queue *cfqq = RQ_CFQQ(req);
2192
2193		cfq_reposition_rq_rb(cfqq, req);
2194	}
2195}
2196
2197static void cfq_bio_merged(struct request_queue *q, struct request *req,
2198				struct bio *bio)
2199{
2200	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
2201}
2202
2203static void
2204cfq_merged_requests(struct request_queue *q, struct request *rq,
2205		    struct request *next)
2206{
2207	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2208	struct cfq_data *cfqd = q->elevator->elevator_data;
2209
2210	/*
2211	 * reposition in fifo if next is older than rq
2212	 */
2213	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2214	    time_before(rq_fifo_time(next), rq_fifo_time(rq)) &&
2215	    cfqq == RQ_CFQQ(next)) {
2216		list_move(&rq->queuelist, &next->queuelist);
2217		rq_set_fifo_time(rq, rq_fifo_time(next));
2218	}
2219
2220	if (cfqq->next_rq == next)
2221		cfqq->next_rq = rq;
2222	cfq_remove_request(next);
2223	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2224
2225	cfqq = RQ_CFQQ(next);
2226	/*
2227	 * All requests of this queue have been merged into other queues; delete
2228	 * it from the service tree. If it is the active_queue,
2229	 * cfq_dispatch_requests() will choose to expire it or idle on it.
2230	 */
2231	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2232	    cfqq != cfqd->active_queue)
2233		cfq_del_cfqq_rr(cfqd, cfqq);
2234}
2235
2236static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2237			   struct bio *bio)
2238{
2239	struct cfq_data *cfqd = q->elevator->elevator_data;
2240	struct cfq_io_cq *cic;
2241	struct cfq_queue *cfqq;
2242
2243	/*
2244	 * Disallow merge of a sync bio into an async request.
2245	 */
2246	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
2247		return false;
2248
2249	/*
2250	 * Look up the cfqq that this bio will be queued with and allow the
2251	 * merge only if rq is queued there.
2252	 */
2253	cic = cfq_cic_lookup(cfqd, current->io_context);
2254	if (!cic)
2255		return false;
2256
2257	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2258	return cfqq == RQ_CFQQ(rq);
2259}
2260
2261static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2262{
2263	del_timer(&cfqd->idle_slice_timer);
2264	cfqg_stats_update_idle_time(cfqq->cfqg);
2265}
2266
2267static void __cfq_set_active_queue(struct cfq_data *cfqd,
2268				   struct cfq_queue *cfqq)
2269{
2270	if (cfqq) {
2271		cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2272				cfqd->serving_wl_class, cfqd->serving_wl_type);
2273		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2274		cfqq->slice_start = 0;
2275		cfqq->dispatch_start = jiffies;
2276		cfqq->allocated_slice = 0;
2277		cfqq->slice_end = 0;
2278		cfqq->slice_dispatch = 0;
2279		cfqq->nr_sectors = 0;
2280
2281		cfq_clear_cfqq_wait_request(cfqq);
2282		cfq_clear_cfqq_must_dispatch(cfqq);
2283		cfq_clear_cfqq_must_alloc_slice(cfqq);
2284		cfq_clear_cfqq_fifo_expire(cfqq);
2285		cfq_mark_cfqq_slice_new(cfqq);
2286
2287		cfq_del_timer(cfqd, cfqq);
2288	}
2289
2290	cfqd->active_queue = cfqq;
2291}
2292
2293/*
2294 * current cfqq expired its slice (or was too idle), select new one
2295 */
2296static void
2297__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2298		    bool timed_out)
2299{
2300	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2301
2302	if (cfq_cfqq_wait_request(cfqq))
2303		cfq_del_timer(cfqd, cfqq);
2304
2305	cfq_clear_cfqq_wait_request(cfqq);
2306	cfq_clear_cfqq_wait_busy(cfqq);
2307
2308	/*
2309	 * If this cfqq is shared between multiple processes, check to
2310	 * make sure that those processes are still issuing I/Os within
2311	 * the mean seek distance.  If not, it may be time to break the
2312	 * queues apart again.
2313	 */
2314	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2315		cfq_mark_cfqq_split_coop(cfqq);
2316
2317	/*
2318	 * store what was left of this slice, if the queue idled/timed out
2319	 */
2320	if (timed_out) {
2321		if (cfq_cfqq_slice_new(cfqq))
2322			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2323		else
2324			cfqq->slice_resid = cfqq->slice_end - jiffies;
2325		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
2326	}
2327
2328	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2329
2330	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2331		cfq_del_cfqq_rr(cfqd, cfqq);
2332
2333	cfq_resort_rr_list(cfqd, cfqq);
2334
2335	if (cfqq == cfqd->active_queue)
2336		cfqd->active_queue = NULL;
2337
2338	if (cfqd->active_cic) {
2339		put_io_context(cfqd->active_cic->icq.ioc);
2340		cfqd->active_cic = NULL;
2341	}
2342}
2343
2344static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2345{
2346	struct cfq_queue *cfqq = cfqd->active_queue;
2347
2348	if (cfqq)
2349		__cfq_slice_expired(cfqd, cfqq, timed_out);
2350}
2351
2352/*
2353 * Get next queue for service. Unless we have a queue preemption,
2354 * we'll simply select the first cfqq in the service tree.
2355 */
2356static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2357{
2358	struct cfq_rb_root *st = st_for(cfqd->serving_group,
2359			cfqd->serving_wl_class, cfqd->serving_wl_type);
2360
2361	if (!cfqd->rq_queued)
2362		return NULL;
2363
2364	/* There is nothing to dispatch */
2365	if (!st)
2366		return NULL;
2367	if (RB_EMPTY_ROOT(&st->rb))
2368		return NULL;
2369	return cfq_rb_first(st);
2370}
2371
2372static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2373{
2374	struct cfq_group *cfqg;
2375	struct cfq_queue *cfqq;
2376	int i, j;
2377	struct cfq_rb_root *st;
2378
2379	if (!cfqd->rq_queued)
2380		return NULL;
2381
2382	cfqg = cfq_get_next_cfqg(cfqd);
2383	if (!cfqg)
2384		return NULL;
2385
2386	for_each_cfqg_st(cfqg, i, j, st)
2387		if ((cfqq = cfq_rb_first(st)) != NULL)
2388			return cfqq;
2389	return NULL;
2390}
2391
2392/*
2393 * Get and set a new active queue for service.
2394 */
2395static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2396					      struct cfq_queue *cfqq)
2397{
2398	if (!cfqq)
2399		cfqq = cfq_get_next_queue(cfqd);
2400
2401	__cfq_set_active_queue(cfqd, cfqq);
2402	return cfqq;
2403}
2404
2405static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2406					  struct request *rq)
2407{
2408	if (blk_rq_pos(rq) >= cfqd->last_position)
2409		return blk_rq_pos(rq) - cfqd->last_position;
2410	else
2411		return cfqd->last_position - blk_rq_pos(rq);
2412}
2413
2414static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2415			       struct request *rq)
2416{
2417	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2418}
2419
2420static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2421				    struct cfq_queue *cur_cfqq)
2422{
2423	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2424	struct rb_node *parent, *node;
2425	struct cfq_queue *__cfqq;
2426	sector_t sector = cfqd->last_position;
2427
2428	if (RB_EMPTY_ROOT(root))
2429		return NULL;
2430
2431	/*
2432	 * First, if we find a request starting at the end of the last
2433	 * request, choose it.
2434	 */
2435	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2436	if (__cfqq)
2437		return __cfqq;
2438
2439	/*
2440	 * If the exact sector wasn't found, the parent of the NULL leaf
2441	 * will contain the closest sector.
2442	 */
2443	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
2444	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2445		return __cfqq;
2446
2447	if (blk_rq_pos(__cfqq->next_rq) < sector)
2448		node = rb_next(&__cfqq->p_node);
2449	else
2450		node = rb_prev(&__cfqq->p_node);
2451	if (!node)
2452		return NULL;
2453
2454	__cfqq = rb_entry(node, struct cfq_queue, p_node);
2455	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2456		return __cfqq;
2457
2458	return NULL;
2459}
2460
2461/*
2462 * cfqd - obvious
2463 * cur_cfqq - passed in so that we don't decide that the current queue is
2464 * 	      closely cooperating with itself.
2465 *
2466 * So, basically we're assuming that cur_cfqq has dispatched at least
2467 * one request, and that cfqd->last_position reflects a position on the disk
2468 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2469 * assumption.
2470 */
2471static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2472					      struct cfq_queue *cur_cfqq)
2473{
2474	struct cfq_queue *cfqq;
2475
2476	if (cfq_class_idle(cur_cfqq))
2477		return NULL;
2478	if (!cfq_cfqq_sync(cur_cfqq))
2479		return NULL;
2480	if (CFQQ_SEEKY(cur_cfqq))
2481		return NULL;
2482
2483	/*
2484	 * Don't search priority tree if it's the only queue in the group.
2485	 */
2486	if (cur_cfqq->cfqg->nr_cfqq == 1)
2487		return NULL;
2488
2489	/*
2490	 * We should notice if some of the queues are cooperating, e.g.
2491	 * working closely on the same area of the disk. In that case,
2492	 * we can group them together and avoid wasting time idling.
2493	 */
2494	cfqq = cfqq_close(cfqd, cur_cfqq);
2495	if (!cfqq)
2496		return NULL;
2497
2498	/* If new queue belongs to different cfq_group, don't choose it */
2499	if (cur_cfqq->cfqg != cfqq->cfqg)
2500		return NULL;
2501
2502	/*
2503	 * It only makes sense to merge sync queues.
2504	 */
2505	if (!cfq_cfqq_sync(cfqq))
2506		return NULL;
2507	if (CFQQ_SEEKY(cfqq))
2508		return NULL;
2509
2510	/*
2511	 * Do not merge queues of different priority classes
2512	 */
2513	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2514		return NULL;
2515
2516	return cfqq;
2517}
2518
2519/*
2520 * Determine whether we should enforce idle window for this queue.
2521 */
2522
2523static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2524{
2525	enum wl_class_t wl_class = cfqq_class(cfqq);
2526	struct cfq_rb_root *st = cfqq->service_tree;
2527
2528	BUG_ON(!st);
2529	BUG_ON(!st->count);
2530
2531	if (!cfqd->cfq_slice_idle)
2532		return false;
2533
2534	/* We never do for idle class queues. */
2535	if (wl_class == IDLE_WORKLOAD)
2536		return false;
2537
2538	/* We do for queues that were marked with idle window flag. */
2539	if (cfq_cfqq_idle_window(cfqq) &&
2540	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2541		return true;
2542
2543	/*
2544	 * Otherwise, we idle only if the queue is the last one
2545	 * in its service tree.
2546	 */
2547	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2548	   !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2549		return true;
2550	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2551	return false;
2552}
2553
2554static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2555{
2556	struct cfq_queue *cfqq = cfqd->active_queue;
2557	struct cfq_io_cq *cic;
2558	unsigned long sl, group_idle = 0;
2559
2560	/*
2561	 * SSD device without seek penalty, disable idling. But only do so
2562	 * for devices that support queuing, otherwise we still have a problem
2563	 * with sync vs async workloads.
2564	 */
2565	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2566		return;
2567
2568	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2569	WARN_ON(cfq_cfqq_slice_new(cfqq));
2570
2571	/*
2572	 * idle is disabled, either manually or by past process history
2573	 */
2574	if (!cfq_should_idle(cfqd, cfqq)) {
2575		/* no queue idling. Check for group idling */
2576		if (cfqd->cfq_group_idle)
2577			group_idle = cfqd->cfq_group_idle;
2578		else
2579			return;
2580	}
2581
2582	/*
2583	 * still active requests from this queue, don't idle
2584	 */
2585	if (cfqq->dispatched)
2586		return;
2587
2588	/*
2589	 * task has exited, don't wait
2590	 */
2591	cic = cfqd->active_cic;
2592	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2593		return;
2594
2595	/*
2596	 * If our average think time is larger than the remaining time
2597	 * slice, then don't idle. This avoids overrunning the allotted
2598	 * time slice.
2599	 */
2600	if (sample_valid(cic->ttime.ttime_samples) &&
2601	    (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2602		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2603			     cic->ttime.ttime_mean);
2604		return;
2605	}
2606
2607	/* There are other queues in the group, don't do group idle */
2608	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2609		return;
2610
2611	cfq_mark_cfqq_wait_request(cfqq);
2612
2613	if (group_idle)
2614		sl = cfqd->cfq_group_idle;
2615	else
2616		sl = cfqd->cfq_slice_idle;
2617
2618	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2619	cfqg_stats_set_start_idle_time(cfqq->cfqg);
2620	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2621			group_idle ? 1 : 0);
2622}
2623
2624/*
2625 * Move request from internal lists to the request queue dispatch list.
2626 */
2627static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2628{
2629	struct cfq_data *cfqd = q->elevator->elevator_data;
2630	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2631
2632	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2633
2634	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2635	cfq_remove_request(rq);
2636	cfqq->dispatched++;
2637	(RQ_CFQG(rq))->dispatched++;
2638	elv_dispatch_sort(q, rq);
2639
2640	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2641	cfqq->nr_sectors += blk_rq_sectors(rq);
2642	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2643}
2644
2645/*
2646 * return expired entry, or NULL to just start from scratch in rbtree
2647 */
2648static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2649{
2650	struct request *rq = NULL;
2651
2652	if (cfq_cfqq_fifo_expire(cfqq))
2653		return NULL;
2654
2655	cfq_mark_cfqq_fifo_expire(cfqq);
2656
2657	if (list_empty(&cfqq->fifo))
2658		return NULL;
2659
2660	rq = rq_entry_fifo(cfqq->fifo.next);
2661	if (time_before(jiffies, rq_fifo_time(rq)))
2662		rq = NULL;
2663
2664	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2665	return rq;
2666}
2667
2668static inline int
2669cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2670{
2671	const int base_rq = cfqd->cfq_slice_async_rq;
2672
2673	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2674
2675	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2676}
2677
2678/*
2679 * Must be called with the queue_lock held.
2680 */
2681static int cfqq_process_refs(struct cfq_queue *cfqq)
2682{
2683	int process_refs, io_refs;
2684
2685	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2686	process_refs = cfqq->ref - io_refs;
2687	BUG_ON(process_refs < 0);
2688	return process_refs;
2689}
2690
2691static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2692{
2693	int process_refs, new_process_refs;
2694	struct cfq_queue *__cfqq;
2695
2696	/*
2697	 * If there are no process references on the new_cfqq, then it is
2698	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2699	 * chain may have dropped their last reference (not just their
2700	 * last process reference).
2701	 */
2702	if (!cfqq_process_refs(new_cfqq))
2703		return;
2704
2705	/* Avoid a circular list and skip interim queue merges */
2706	while ((__cfqq = new_cfqq->new_cfqq)) {
2707		if (__cfqq == cfqq)
2708			return;
2709		new_cfqq = __cfqq;
2710	}
2711
2712	process_refs = cfqq_process_refs(cfqq);
2713	new_process_refs = cfqq_process_refs(new_cfqq);
2714	/*
2715	 * If the process for the cfqq has gone away, there is no
2716	 * sense in merging the queues.
2717	 */
2718	if (process_refs == 0 || new_process_refs == 0)
2719		return;
2720
2721	/*
2722	 * Merge in the direction of the lesser amount of work.
2723	 */
2724	if (new_process_refs >= process_refs) {
2725		cfqq->new_cfqq = new_cfqq;
2726		new_cfqq->ref += process_refs;
2727	} else {
2728		new_cfqq->new_cfqq = cfqq;
2729		cfqq->ref += new_process_refs;
2730	}
2731}
2732
2733static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
2734			struct cfq_group *cfqg, enum wl_class_t wl_class)
2735{
2736	struct cfq_queue *queue;
2737	int i;
2738	bool key_valid = false;
2739	unsigned long lowest_key = 0;
2740	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2741
2742	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2743		/* select the one with lowest rb_key */
2744		queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2745		if (queue &&
2746		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
2747			lowest_key = queue->rb_key;
2748			cur_best = i;
2749			key_valid = true;
2750		}
2751	}
2752
2753	return cur_best;
2754}
2755
2756static void
2757choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2758{
2759	unsigned slice;
2760	unsigned count;
2761	struct cfq_rb_root *st;
2762	unsigned group_slice;
2763	enum wl_class_t original_class = cfqd->serving_wl_class;
2764
2765	/* Choose next priority. RT > BE > IDLE */
2766	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2767		cfqd->serving_wl_class = RT_WORKLOAD;
2768	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2769		cfqd->serving_wl_class = BE_WORKLOAD;
2770	else {
2771		cfqd->serving_wl_class = IDLE_WORKLOAD;
2772		cfqd->workload_expires = jiffies + 1;
2773		return;
2774	}
2775
2776	if (original_class != cfqd->serving_wl_class)
2777		goto new_workload;
2778
2779	/*
2780	 * For RT and BE, we also have to choose the type
2781	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2782	 * expiration time
2783	 */
2784	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2785	count = st->count;
2786
2787	/*
2788	 * check workload expiration, and that we still have other queues ready
2789	 */
2790	if (count && !time_after(jiffies, cfqd->workload_expires))
2791		return;
2792
2793new_workload:
2794	/* otherwise select new workload type */
2795	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
2796					cfqd->serving_wl_class);
2797	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2798	count = st->count;
2799
2800	/*
2801	 * the workload slice is computed as a fraction of target latency
2802	 * proportional to the number of queues in that workload, over
2803	 * all the queues in the same priority class
2804	 */
2805	group_slice = cfq_group_slice(cfqd, cfqg);
2806
2807	slice = group_slice * count /
2808		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
2809		      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
2810					cfqg));
2811
2812	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
2813		unsigned int tmp;
2814
2815		/*
2816		 * Async queues are currently system wide. Just taking the
2817		 * proportion of queues within the same group will lead to a
2818		 * higher async ratio system wide, as the root group generally
2819		 * has a higher weight. A more accurate approach would be to
2820		 * calculate the system wide async/sync ratio.
2821		 */
2822		tmp = cfqd->cfq_target_latency *
2823			cfqg_busy_async_queues(cfqd, cfqg);
2824		tmp = tmp/cfqd->busy_queues;
2825		slice = min_t(unsigned, slice, tmp);
2826
2827		/* async workload slice is scaled down according to
2828		 * the sync/async slice ratio. */
2829		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2830	} else
2831		/* sync workload slice is at least 2 * cfq_slice_idle */
2832		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2833
2834	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2835	cfq_log(cfqd, "workload slice:%d", slice);
2836	cfqd->workload_expires = jiffies + slice;
2837}
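
/*
 * Worked example for the workload slice computed above: if
 * cfq_group_slice() comes to 300ms worth of jiffies for this group and the
 * chosen workload type owns 2 of the 6 busy queues in its class, the slice
 * is 300 * 2 / 6 = 100ms.  An ASYNC workload is additionally clamped by
 * the system-wide async share and scaled by the async/sync slice ratio,
 * a SYNC workload gets at least 2 * cfq_slice_idle, and everything is
 * floored at CFQ_MIN_TT.
 */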
2838
2839static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2840{
2841	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2842	struct cfq_group *cfqg;
2843
2844	if (RB_EMPTY_ROOT(&st->rb))
2845		return NULL;
2846	cfqg = cfq_rb_first_group(st);
2847	update_min_vdisktime(st);
2848	return cfqg;
2849}
2850
2851static void cfq_choose_cfqg(struct cfq_data *cfqd)
2852{
2853	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2854
2855	cfqd->serving_group = cfqg;
2856
2857	/* Restore the workload type data */
2858	if (cfqg->saved_wl_slice) {
2859		cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
2860		cfqd->serving_wl_type = cfqg->saved_wl_type;
2861		cfqd->serving_wl_class = cfqg->saved_wl_class;
2862	} else
2863		cfqd->workload_expires = jiffies - 1;
2864
2865	choose_wl_class_and_type(cfqd, cfqg);
2866}
2867
2868/*
2869 * Select a queue for service. If we have a current active queue,
2870 * check whether to continue servicing it, or retrieve and set a new one.
2871 */
2872static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2873{
2874	struct cfq_queue *cfqq, *new_cfqq = NULL;
2875
2876	cfqq = cfqd->active_queue;
2877	if (!cfqq)
2878		goto new_queue;
2879
2880	if (!cfqd->rq_queued)
2881		return NULL;
2882
2883	/*
2884	 * We were waiting for group to get backlogged. Expire the queue
2885	 */
2886	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2887		goto expire;
2888
2889	/*
2890	 * The active queue has run out of time, expire it and select new.
2891	 */
2892	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2893		/*
2894		 * If the slice had not expired at the completion of the last request
2895		 * we might not have turned on the wait_busy flag. Don't expire
2896		 * the queue yet. Allow the group to get backlogged.
2897		 *
2898		 * The very fact that we have used up the slice means we
2899		 * have been idling all along on this queue and it should be
2900		 * ok to wait for this request to complete.
2901		 */
2902		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2903		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2904			cfqq = NULL;
2905			goto keep_queue;
2906		} else
2907			goto check_group_idle;
2908	}
2909
2910	/*
2911	 * The active queue has requests and isn't expired, allow it to
2912	 * dispatch.
2913	 */
2914	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2915		goto keep_queue;
2916
2917	/*
2918	 * If another queue has a request waiting within our mean seek
2919	 * distance, let it run.  The expire code will check for close
2920	 * cooperators and put the close queue at the front of the service
2921	 * tree.  If possible, merge the expiring queue with the new cfqq.
2922	 */
2923	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2924	if (new_cfqq) {
2925		if (!cfqq->new_cfqq)
2926			cfq_setup_merge(cfqq, new_cfqq);
2927		goto expire;
2928	}
2929
2930	/*
2931	 * No requests pending. If the active queue still has requests in
2932	 * flight or is idling for a new request, allow either of these
2933	 * conditions to happen (or time out) before selecting a new queue.
2934	 */
2935	if (timer_pending(&cfqd->idle_slice_timer)) {
2936		cfqq = NULL;
2937		goto keep_queue;
2938	}
2939
2940	/*
2941	 * This is a deep seek queue, but the device is much faster than
2942	 * the queue can deliver; don't idle.
2943	 */
2944	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2945	    (cfq_cfqq_slice_new(cfqq) ||
2946	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2947		cfq_clear_cfqq_deep(cfqq);
2948		cfq_clear_cfqq_idle_window(cfqq);
2949	}
2950
2951	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2952		cfqq = NULL;
2953		goto keep_queue;
2954	}
2955
2956	/*
2957	 * If group idle is enabled and there are requests dispatched from
2958	 * this group, wait for requests to complete.
2959	 */
2960check_group_idle:
2961	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
2962	    cfqq->cfqg->dispatched &&
2963	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
2964		cfqq = NULL;
2965		goto keep_queue;
2966	}
2967
2968expire:
2969	cfq_slice_expired(cfqd, 0);
2970new_queue:
2971	/*
2972	 * Current queue expired. Check if we have to switch to a new
2973	 * service tree
2974	 */
2975	if (!new_cfqq)
2976		cfq_choose_cfqg(cfqd);
2977
2978	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2979keep_queue:
2980	return cfqq;
2981}
2982
2983static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2984{
2985	int dispatched = 0;
2986
2987	while (cfqq->next_rq) {
2988		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2989		dispatched++;
2990	}
2991
2992	BUG_ON(!list_empty(&cfqq->fifo));
2993
2994	/* By default cfqq is not expired if it is empty. Do it explicitly */
2995	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2996	return dispatched;
2997}
2998
2999/*
3000 * Drain our current requests. Used for barriers and when switching
3001 * io schedulers on-the-fly.
3002 */
3003static int cfq_forced_dispatch(struct cfq_data *cfqd)
3004{
3005	struct cfq_queue *cfqq;
3006	int dispatched = 0;
3007
3008	/* Expire the timeslice of the current active queue first */
3009	cfq_slice_expired(cfqd, 0);
3010	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3011		__cfq_set_active_queue(cfqd, cfqq);
3012		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3013	}
3014
3015	BUG_ON(cfqd->busy_queues);
3016
3017	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3018	return dispatched;
3019}
3020
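/*
 * Estimate whether cfqq's remaining slice will already be eaten up by the
 * requests it has dispatched, assuming each of them needs roughly
 * cfq_slice_idle worth of time to complete.
 */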
3021static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3022	struct cfq_queue *cfqq)
3023{
3024	/* the queue hasn't finished any request, can't estimate */
3025	if (cfq_cfqq_slice_new(cfqq))
3026		return true;
3027	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
3028		cfqq->slice_end))
3029		return true;
3030
3031	return false;
3032}
3033
3034static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3035{
3036	unsigned int max_dispatch;
3037
3038	/*
3039	 * Drain async requests before we start sync IO
3040	 */
3041	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3042		return false;
3043
3044	/*
3045	 * If this is an async queue and we have sync IO in flight, let it wait
3046	 */
3047	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3048		return false;
3049
3050	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3051	if (cfq_class_idle(cfqq))
3052		max_dispatch = 1;
3053
3054	/*
3055	 * Does this cfqq already have too much IO in flight?
3056	 */
3057	if (cfqq->dispatched >= max_dispatch) {
3058		bool promote_sync = false;
3059		/*
3060		 * idle queue must always only have a single IO in flight
3061		 */
3062		if (cfq_class_idle(cfqq))
3063			return false;
3064
3065		/*
3066		 * If there is only one sync queue we can ignore the async
3067		 * queues here and give the sync queue no dispatch limit,
3068		 * because a sync queue can preempt an async queue anyway, so
3069		 * limiting the sync queue doesn't make sense. This is useful
3070		 * for the aiostress test.
3071		 */
3072		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3073			promote_sync = true;
3074
3075		/*
3076		 * We have other queues, don't allow more IO from this one
3077		 */
3078		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3079				!promote_sync)
3080			return false;
3081
3082		/*
3083		 * Sole queue user, no limit
3084		 */
3085		if (cfqd->busy_queues == 1 || promote_sync)
3086			max_dispatch = -1;
3087		else
3088			/*
3089			 * Normally we start throttling cfqq when cfq_quantum/2
3090			 * requests have been dispatched. But we can drive
3091			 * deeper queue depths at the beginning of the slice,
3092			 * subject to the upper limit of cfq_quantum.
3093			 */
3094			max_dispatch = cfqd->cfq_quantum;
3095	}
3096
3097	/*
3098	 * Async queues must wait a bit before being allowed dispatch.
3099	 * We also ramp up the dispatch depth gradually for async IO,
3100	 * based on the last sync IO we serviced
3101	 */
3102	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3103		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
3104		unsigned int depth;
3105
3106		depth = last_sync / cfqd->cfq_slice[1];
3107		if (!depth && !cfqq->dispatched)
3108			depth = 1;
3109		if (depth < max_dispatch)
3110			max_dispatch = depth;
3111	}
3112
3113	/*
3114	 * If we're below the current max, allow a dispatch
3115	 */
3116	return cfqq->dispatched < max_dispatch;
3117}
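
/*
 * Async depth ramp example for cfq_may_dispatch() above (with cfq_latency
 * set): the allowed async depth is (jiffies - last_delayed_sync) /
 * cfq_slice[1], so if the last delayed sync request completed two
 * sync-slice periods ago, up to two async requests may be in flight;
 * right after such a sync completion the depth collapses to at most one
 * outstanding async request.
 */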
3118
3119/*
3120 * Dispatch a request from cfqq, moving it to the request queue
3121 * dispatch list.
3122 */
3123static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3124{
3125	struct request *rq;
3126
3127	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3128
3129	if (!cfq_may_dispatch(cfqd, cfqq))
3130		return false;
3131
3132	/*
3133	 * follow the expired path, else take the next best request
3134	 */
3135	rq = cfq_check_fifo(cfqq);
3136	if (!rq)
3137		rq = cfqq->next_rq;
3138
3139	/*
3140	 * insert request into driver dispatch list
3141	 */
3142	cfq_dispatch_insert(cfqd->queue, rq);
3143
3144	if (!cfqd->active_cic) {
3145		struct cfq_io_cq *cic = RQ_CIC(rq);
3146
3147		atomic_long_inc(&cic->icq.ioc->refcount);
3148		cfqd->active_cic = cic;
3149	}
3150
3151	return true;
3152}
3153
3154/*
3155 * Find the cfqq that we need to service and move a request from that to the
3156 * dispatch list
3157 */
3158static int cfq_dispatch_requests(struct request_queue *q, int force)
3159{
3160	struct cfq_data *cfqd = q->elevator->elevator_data;
3161	struct cfq_queue *cfqq;
3162
3163	if (!cfqd->busy_queues)
3164		return 0;
3165
3166	if (unlikely(force))
3167		return cfq_forced_dispatch(cfqd);
3168
3169	cfqq = cfq_select_queue(cfqd);
3170	if (!cfqq)
3171		return 0;
3172
3173	/*
3174	 * Dispatch a request from this cfqq, if it is allowed
3175	 */
3176	if (!cfq_dispatch_request(cfqd, cfqq))
3177		return 0;
3178
3179	cfqq->slice_dispatch++;
3180	cfq_clear_cfqq_must_dispatch(cfqq);
3181
3182	/*
3183	 * Expire an async queue immediately if it has used up its slice. An idle
3184	 * queue always expires after 1 dispatch round.
3185	 */
3186	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3187	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3188	    cfq_class_idle(cfqq))) {
3189		cfqq->slice_end = jiffies + 1;
3190		cfq_slice_expired(cfqd, 0);
3191	}
3192
3193	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3194	return 1;
3195}
3196
3197/*
3198 * Each task holds one reference to the queue, dropped when the task exits.
3199 * Each rq in-flight on this queue also holds a reference, dropped when the rq is freed.
3200 *
3201 * Each cfq queue took a reference on the parent group. Drop it now.
3202 * queue lock must be held here.
3203 */
3204static void cfq_put_queue(struct cfq_queue *cfqq)
3205{
3206	struct cfq_data *cfqd = cfqq->cfqd;
3207	struct cfq_group *cfqg;
3208
3209	BUG_ON(cfqq->ref <= 0);
3210
3211	cfqq->ref--;
3212	if (cfqq->ref)
3213		return;
3214
3215	cfq_log_cfqq(cfqd, cfqq, "put_queue");
3216	BUG_ON(rb_first(&cfqq->sort_list));
3217	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3218	cfqg = cfqq->cfqg;
3219
3220	if (unlikely(cfqd->active_queue == cfqq)) {
3221		__cfq_slice_expired(cfqd, cfqq, 0);
3222		cfq_schedule_dispatch(cfqd);
3223	}
3224
3225	BUG_ON(cfq_cfqq_on_rr(cfqq));
3226	kmem_cache_free(cfq_pool, cfqq);
3227	cfqg_put(cfqg);
3228}
3229
3230static void cfq_put_cooperator(struct cfq_queue *cfqq)
3231{
3232	struct cfq_queue *__cfqq, *next;
3233
3234	/*
3235	 * If this queue was scheduled to merge with another queue, be
3236	 * sure to drop the reference taken on that queue (and others in
3237	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3238	 */
3239	__cfqq = cfqq->new_cfqq;
3240	while (__cfqq) {
3241		if (__cfqq == cfqq) {
3242			WARN(1, "cfqq->new_cfqq loop detected\n");
3243			break;
3244		}
3245		next = __cfqq->new_cfqq;
3246		cfq_put_queue(__cfqq);
3247		__cfqq = next;
3248	}
3249}
3250
3251static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3252{
3253	if (unlikely(cfqq == cfqd->active_queue)) {
3254		__cfq_slice_expired(cfqd, cfqq, 0);
3255		cfq_schedule_dispatch(cfqd);
3256	}
3257
3258	cfq_put_cooperator(cfqq);
3259
3260	cfq_put_queue(cfqq);
3261}
3262
3263static void cfq_init_icq(struct io_cq *icq)
3264{
3265	struct cfq_io_cq *cic = icq_to_cic(icq);
3266
3267	cic->ttime.last_end_request = jiffies;
3268}
3269
3270static void cfq_exit_icq(struct io_cq *icq)
3271{
3272	struct cfq_io_cq *cic = icq_to_cic(icq);
3273	struct cfq_data *cfqd = cic_to_cfqd(cic);
3274
3275	if (cic->cfqq[BLK_RW_ASYNC]) {
3276		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
3277		cic->cfqq[BLK_RW_ASYNC] = NULL;
3278	}
3279
3280	if (cic->cfqq[BLK_RW_SYNC]) {
3281		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
3282		cic->cfqq[BLK_RW_SYNC] = NULL;
3283	}
3284}
3285
3286static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3287{
3288	struct task_struct *tsk = current;
3289	int ioprio_class;
3290
3291	if (!cfq_cfqq_prio_changed(cfqq))
3292		return;
3293
3294	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3295	switch (ioprio_class) {
3296	default:
3297		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
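		/* fall through */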
3298	case IOPRIO_CLASS_NONE:
3299		/*
3300		 * no prio set, inherit CPU scheduling settings
3301		 */
3302		cfqq->ioprio = task_nice_ioprio(tsk);
3303		cfqq->ioprio_class = task_nice_ioclass(tsk);
3304		break;
3305	case IOPRIO_CLASS_RT:
3306		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3307		cfqq->ioprio_class = IOPRIO_CLASS_RT;
3308		break;
3309	case IOPRIO_CLASS_BE:
3310		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3311		cfqq->ioprio_class = IOPRIO_CLASS_BE;
3312		break;
3313	case IOPRIO_CLASS_IDLE:
3314		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3315		cfqq->ioprio = 7;
3316		cfq_clear_cfqq_idle_window(cfqq);
3317		break;
3318	}
3319
3320	/*
3321	 * keep track of original prio settings in case we have to temporarily
3322	 * elevate the priority of this queue
3323	 */
3324	cfqq->org_ioprio = cfqq->ioprio;
3325	cfq_clear_cfqq_prio_changed(cfqq);
3326}
3327
3328static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3329{
3330	int ioprio = cic->icq.ioc->ioprio;
3331	struct cfq_data *cfqd = cic_to_cfqd(cic);
3332	struct cfq_queue *cfqq;
3333
3334	/*
3335	 * Check whether ioprio has changed.  The condition may trigger
3336	 * spuriously on a newly created cic but there's no harm.
3337	 */
3338	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3339		return;
3340
3341	cfqq = cic->cfqq[BLK_RW_ASYNC];
3342	if (cfqq) {
3343		struct cfq_queue *new_cfqq;
3344		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
3345					 GFP_ATOMIC);
3346		if (new_cfqq) {
3347			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3348			cfq_put_queue(cfqq);
3349		}
3350	}
3351
3352	cfqq = cic->cfqq[BLK_RW_SYNC];
3353	if (cfqq)
3354		cfq_mark_cfqq_prio_changed(cfqq);
3355
3356	cic->ioprio = ioprio;
3357}
3358
3359static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3360			  pid_t pid, bool is_sync)
3361{
3362	RB_CLEAR_NODE(&cfqq->rb_node);
3363	RB_CLEAR_NODE(&cfqq->p_node);
3364	INIT_LIST_HEAD(&cfqq->fifo);
3365
3366	cfqq->ref = 0;
3367	cfqq->cfqd = cfqd;
3368
3369	cfq_mark_cfqq_prio_changed(cfqq);
3370
3371	if (is_sync) {
3372		if (!cfq_class_idle(cfqq))
3373			cfq_mark_cfqq_idle_window(cfqq);
3374		cfq_mark_cfqq_sync(cfqq);
3375	}
3376	cfqq->pid = pid;
3377}
3378
3379#ifdef CONFIG_CFQ_GROUP_IOSCHED
3380static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3381{
3382	struct cfq_data *cfqd = cic_to_cfqd(cic);
3383	struct cfq_queue *sync_cfqq;
3384	uint64_t id;
3385
3386	rcu_read_lock();
3387	id = bio_blkcg(bio)->id;
3388	rcu_read_unlock();
3389
3390	/*
3391	 * Check whether blkcg has changed.  The condition may trigger
3392	 * spuriously on a newly created cic but there's no harm.
3393	 */
3394	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3395		return;
3396
3397	sync_cfqq = cic_to_cfqq(cic, 1);
3398	if (sync_cfqq) {
3399		/*
3400		 * Drop reference to sync queue. A new sync queue will be
3401		 * assigned in new group upon arrival of a fresh request.
3402		 */
3403		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
3404		cic_set_cfqq(cic, NULL, 1);
3405		cfq_put_queue(sync_cfqq);
3406	}
3407
3408	cic->blkcg_id = id;
3409}
3410#else
3411static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3412#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3413
3414static struct cfq_queue *
3415cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3416		     struct bio *bio, gfp_t gfp_mask)
3417{
3418	struct blkcg *blkcg;
3419	struct cfq_queue *cfqq, *new_cfqq = NULL;
3420	struct cfq_group *cfqg;
3421
3422retry:
3423	rcu_read_lock();
3424
3425	blkcg = bio_blkcg(bio);
3426	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3427	cfqq = cic_to_cfqq(cic, is_sync);
3428
3429	/*
3430	 * Always try a new alloc if we fell back to the OOM cfqq
3431	 * originally, since it should just be a temporary situation.
3432	 */
3433	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3434		cfqq = NULL;
3435		if (new_cfqq) {
3436			cfqq = new_cfqq;
3437			new_cfqq = NULL;
3438		} else if (gfp_mask & __GFP_WAIT) {
3439			rcu_read_unlock();
3440			spin_unlock_irq(cfqd->queue->queue_lock);
3441			new_cfqq = kmem_cache_alloc_node(cfq_pool,
3442					gfp_mask | __GFP_ZERO,
3443					cfqd->queue->node);
3444			spin_lock_irq(cfqd->queue->queue_lock);
3445			if (new_cfqq)
3446				goto retry;
3447		} else {
3448			cfqq = kmem_cache_alloc_node(cfq_pool,
3449					gfp_mask | __GFP_ZERO,
3450					cfqd->queue->node);
3451		}
3452
3453		if (cfqq) {
3454			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3455			cfq_init_prio_data(cfqq, cic);
3456			cfq_link_cfqq_cfqg(cfqq, cfqg);
3457			cfq_log_cfqq(cfqd, cfqq, "alloced");
3458		} else
3459			cfqq = &cfqd->oom_cfqq;
3460	}
3461
3462	if (new_cfqq)
3463		kmem_cache_free(cfq_pool, new_cfqq);
3464
3465	rcu_read_unlock();
3466	return cfqq;
3467}
3468
3469static struct cfq_queue **
3470cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3471{
3472	switch (ioprio_class) {
3473	case IOPRIO_CLASS_RT:
3474		return &cfqd->async_cfqq[0][ioprio];
3475	case IOPRIO_CLASS_NONE:
3476		ioprio = IOPRIO_NORM;
3477		/* fall through */
3478	case IOPRIO_CLASS_BE:
3479		return &cfqd->async_cfqq[1][ioprio];
3480	case IOPRIO_CLASS_IDLE:
3481		return &cfqd->async_idle_cfqq;
3482	default:
3483		BUG();
3484	}
3485}
3486
3487static struct cfq_queue *
3488cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3489	      struct bio *bio, gfp_t gfp_mask)
3490{
3491	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3492	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3493	struct cfq_queue **async_cfqq = NULL;
3494	struct cfq_queue *cfqq = NULL;
3495
3496	if (!is_sync) {
3497		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
3498		cfqq = *async_cfqq;
3499	}
3500
3501	if (!cfqq)
3502		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
3503
3504	/*
3505	 * pin the queue now that it's allocated, scheduler exit will prune it
3506	 */
3507	if (!is_sync && !(*async_cfqq)) {
3508		cfqq->ref++;
3509		*async_cfqq = cfqq;
3510	}
3511
3512	cfqq->ref++;
3513	return cfqq;
3514}
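
/*
 * Reference counting note for cfq_get_queue() above: the caller always
 * gets one reference for the request being set up; the first user of a
 * shared async queue additionally pins it in the per-priority async
 * array, and that reference is only dropped when the scheduler exits.
 */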
3515
3516static void
3517__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3518{
3519	unsigned long elapsed = jiffies - ttime->last_end_request;
3520	elapsed = min(elapsed, 2UL * slice_idle);
3521
3522	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3523	ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
3524	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
3525}
3526
3527static void
3528cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3529			struct cfq_io_cq *cic)
3530{
3531	if (cfq_cfqq_sync(cfqq)) {
3532		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3533		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3534			cfqd->cfq_slice_idle);
3535	}
3536#ifdef CONFIG_CFQ_GROUP_IOSCHED
3537	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3538#endif
3539}
3540
3541static void
3542cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3543		       struct request *rq)
3544{
3545	sector_t sdist = 0;
3546	sector_t n_sec = blk_rq_sectors(rq);
3547	if (cfqq->last_request_pos) {
3548		if (cfqq->last_request_pos < blk_rq_pos(rq))
3549			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3550		else
3551			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3552	}
3553
3554	cfqq->seek_history <<= 1;
3555	if (blk_queue_nonrot(cfqd->queue))
3556		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3557	else
3558		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3559}
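
/*
 * seek_history above acts as a 32-bit shift register: every new request
 * shifts in one bit, set when the seek distance from the previous request
 * exceeds CFQQ_SEEK_THR (or, on non-rotational devices, when the request
 * itself is smaller than CFQQ_SECT_THR_NONROT).  CFQQ_SEEKY() then treats
 * the queue as seeky once enough of those recent bits are set.
 */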
3560
3561/*
3562 * Disable idle window if the process thinks too long or seeks so much that
3563 * it doesn't matter
3564 */
3565static void
3566cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3567		       struct cfq_io_cq *cic)
3568{
3569	int old_idle, enable_idle;
3570
3571	/*
3572	 * Don't idle for async or idle io prio class
3573	 */
3574	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3575		return;
3576
3577	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3578
3579	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3580		cfq_mark_cfqq_deep(cfqq);
3581
3582	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3583		enable_idle = 0;
3584	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3585		 !cfqd->cfq_slice_idle ||
3586		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3587		enable_idle = 0;
3588	else if (sample_valid(cic->ttime.ttime_samples)) {
3589		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3590			enable_idle = 0;
3591		else
3592			enable_idle = 1;
3593	}
3594
3595	if (old_idle != enable_idle) {
3596		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3597		if (enable_idle)
3598			cfq_mark_cfqq_idle_window(cfqq);
3599		else
3600			cfq_clear_cfqq_idle_window(cfqq);
3601	}
3602}
3603
3604/*
3605 * Check if new_cfqq should preempt the currently active queue. Returns false
3606 * for no (or if we aren't sure); returning true will cause a preempt.
3607 */
3608static bool
3609cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3610		   struct request *rq)
3611{
3612	struct cfq_queue *cfqq;
3613
3614	cfqq = cfqd->active_queue;
3615	if (!cfqq)
3616		return false;
3617
3618	if (cfq_class_idle(new_cfqq))
3619		return false;
3620
3621	if (cfq_class_idle(cfqq))
3622		return true;
3623
3624	/*
3625	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3626	 */
3627	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3628		return false;
3629
3630	/*
3631	 * if the new request is sync, but the currently running queue is
3632	 * not, let the sync request have priority.
3633	 */
3634	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3635		return true;
3636
3637	if (new_cfqq->cfqg != cfqq->cfqg)
3638		return false;
3639
3640	if (cfq_slice_used(cfqq))
3641		return true;
3642
3643	/* Allow preemption only if we are idling on sync-noidle tree */
3644	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3645	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3646	    new_cfqq->service_tree->count == 2 &&
3647	    RB_EMPTY_ROOT(&cfqq->sort_list))
3648		return true;
3649
3650	/*
3651	 * So both queues are sync. Let the new request get disk time if
3652	 * it's a metadata request and the current queue is doing regular IO.
3653	 */
3654	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3655		return true;
3656
3657	/*
3658	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3659	 */
3660	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3661		return true;
3662
3663	/* The active queue is empty but should not be idled on; allow preempt */
3664	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3665		return true;
3666
3667	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3668		return false;
3669
3670	/*
3671	 * if this request is as-good as one we would expect from the
3672	 * current cfqq, let it preempt
3673	 */
3674	if (cfq_rq_close(cfqd, cfqq, rq))
3675		return true;
3676
3677	return false;
3678}
3679
3680/*
3681 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3682 * let it have half of its nominal slice.
3683 */
3684static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3685{
3686	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3687
3688	cfq_log_cfqq(cfqd, cfqq, "preempt");
3689	cfq_slice_expired(cfqd, 1);
3690
3691	/*
3692	 * The workload type changed; don't save the slice, otherwise the
3693	 * preempt won't take effect
3694	 */
3695	if (old_type != cfqq_type(cfqq))
3696		cfqq->cfqg->saved_wl_slice = 0;
3697
3698	/*
3699	 * Put the new queue at the front of the current list,
3700	 * so we know that it will be selected next.
3701	 */
3702	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3703
3704	cfq_service_tree_add(cfqd, cfqq, 1);
3705
3706	cfqq->slice_end = 0;
3707	cfq_mark_cfqq_slice_new(cfqq);
3708}
3709
3710/*
3711 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3712 * something we should do about it
3713 */
3714static void
3715cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3716		struct request *rq)
3717{
3718	struct cfq_io_cq *cic = RQ_CIC(rq);
3719
3720	cfqd->rq_queued++;
3721	if (rq->cmd_flags & REQ_PRIO)
3722		cfqq->prio_pending++;
3723
3724	cfq_update_io_thinktime(cfqd, cfqq, cic);
3725	cfq_update_io_seektime(cfqd, cfqq, rq);
3726	cfq_update_idle_window(cfqd, cfqq, cic);
3727
3728	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3729
3730	if (cfqq == cfqd->active_queue) {
3731		/*
3732		 * Remember that we saw a request from this process, but
3733		 * don't start queuing just yet. Otherwise we risk seeing lots
3734		 * of tiny requests, because we disrupt the normal plugging
3735		 * and merging. If the request is already larger than a single
3736		 * page, let it rip immediately. For that case we assume that
3737		 * merging is already done. Ditto for a busy system that
3738		 * has other work pending, don't risk delaying until the
3739		 * idle timer unplugs to continue working.
3740		 */
3741		if (cfq_cfqq_wait_request(cfqq)) {
3742			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3743			    cfqd->busy_queues > 1) {
3744				cfq_del_timer(cfqd, cfqq);
3745				cfq_clear_cfqq_wait_request(cfqq);
3746				__blk_run_queue(cfqd->queue);
3747			} else {
3748				cfqg_stats_update_idle_time(cfqq->cfqg);
3749				cfq_mark_cfqq_must_dispatch(cfqq);
3750			}
3751		}
3752	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3753		/*
3754		 * not the active queue - expire current slice if it is
3755		 * idle and has expired its mean thinktime or this new queue
3756		 * has some old slice time left and is of higher priority or
3757		 * this new queue is RT and the current one is BE
3758		 */
3759		cfq_preempt_queue(cfqd, cfqq);
3760		__blk_run_queue(cfqd->queue);
3761	}
3762}
3763
3764static void cfq_insert_request(struct request_queue *q, struct request *rq)
3765{
3766	struct cfq_data *cfqd = q->elevator->elevator_data;
3767	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3768
3769	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3770	cfq_init_prio_data(cfqq, RQ_CIC(rq));
3771
3772	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3773	list_add_tail(&rq->queuelist, &cfqq->fifo);
3774	cfq_add_rq_rb(rq);
3775	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
3776				 rq->cmd_flags);
3777	cfq_rq_enqueued(cfqd, cfqq, rq);
3778}
3779
3780/*
3781 * Update hw_tag based on peak queue depth over 50 samples under
3782 * sufficient load.
3783 */
3784static void cfq_update_hw_tag(struct cfq_data *cfqd)
3785{
3786	struct cfq_queue *cfqq = cfqd->active_queue;
3787
3788	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3789		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3790
3791	if (cfqd->hw_tag == 1)
3792		return;
3793
3794	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3795	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3796		return;
3797
3798	/*
3799	 * If the active queue doesn't have enough requests and can idle, cfq might
3800	 * not dispatch sufficient requests to the hardware. Don't zero hw_tag in
3801	 * this case
3802	 */
3803	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3804	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3805	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3806		return;
3807
3808	if (cfqd->hw_tag_samples++ < 50)
3809		return;
3810
3811	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3812		cfqd->hw_tag = 1;
3813	else
3814		cfqd->hw_tag = 0;
3815}
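/*
 * Editorial note (illustrative): hw_tag starts out as -1 ("unknown", see
 * cfq_init_queue()) and is only decided after 50 samples taken under
 * sufficient load; once it has been set to 1 it is never cleared again
 * (the early return above). A device that is driven to a queue depth of at
 * least CFQ_HW_QUEUE_MIN at some point is treated as doing its own command
 * queueing (hw_tag = 1), otherwise hw_tag ends up 0.
 */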
3816
3817static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3818{
3819	struct cfq_io_cq *cic = cfqd->active_cic;
3820
3821	/* If the queue already has requests, don't wait */
3822	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3823		return false;
3824
3825	/* If there are other queues in the group, don't wait */
3826	if (cfqq->cfqg->nr_cfqq > 1)
3827		return false;
3828
3829	/* the only queue in the group, but think time is big */
3830	/* the only queue in the group, but its think time is big, so don't wait */
3831		return false;
3832
3833	if (cfq_slice_used(cfqq))
3834		return true;
3835
3836	/* if slice left is less than think time, wait busy */
3837	if (cic && sample_valid(cic->ttime.ttime_samples)
3838	    && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3839		return true;
3840
3841	/*
3842	 * If the think time is less than a jiffy then ttime_mean=0 and the
3843	 * check above will not be true. It might happen that the slice has
3844	 * not expired yet but will expire soon (4-5 ns) during select_queue().
3845	 * To cover the case where the think time is less than a jiffy, mark
3846	 * the queue wait busy if only 1 jiffy is left in the slice.
3847	 */
3848	if (cfqq->slice_end - jiffies == 1)
3849		return true;
3850
3851	return false;
3852}
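/*
 * Editorial note (illustrative walk-through of the checks above): a sync
 * queue that just emptied, is alone in its group, and whose group think
 * time is not big gets "wait busy" treatment either when its slice is
 * already used up, when the remaining slice is shorter than the task's mean
 * think time (e.g. 2 jiffies left vs. a 5-jiffy ttime_mean), or when exactly
 * one jiffy of slice remains; in all other cases the slice is given up
 * normally.
 */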
3853
3854static void cfq_completed_request(struct request_queue *q, struct request *rq)
3855{
3856	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3857	struct cfq_data *cfqd = cfqq->cfqd;
3858	const int sync = rq_is_sync(rq);
3859	unsigned long now;
3860
3861	now = jiffies;
3862	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3863		     !!(rq->cmd_flags & REQ_NOIDLE));
3864
3865	cfq_update_hw_tag(cfqd);
3866
3867	WARN_ON(!cfqd->rq_in_driver);
3868	WARN_ON(!cfqq->dispatched);
3869	cfqd->rq_in_driver--;
3870	cfqq->dispatched--;
3871	(RQ_CFQG(rq))->dispatched--;
3872	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
3873				     rq_io_start_time_ns(rq), rq->cmd_flags);
3874
3875	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3876
3877	if (sync) {
3878		struct cfq_rb_root *st;
3879
3880		RQ_CIC(rq)->ttime.last_end_request = now;
3881
3882		if (cfq_cfqq_on_rr(cfqq))
3883			st = cfqq->service_tree;
3884		else
3885			st = st_for(cfqq->cfqg, cfqq_class(cfqq),
3886					cfqq_type(cfqq));
3887
3888		st->ttime.last_end_request = now;
3889		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3890			cfqd->last_delayed_sync = now;
3891	}
3892
3893#ifdef CONFIG_CFQ_GROUP_IOSCHED
3894	cfqq->cfqg->ttime.last_end_request = now;
3895#endif
3896
3897	/*
3898	 * If this is the active queue, check if it needs to be expired,
3899	 * or if we want to idle in case it has no pending requests.
3900	 */
3901	if (cfqd->active_queue == cfqq) {
3902		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3903
3904		if (cfq_cfqq_slice_new(cfqq)) {
3905			cfq_set_prio_slice(cfqd, cfqq);
3906			cfq_clear_cfqq_slice_new(cfqq);
3907		}
3908
3909		/*
3910		 * Should we wait for the next request to come in before we
3911		 * expire the queue?
3912		 */
3913		if (cfq_should_wait_busy(cfqd, cfqq)) {
3914			unsigned long extend_sl = cfqd->cfq_slice_idle;
3915			if (!cfqd->cfq_slice_idle)
3916				extend_sl = cfqd->cfq_group_idle;
3917			cfqq->slice_end = jiffies + extend_sl;
3918			cfq_mark_cfqq_wait_busy(cfqq);
3919			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3920		}
3921
3922		/*
3923		 * Idling is not enabled on:
3924		 * - expired queues
3925		 * - idle-priority queues
3926		 * - async queues
3927		 * - queues with still some requests queued
3928		 * - when there is a close cooperator
3929		 */
3930		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3931			cfq_slice_expired(cfqd, 1);
3932		else if (sync && cfqq_empty &&
3933			 !cfq_close_cooperator(cfqd, cfqq)) {
3934			cfq_arm_slice_timer(cfqd);
3935		}
3936	}
3937
3938	if (!cfqd->rq_in_driver)
3939		cfq_schedule_dispatch(cfqd);
3940}
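/*
 * Editorial note: when cfq_should_wait_busy() succeeds above, the slice end
 * is pushed out by cfq_slice_idle jiffies (or by cfq_group_idle when slice
 * idling is disabled) and the queue is flagged wait_busy; the apparent
 * intent is to give the last queue in its group a chance to issue another
 * request before the group is considered done with its share.
 */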
3941
3942static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3943{
3944	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3945		cfq_mark_cfqq_must_alloc_slice(cfqq);
3946		return ELV_MQUEUE_MUST;
3947	}
3948
3949	return ELV_MQUEUE_MAY;
3950}
3951
3952static int cfq_may_queue(struct request_queue *q, int rw)
3953{
3954	struct cfq_data *cfqd = q->elevator->elevator_data;
3955	struct task_struct *tsk = current;
3956	struct cfq_io_cq *cic;
3957	struct cfq_queue *cfqq;
3958
3959	/*
3960	 * Don't force setup of a queue from here, as a call to may_queue
3961	 * does not necessarily imply that a request will actually be queued,
3962	 * so just look up a possibly existing queue, or return 'may queue'
3963	 * if that fails.
3964	 */
3965	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3966	if (!cic)
3967		return ELV_MQUEUE_MAY;
3968
3969	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3970	if (cfqq) {
3971		cfq_init_prio_data(cfqq, cic);
3972
3973		return __cfq_may_queue(cfqq);
3974	}
3975
3976	return ELV_MQUEUE_MAY;
3977}
3978
3979/*
3980 * queue lock held here
3981 */
3982static void cfq_put_request(struct request *rq)
3983{
3984	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3985
3986	if (cfqq) {
3987		const int rw = rq_data_dir(rq);
3988
3989		BUG_ON(!cfqq->allocated[rw]);
3990		cfqq->allocated[rw]--;
3991
3992		/* Put down rq reference on cfqg */
3993		cfqg_put(RQ_CFQG(rq));
3994		rq->elv.priv[0] = NULL;
3995		rq->elv.priv[1] = NULL;
3996
3997		cfq_put_queue(cfqq);
3998	}
3999}
4000
4001static struct cfq_queue *
4002cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4003		struct cfq_queue *cfqq)
4004{
4005	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4006	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4007	cfq_mark_cfqq_coop(cfqq->new_cfqq);
4008	cfq_put_queue(cfqq);
4009	return cic_to_cfqq(cic, 1);
4010}
4011
4012/*
4013 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4014 * was the last process referring to said cfqq.
4015 */
4016static struct cfq_queue *
4017split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4018{
4019	if (cfqq_process_refs(cfqq) == 1) {
4020		cfqq->pid = current->pid;
4021		cfq_clear_cfqq_coop(cfqq);
4022		cfq_clear_cfqq_split_coop(cfqq);
4023		return cfqq;
4024	}
4025
4026	cic_set_cfqq(cic, NULL, 1);
4027
4028	cfq_put_cooperator(cfqq);
4029
4030	cfq_put_queue(cfqq);
4031	return NULL;
4032}
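/*
 * Editorial note (illustrative): if more than one process still holds a
 * reference to this cfqq, the cic is detached from it here and the caller,
 * cfq_set_request(), falls back to allocating a fresh queue for the current
 * task; only when this task is the last user is the existing queue recycled
 * in place, with its pid updated and the coop/split_coop flags cleared.
 */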
4033/*
4034 * Allocate cfq data structures associated with this request.
4035 */
4036static int
4037cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4038		gfp_t gfp_mask)
4039{
4040	struct cfq_data *cfqd = q->elevator->elevator_data;
4041	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4042	const int rw = rq_data_dir(rq);
4043	const bool is_sync = rq_is_sync(rq);
4044	struct cfq_queue *cfqq;
4045
4046	might_sleep_if(gfp_mask & __GFP_WAIT);
4047
4048	spin_lock_irq(q->queue_lock);
4049
4050	check_ioprio_changed(cic, bio);
4051	check_blkcg_changed(cic, bio);
4052new_queue:
4053	cfqq = cic_to_cfqq(cic, is_sync);
4054	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4055		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
4056		cic_set_cfqq(cic, cfqq, is_sync);
4057	} else {
4058		/*
4059		 * If the queue was seeky for too long, break it apart.
4060		 */
4061		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4062			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4063			cfqq = split_cfqq(cic, cfqq);
4064			if (!cfqq)
4065				goto new_queue;
4066		}
4067
4068		/*
4069		 * Check to see if this queue is scheduled to merge with
4070		 * another, closely cooperating queue.  The merging of
4071		 * queues happens here as it must be done in process context.
4072		 * The reference on new_cfqq was taken in merge_cfqqs.
4073		 */
4074		if (cfqq->new_cfqq)
4075			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4076	}
4077
4078	cfqq->allocated[rw]++;
4079
4080	cfqq->ref++;
4081	cfqg_get(cfqq->cfqg);
4082	rq->elv.priv[0] = cfqq;
4083	rq->elv.priv[1] = cfqq->cfqg;
4084	spin_unlock_irq(q->queue_lock);
4085	return 0;
4086}
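/*
 * Editorial note: the two references taken just above - cfqq->ref and the
 * group reference stored in rq->elv.priv[1] - are the ones dropped again in
 * cfq_put_request() when the request is freed, so every in-flight request
 * pins both its cfq_queue and its cfq_group.
 */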
4087
4088static void cfq_kick_queue(struct work_struct *work)
4089{
4090	struct cfq_data *cfqd =
4091		container_of(work, struct cfq_data, unplug_work);
4092	struct request_queue *q = cfqd->queue;
4093
4094	spin_lock_irq(q->queue_lock);
4095	__blk_run_queue(cfqd->queue);
4096	spin_unlock_irq(q->queue_lock);
4097}
4098
4099/*
4100 * Timer running if the active_queue is currently idling inside its time slice
4101 */
4102static void cfq_idle_slice_timer(unsigned long data)
4103{
4104	struct cfq_data *cfqd = (struct cfq_data *) data;
4105	struct cfq_queue *cfqq;
4106	unsigned long flags;
4107	int timed_out = 1;
4108
4109	cfq_log(cfqd, "idle timer fired");
4110
4111	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4112
4113	cfqq = cfqd->active_queue;
4114	if (cfqq) {
4115		timed_out = 0;
4116
4117		/*
4118		 * We saw a request before the queue expired, let it through
4119		 */
4120		if (cfq_cfqq_must_dispatch(cfqq))
4121			goto out_kick;
4122
4123		/*
4124		 * expired
4125		 */
4126		if (cfq_slice_used(cfqq))
4127			goto expire;
4128
4129		/*
4130		 * only expire and reinvoke the request handler if there are
4131		 * other queues with pending requests
4132		 */
4133		if (!cfqd->busy_queues)
4134			goto out_cont;
4135
4136		/*
4137		 * not expired and it has a request pending, let it dispatch
4138		 */
4139		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4140			goto out_kick;
4141
4142		/*
4143		 * The queue depth flag is reset only when idling didn't succeed
4144		 */
4145		cfq_clear_cfqq_deep(cfqq);
4146	}
4147expire:
4148	cfq_slice_expired(cfqd, timed_out);
4149out_kick:
4150	cfq_schedule_dispatch(cfqd);
4151out_cont:
4152	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4153}
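/*
 * Editorial note: the labels above share a common tail - expire: ends the
 * current slice, out_kick: schedules the dispatch work, and out_cont: just
 * drops the queue lock - so each early goto simply picks how much of that
 * tail still has to run.
 */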
4154
4155static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4156{
4157	del_timer_sync(&cfqd->idle_slice_timer);
4158	cancel_work_sync(&cfqd->unplug_work);
4159}
4160
4161static void cfq_put_async_queues(struct cfq_data *cfqd)
4162{
4163	int i;
4164
4165	for (i = 0; i < IOPRIO_BE_NR; i++) {
4166		if (cfqd->async_cfqq[0][i])
4167			cfq_put_queue(cfqd->async_cfqq[0][i]);
4168		if (cfqd->async_cfqq[1][i])
4169			cfq_put_queue(cfqd->async_cfqq[1][i]);
4170	}
4171
4172	if (cfqd->async_idle_cfqq)
4173		cfq_put_queue(cfqd->async_idle_cfqq);
4174}
4175
4176static void cfq_exit_queue(struct elevator_queue *e)
4177{
4178	struct cfq_data *cfqd = e->elevator_data;
4179	struct request_queue *q = cfqd->queue;
4180
4181	cfq_shutdown_timer_wq(cfqd);
4182
4183	spin_lock_irq(q->queue_lock);
4184
4185	if (cfqd->active_queue)
4186		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4187
4188	cfq_put_async_queues(cfqd);
4189
4190	spin_unlock_irq(q->queue_lock);
4191
4192	cfq_shutdown_timer_wq(cfqd);
4193
4194#ifdef CONFIG_CFQ_GROUP_IOSCHED
4195	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4196#else
4197	kfree(cfqd->root_group);
4198#endif
4199	kfree(cfqd);
4200}
4201
4202static int cfq_init_queue(struct request_queue *q)
4203{
4204	struct cfq_data *cfqd;
4205	struct blkcg_gq *blkg __maybe_unused;
4206	int i, ret;
4207
4208	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4209	if (!cfqd)
4210		return -ENOMEM;
4211
4212	cfqd->queue = q;
4213	q->elevator->elevator_data = cfqd;
4214
4215	/* Init root service tree */
4216	cfqd->grp_service_tree = CFQ_RB_ROOT;
4217
4218	/* Init root group and prefer root group over other groups by default */
4219#ifdef CONFIG_CFQ_GROUP_IOSCHED
4220	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4221	if (ret)
4222		goto out_free;
4223
4224	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4225#else
4226	ret = -ENOMEM;
4227	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4228					GFP_KERNEL, cfqd->queue->node);
4229	if (!cfqd->root_group)
4230		goto out_free;
4231
4232	cfq_init_cfqg_base(cfqd->root_group);
4233#endif
4234	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4235	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
4236
4237	/*
4238	 * Not strictly needed (since RB_ROOT just clears the node and we
4239	 * zeroed cfqd on alloc), but better be safe in case someone decides
4240	 * to add magic to the rb code
4241	 */
4242	for (i = 0; i < CFQ_PRIO_LISTS; i++)
4243		cfqd->prio_trees[i] = RB_ROOT;
4244
4245	/*
4246	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4247	 * Grab a permanent reference to it, so that the normal code flow
4248	 * will not attempt to free it.  oom_cfqq is linked to root_group
4249	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
4250	 * the reference from linking right away.
4251	 */
4252	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4253	cfqd->oom_cfqq.ref++;
4254
4255	spin_lock_irq(q->queue_lock);
4256	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4257	cfqg_put(cfqd->root_group);
4258	spin_unlock_irq(q->queue_lock);
4259
4260	init_timer(&cfqd->idle_slice_timer);
4261	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4262	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
4263
4264	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4265
4266	cfqd->cfq_quantum = cfq_quantum;
4267	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4268	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4269	cfqd->cfq_back_max = cfq_back_max;
4270	cfqd->cfq_back_penalty = cfq_back_penalty;
4271	cfqd->cfq_slice[0] = cfq_slice_async;
4272	cfqd->cfq_slice[1] = cfq_slice_sync;
4273	cfqd->cfq_target_latency = cfq_target_latency;
4274	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4275	cfqd->cfq_slice_idle = cfq_slice_idle;
4276	cfqd->cfq_group_idle = cfq_group_idle;
4277	cfqd->cfq_latency = 1;
4278	cfqd->hw_tag = -1;
4279	/*
4280	 * We optimistically start by assuming sync ops weren't delayed in the
4281	 * last second, in order to have a larger depth for async operations.
4282	 */
4283	cfqd->last_delayed_sync = jiffies - HZ;
4284	return 0;
4285
4286out_free:
4287	kfree(cfqd);
4288	return ret;
4289}
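/*
 * Editorial note: a couple of the defaults above are sentinels rather than
 * real values - hw_tag = -1 means "not known yet" until cfq_update_hw_tag()
 * has gathered enough samples, and back-dating last_delayed_sync by one
 * second makes the scheduler start out assuming sync I/O has not been
 * delayed recently, as the comment above explains.
 */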
4290
4291/*
4292 * sysfs parts below -->
4293 */
4294static ssize_t
4295cfq_var_show(unsigned int var, char *page)
4296{
4297	return sprintf(page, "%d\n", var);
4298}
4299
4300static ssize_t
4301cfq_var_store(unsigned int *var, const char *page, size_t count)
4302{
4303	char *p = (char *) page;
4304
4305	*var = simple_strtoul(p, &p, 10);
4306	return count;
4307}
4308
4309#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
4310static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
4311{									\
4312	struct cfq_data *cfqd = e->elevator_data;			\
4313	unsigned int __data = __VAR;					\
4314	if (__CONV)							\
4315		__data = jiffies_to_msecs(__data);			\
4316	return cfq_var_show(__data, (page));				\
4317}
4318SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4319SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4320SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4321SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4322SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4323SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4324SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4325SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4326SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4327SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4328SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4329SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4330#undef SHOW_FUNCTION
4331
4332#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
4333static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
4334{									\
4335	struct cfq_data *cfqd = e->elevator_data;			\
4336	unsigned int __data;						\
4337	int ret = cfq_var_store(&__data, (page), count);		\
4338	if (__data < (MIN))						\
4339		__data = (MIN);						\
4340	else if (__data > (MAX))					\
4341		__data = (MAX);						\
4342	if (__CONV)							\
4343		*(__PTR) = msecs_to_jiffies(__data);			\
4344	else								\
4345		*(__PTR) = __data;					\
4346	return ret;							\
4347}
4348STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4349STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4350		UINT_MAX, 1);
4351STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4352		UINT_MAX, 1);
4353STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4354STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4355		UINT_MAX, 0);
4356STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4357STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4358STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4359STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4360STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4361		UINT_MAX, 0);
4362STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4363STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4364#undef STORE_FUNCTION
4365
4366#define CFQ_ATTR(name) \
4367	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4368
4369static struct elv_fs_entry cfq_attrs[] = {
4370	CFQ_ATTR(quantum),
4371	CFQ_ATTR(fifo_expire_sync),
4372	CFQ_ATTR(fifo_expire_async),
4373	CFQ_ATTR(back_seek_max),
4374	CFQ_ATTR(back_seek_penalty),
4375	CFQ_ATTR(slice_sync),
4376	CFQ_ATTR(slice_async),
4377	CFQ_ATTR(slice_async_rq),
4378	CFQ_ATTR(slice_idle),
4379	CFQ_ATTR(group_idle),
4380	CFQ_ATTR(low_latency),
4381	CFQ_ATTR(target_latency),
4382	__ATTR_NULL
4383};
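/*
 * Editorial note (usage sketch; the device name is an assumption): with CFQ
 * selected as the elevator, the attributes above appear under
 * /sys/block/<dev>/queue/iosched/, e.g.:
 *
 *	cat /sys/block/sda/queue/iosched/slice_idle
 *	echo 0 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Attributes declared with __CONV above are shown and stored in
 * milliseconds and converted to and from jiffies by the SHOW_FUNCTION and
 * STORE_FUNCTION macros; the others are taken verbatim.
 */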
4384
4385static struct elevator_type iosched_cfq = {
4386	.ops = {
4387		.elevator_merge_fn = 		cfq_merge,
4388		.elevator_merged_fn =		cfq_merged_request,
4389		.elevator_merge_req_fn =	cfq_merged_requests,
4390		.elevator_allow_merge_fn =	cfq_allow_merge,
4391		.elevator_bio_merged_fn =	cfq_bio_merged,
4392		.elevator_dispatch_fn =		cfq_dispatch_requests,
4393		.elevator_add_req_fn =		cfq_insert_request,
4394		.elevator_activate_req_fn =	cfq_activate_request,
4395		.elevator_deactivate_req_fn =	cfq_deactivate_request,
4396		.elevator_completed_req_fn =	cfq_completed_request,
4397		.elevator_former_req_fn =	elv_rb_former_request,
4398		.elevator_latter_req_fn =	elv_rb_latter_request,
4399		.elevator_init_icq_fn =		cfq_init_icq,
4400		.elevator_exit_icq_fn =		cfq_exit_icq,
4401		.elevator_set_req_fn =		cfq_set_request,
4402		.elevator_put_req_fn =		cfq_put_request,
4403		.elevator_may_queue_fn =	cfq_may_queue,
4404		.elevator_init_fn =		cfq_init_queue,
4405		.elevator_exit_fn =		cfq_exit_queue,
4406	},
4407	.icq_size	=	sizeof(struct cfq_io_cq),
4408	.icq_align	=	__alignof__(struct cfq_io_cq),
4409	.elevator_attrs =	cfq_attrs,
4410	.elevator_name	=	"cfq",
4411	.elevator_owner =	THIS_MODULE,
4412};
4413
4414#ifdef CONFIG_CFQ_GROUP_IOSCHED
4415static struct blkcg_policy blkcg_policy_cfq = {
4416	.pd_size		= sizeof(struct cfq_group),
4417	.cftypes		= cfq_blkcg_files,
4418
4419	.pd_init_fn		= cfq_pd_init,
4420	.pd_reset_stats_fn	= cfq_pd_reset_stats,
4421};
4422#endif
4423
4424static int __init cfq_init(void)
4425{
4426	int ret;
4427
4428	/*
4429	 * could be 0 on HZ < 1000 setups
4430	 */
4431	if (!cfq_slice_async)
4432		cfq_slice_async = 1;
4433	if (!cfq_slice_idle)
4434		cfq_slice_idle = 1;
4435
4436#ifdef CONFIG_CFQ_GROUP_IOSCHED
4437	if (!cfq_group_idle)
4438		cfq_group_idle = 1;
4439
4440	ret = blkcg_policy_register(&blkcg_policy_cfq);
4441	if (ret)
4442		return ret;
4443#else
4444	cfq_group_idle = 0;
4445#endif
4446
4447	ret = -ENOMEM;
4448	cfq_pool = KMEM_CACHE(cfq_queue, 0);
4449	if (!cfq_pool)
4450		goto err_pol_unreg;
4451
4452	ret = elv_register(&iosched_cfq);
4453	if (ret)
4454		goto err_free_pool;
4455
4456	return 0;
4457
4458err_free_pool:
4459	kmem_cache_destroy(cfq_pool);
4460err_pol_unreg:
4461#ifdef CONFIG_CFQ_GROUP_IOSCHED
4462	blkcg_policy_unregister(&blkcg_policy_cfq);
4463#endif
4464	return ret;
4465}
4466
4467static void __exit cfq_exit(void)
4468{
4469#ifdef CONFIG_CFQ_GROUP_IOSCHED
4470	blkcg_policy_unregister(&blkcg_policy_cfq);
4471#endif
4472	elv_unregister(&iosched_cfq);
4473	kmem_cache_destroy(cfq_pool);
4474}
4475
4476module_init(cfq_init);
4477module_exit(cfq_exit);
4478
4479MODULE_AUTHOR("Jens Axboe");
4480MODULE_LICENSE("GPL");
4481MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4482