cfq-iosched.c revision 3c96cb32d318f323c1bf972a4c66821f8499e34d
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/jiffies.h>
14#include <linux/rbtree.h>
15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h>
17#include "blk.h"
18#include "blk-cgroup.h"
19
20static struct blkio_policy_type blkio_policy_cfq __maybe_unused;
21
22/*
23 * tunables
24 */
25/* max queue in one round of service */
26static const int cfq_quantum = 8;
27static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
28/* maximum backwards seek, in KiB */
29static const int cfq_back_max = 16 * 1024;
30/* penalty of a backwards seek */
31static const int cfq_back_penalty = 2;
32static const int cfq_slice_sync = HZ / 10;
33static int cfq_slice_async = HZ / 25;
34static const int cfq_slice_async_rq = 2;
35static int cfq_slice_idle = HZ / 125;
36static int cfq_group_idle = HZ / 125;
37static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
38static const int cfq_hist_divisor = 4;
39
40/*
41 * offset from end of service tree
42 */
43#define CFQ_IDLE_DELAY		(HZ / 5)
44
45/*
46 * below this threshold, we consider thinktime immediate
47 */
48#define CFQ_MIN_TT		(2)
49
50#define CFQ_SLICE_SCALE		(5)
51#define CFQ_HW_QUEUE_MIN	(5)
52#define CFQ_SERVICE_SHIFT       12
53
54#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
55#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
56#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
57#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
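/*
 * Note on the thresholds above: seek_history is a 32-bit sliding window of
 * per-request seek/no-seek samples, so CFQQ_SEEKY() flags a queue once more
 * than 1/8th (hweight32() > 4) of its last 32 requests were seeks. The
 * CFQQ_*_THR values are sector counts (e.g. CFQQ_SEEK_THR is 800 sectors).
 */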
58
59#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
60#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
61#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
62
63static struct kmem_cache *cfq_pool;
64
65#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
66#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
67#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
68
69#define sample_valid(samples)	((samples) > 80)
70#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
71
72struct cfq_ttime {
73	unsigned long last_end_request;
74
75	unsigned long ttime_total;
76	unsigned long ttime_samples;
77	unsigned long ttime_mean;
78};
79
80/*
81 * Most of our rbtree usage is for sorting with min extraction, so
82 * if we cache the leftmost node we don't have to walk down the tree
83 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
84 * move this into the elevator for the rq sorting as well.
85 */
86struct cfq_rb_root {
87	struct rb_root rb;
88	struct rb_node *left;
89	unsigned count;
90	unsigned total_weight;
91	u64 min_vdisktime;
92	struct cfq_ttime ttime;
93};
94#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
95			.ttime = {.last_end_request = jiffies,},}
96
97/*
98 * Per process-grouping structure
99 */
100struct cfq_queue {
101	/* reference count */
102	int ref;
103	/* various state flags, see below */
104	unsigned int flags;
105	/* parent cfq_data */
106	struct cfq_data *cfqd;
107	/* service_tree member */
108	struct rb_node rb_node;
109	/* service_tree key */
110	unsigned long rb_key;
111	/* prio tree member */
112	struct rb_node p_node;
113	/* prio tree root we belong to, if any */
114	struct rb_root *p_root;
115	/* sorted list of pending requests */
116	struct rb_root sort_list;
117	/* if fifo isn't expired, next request to serve */
118	struct request *next_rq;
119	/* requests queued in sort_list */
120	int queued[2];
121	/* currently allocated requests */
122	int allocated[2];
123	/* fifo list of requests in sort_list */
124	struct list_head fifo;
125
126	/* time when queue got scheduled in to dispatch first request. */
127	unsigned long dispatch_start;
128	unsigned int allocated_slice;
129	unsigned int slice_dispatch;
130	/* time when first request from queue completed and slice started. */
131	unsigned long slice_start;
132	unsigned long slice_end;
133	long slice_resid;
134
135	/* pending priority requests */
136	int prio_pending;
137	/* number of requests that are on the dispatch list or inside driver */
138	int dispatched;
139
140	/* io prio of this queue */
141	unsigned short ioprio, org_ioprio;
142	unsigned short ioprio_class;
143
144	pid_t pid;
145
146	u32 seek_history;
147	sector_t last_request_pos;
148
149	struct cfq_rb_root *service_tree;
150	struct cfq_queue *new_cfqq;
151	struct cfq_group *cfqg;
152	/* Number of sectors dispatched from queue in single dispatch round */
153	unsigned long nr_sectors;
154};
155
156/*
157 * First index in the service_trees.
158 * IDLE is handled separately, so it has negative index
159 */
160enum wl_prio_t {
161	BE_WORKLOAD = 0,
162	RT_WORKLOAD = 1,
163	IDLE_WORKLOAD = 2,
164	CFQ_PRIO_NR,
165};
166
167/*
168 * Second index in the service_trees.
169 */
170enum wl_type_t {
171	ASYNC_WORKLOAD = 0,
172	SYNC_NOIDLE_WORKLOAD = 1,
173	SYNC_WORKLOAD = 2
174};
175
176struct cfqg_stats {
177#ifdef CONFIG_CFQ_GROUP_IOSCHED
178	/* total bytes transferred */
179	struct blkg_rwstat		service_bytes;
180	/* total IOs serviced, post merge */
181	struct blkg_rwstat		serviced;
182	/* number of ios merged */
183	struct blkg_rwstat		merged;
184	/* total time spent on device in ns, may not be accurate w/ queueing */
185	struct blkg_rwstat		service_time;
186	/* total time spent waiting in scheduler queue in ns */
187	struct blkg_rwstat		wait_time;
188	/* number of IOs queued up */
189	struct blkg_rwstat		queued;
190	/* total sectors transferred */
191	struct blkg_stat		sectors;
192	/* total disk time and nr sectors dispatched by this group */
193	struct blkg_stat		time;
194#ifdef CONFIG_DEBUG_BLK_CGROUP
195	/* time not charged to this cgroup */
196	struct blkg_stat		unaccounted_time;
197	/* sum of number of ios queued across all samples */
198	struct blkg_stat		avg_queue_size_sum;
199	/* count of samples taken for average */
200	struct blkg_stat		avg_queue_size_samples;
201	/* how many times this group has been removed from service tree */
202	struct blkg_stat		dequeue;
203	/* total time spent waiting for it to be assigned a timeslice. */
204	struct blkg_stat		group_wait_time;
205	/* time spent idling for this blkio_group */
206	struct blkg_stat		idle_time;
207	/* total time with empty current active q with other requests queued */
208	struct blkg_stat		empty_time;
209	/* fields after this shouldn't be cleared on stat reset */
210	uint64_t			start_group_wait_time;
211	uint64_t			start_idle_time;
212	uint64_t			start_empty_time;
213	uint16_t			flags;
214#endif	/* CONFIG_DEBUG_BLK_CGROUP */
215#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
216};
217
218/* This is per cgroup per device grouping structure */
219struct cfq_group {
220	/* group service_tree member */
221	struct rb_node rb_node;
222
223	/* group service_tree key */
224	u64 vdisktime;
225	unsigned int weight;
226	unsigned int new_weight;
227	unsigned int dev_weight;
228
229	/* number of cfqq currently on this group */
230	int nr_cfqq;
231
232	/*
233	 * Per group busy queues average. Useful for workload slice calc. We
234	 * create the array for each prio class, but at run time it is used
235	 * only for the RT and BE classes; the slot for the IDLE class stays unused.
236	 * This is primarily done to avoid confusion and a gcc warning.
237	 */
238	unsigned int busy_queues_avg[CFQ_PRIO_NR];
239	/*
240	 * rr lists of queues with requests. We maintain service trees for
241	 * RT and BE classes. These trees are subdivided into subclasses
242	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
243	 * class there is no subclassification and all the cfq queues go on
244	 * a single tree service_tree_idle.
245	 * Counts are embedded in the cfq_rb_root
246	 */
247	struct cfq_rb_root service_trees[2][3];
248	struct cfq_rb_root service_tree_idle;
249
250	unsigned long saved_workload_slice;
251	enum wl_type_t saved_workload;
252	enum wl_prio_t saved_serving_prio;
253
254	/* number of requests that are on the dispatch list or inside driver */
255	int dispatched;
256	struct cfq_ttime ttime;
257	struct cfqg_stats stats;
258};
259
260struct cfq_io_cq {
261	struct io_cq		icq;		/* must be the first member */
262	struct cfq_queue	*cfqq[2];
263	struct cfq_ttime	ttime;
264	int			ioprio;		/* the current ioprio */
265#ifdef CONFIG_CFQ_GROUP_IOSCHED
266	uint64_t		blkcg_id;	/* the current blkcg ID */
267#endif
268};
269
270/*
271 * Per block device queue structure
272 */
273struct cfq_data {
274	struct request_queue *queue;
275	/* Root service tree for cfq_groups */
276	struct cfq_rb_root grp_service_tree;
277	struct cfq_group *root_group;
278
279	/*
280	 * The priority currently being served
281	 */
282	enum wl_prio_t serving_prio;
283	enum wl_type_t serving_type;
284	unsigned long workload_expires;
285	struct cfq_group *serving_group;
286
287	/*
288	 * Each priority tree is sorted by next_request position.  These
289	 * trees are used when determining if two or more queues are
290	 * interleaving requests (see cfq_close_cooperator).
291	 */
292	struct rb_root prio_trees[CFQ_PRIO_LISTS];
293
294	unsigned int busy_queues;
295	unsigned int busy_sync_queues;
296
297	int rq_in_driver;
298	int rq_in_flight[2];
299
300	/*
301	 * queue-depth detection
302	 */
303	int rq_queued;
304	int hw_tag;
305	/*
306	 * hw_tag can be
307	 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
308	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
309	 *  0 => no NCQ
310	 */
311	int hw_tag_est_depth;
312	unsigned int hw_tag_samples;
313
314	/*
315	 * idle window management
316	 */
317	struct timer_list idle_slice_timer;
318	struct work_struct unplug_work;
319
320	struct cfq_queue *active_queue;
321	struct cfq_io_cq *active_cic;
322
323	/*
324	 * async queue for each priority case
325	 */
326	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
327	struct cfq_queue *async_idle_cfqq;
328
329	sector_t last_position;
330
331	/*
332	 * tunables, see top of file
333	 */
334	unsigned int cfq_quantum;
335	unsigned int cfq_fifo_expire[2];
336	unsigned int cfq_back_penalty;
337	unsigned int cfq_back_max;
338	unsigned int cfq_slice[2];
339	unsigned int cfq_slice_async_rq;
340	unsigned int cfq_slice_idle;
341	unsigned int cfq_group_idle;
342	unsigned int cfq_latency;
343
344	/*
345	 * Fallback dummy cfqq for extreme OOM conditions
346	 */
347	struct cfq_queue oom_cfqq;
348
349	unsigned long last_delayed_sync;
350};
351
352static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
353
354static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
355					    enum wl_prio_t prio,
356					    enum wl_type_t type)
357{
358	if (!cfqg)
359		return NULL;
360
361	if (prio == IDLE_WORKLOAD)
362		return &cfqg->service_tree_idle;
363
364	return &cfqg->service_trees[prio][type];
365}
366
367enum cfqq_state_flags {
368	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
369	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
370	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
371	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
372	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
373	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
374	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
375	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
376	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
377	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
378	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
379	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
380	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
381};
382
383#define CFQ_CFQQ_FNS(name)						\
384static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
385{									\
386	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
387}									\
388static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
389{									\
390	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
391}									\
392static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
393{									\
394	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
395}
396
397CFQ_CFQQ_FNS(on_rr);
398CFQ_CFQQ_FNS(wait_request);
399CFQ_CFQQ_FNS(must_dispatch);
400CFQ_CFQQ_FNS(must_alloc_slice);
401CFQ_CFQQ_FNS(fifo_expire);
402CFQ_CFQQ_FNS(idle_window);
403CFQ_CFQQ_FNS(prio_changed);
404CFQ_CFQQ_FNS(slice_new);
405CFQ_CFQQ_FNS(sync);
406CFQ_CFQQ_FNS(coop);
407CFQ_CFQQ_FNS(split_coop);
408CFQ_CFQQ_FNS(deep);
409CFQ_CFQQ_FNS(wait_busy);
410#undef CFQ_CFQQ_FNS
411
412#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
413
414/* cfqg stats flags */
415enum cfqg_stats_flags {
416	CFQG_stats_waiting = 0,
417	CFQG_stats_idling,
418	CFQG_stats_empty,
419};
420
421#define CFQG_FLAG_FNS(name)						\
422static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
423{									\
424	stats->flags |= (1 << CFQG_stats_##name);			\
425}									\
426static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
427{									\
428	stats->flags &= ~(1 << CFQG_stats_##name);			\
429}									\
430static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
431{									\
432	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
433}									\
434
435CFQG_FLAG_FNS(waiting)
436CFQG_FLAG_FNS(idling)
437CFQG_FLAG_FNS(empty)
438#undef CFQG_FLAG_FNS
439
440/* This should be called with the queue_lock held. */
441static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
442{
443	unsigned long long now;
444
445	if (!cfqg_stats_waiting(stats))
446		return;
447
448	now = sched_clock();
449	if (time_after64(now, stats->start_group_wait_time))
450		blkg_stat_add(&stats->group_wait_time,
451			      now - stats->start_group_wait_time);
452	cfqg_stats_clear_waiting(stats);
453}
454
455/* This should be called with the queue_lock held. */
456static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
457						 struct cfq_group *curr_cfqg)
458{
459	struct cfqg_stats *stats = &cfqg->stats;
460
461	if (cfqg_stats_waiting(stats))
462		return;
463	if (cfqg == curr_cfqg)
464		return;
465	stats->start_group_wait_time = sched_clock();
466	cfqg_stats_mark_waiting(stats);
467}
468
469/* This should be called with the queue_lock held. */
470static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
471{
472	unsigned long long now;
473
474	if (!cfqg_stats_empty(stats))
475		return;
476
477	now = sched_clock();
478	if (time_after64(now, stats->start_empty_time))
479		blkg_stat_add(&stats->empty_time,
480			      now - stats->start_empty_time);
481	cfqg_stats_clear_empty(stats);
482}
483
484static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
485{
486	blkg_stat_add(&cfqg->stats.dequeue, 1);
487}
488
489static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
490{
491	struct cfqg_stats *stats = &cfqg->stats;
492
493	if (blkg_rwstat_sum(&stats->queued))
494		return;
495
496	/*
497	 * The group is already marked empty. This can happen if the cfqq got a
498	 * new request in the parent group and moved to this group while being
499	 * added to the service tree. Just ignore the event and move on.
500	 */
501	if (cfqg_stats_empty(stats))
502		return;
503
504	stats->start_empty_time = sched_clock();
505	cfqg_stats_mark_empty(stats);
506}
507
508static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
509{
510	struct cfqg_stats *stats = &cfqg->stats;
511
512	if (cfqg_stats_idling(stats)) {
513		unsigned long long now = sched_clock();
514
515		if (time_after64(now, stats->start_idle_time))
516			blkg_stat_add(&stats->idle_time,
517				      now - stats->start_idle_time);
518		cfqg_stats_clear_idling(stats);
519	}
520}
521
522static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
523{
524	struct cfqg_stats *stats = &cfqg->stats;
525
526	BUG_ON(cfqg_stats_idling(stats));
527
528	stats->start_idle_time = sched_clock();
529	cfqg_stats_mark_idling(stats);
530}
531
532static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
533{
534	struct cfqg_stats *stats = &cfqg->stats;
535
536	blkg_stat_add(&stats->avg_queue_size_sum,
537		      blkg_rwstat_sum(&stats->queued));
538	blkg_stat_add(&stats->avg_queue_size_samples, 1);
539	cfqg_stats_update_group_wait_time(stats);
540}
541
542#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
543
544static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
545static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
546static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
547static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
548static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
549static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
550static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
551
552#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
553
554#ifdef CONFIG_CFQ_GROUP_IOSCHED
555
556static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
557{
558	return blkg_to_pdata(blkg, &blkio_policy_cfq);
559}
560
561static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
562{
563	return pdata_to_blkg(cfqg);
564}
565
566static inline void cfqg_get(struct cfq_group *cfqg)
567{
568	return blkg_get(cfqg_to_blkg(cfqg));
569}
570
571static inline void cfqg_put(struct cfq_group *cfqg)
572{
573	return blkg_put(cfqg_to_blkg(cfqg));
574}
575
576#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
577	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
578			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
579			blkg_path(cfqg_to_blkg((cfqq)->cfqg)), ##args)
580
581#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
582	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
583			blkg_path(cfqg_to_blkg((cfqg))), ##args)	\
584
585static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
586					    struct cfq_group *curr_cfqg, int rw)
587{
588	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
589	cfqg_stats_end_empty_time(&cfqg->stats);
590	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
591}
592
593static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
594			unsigned long time, unsigned long unaccounted_time)
595{
596	blkg_stat_add(&cfqg->stats.time, time);
597#ifdef CONFIG_DEBUG_BLK_CGROUP
598	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
599#endif
600}
601
602static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
603{
604	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
605}
606
607static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
608{
609	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
610}
611
612static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
613					      uint64_t bytes, int rw)
614{
615	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
616	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
617	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
618}
619
620static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
621			uint64_t start_time, uint64_t io_start_time, int rw)
622{
623	struct cfqg_stats *stats = &cfqg->stats;
624	unsigned long long now = sched_clock();
625
626	if (time_after64(now, io_start_time))
627		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
628	if (time_after64(io_start_time, start_time))
629		blkg_rwstat_add(&stats->wait_time, rw,
630				io_start_time - start_time);
631}
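/*
 * In the helper above, start_time is when the request entered the scheduler
 * and io_start_time is when it was dispatched to the driver, so wait_time
 * accumulates the scheduler queueing delay and service_time the on-device
 * time (both in ns, via sched_clock()).
 */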
632
633static void cfqg_stats_reset(struct blkio_group *blkg)
634{
635	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
636	struct cfqg_stats *stats = &cfqg->stats;
637
638	/* queued stats shouldn't be cleared */
639	blkg_rwstat_reset(&stats->service_bytes);
640	blkg_rwstat_reset(&stats->serviced);
641	blkg_rwstat_reset(&stats->merged);
642	blkg_rwstat_reset(&stats->service_time);
643	blkg_rwstat_reset(&stats->wait_time);
644	blkg_stat_reset(&stats->time);
645#ifdef CONFIG_DEBUG_BLK_CGROUP
646	blkg_stat_reset(&stats->unaccounted_time);
647	blkg_stat_reset(&stats->avg_queue_size_sum);
648	blkg_stat_reset(&stats->avg_queue_size_samples);
649	blkg_stat_reset(&stats->dequeue);
650	blkg_stat_reset(&stats->group_wait_time);
651	blkg_stat_reset(&stats->idle_time);
652	blkg_stat_reset(&stats->empty_time);
653#endif
654}
655
656#else	/* CONFIG_CFQ_GROUP_IOSCHED */
657
658static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; }
659static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; }
660static inline void cfqg_get(struct cfq_group *cfqg) { }
661static inline void cfqg_put(struct cfq_group *cfqg) { }
662
663#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
664	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
665#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
666
667static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
668			struct cfq_group *curr_cfqg, int rw) { }
669static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
670			unsigned long time, unsigned long unaccounted_time) { }
671static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
672static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
673static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
674					      uint64_t bytes, int rw) { }
675static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
676			uint64_t start_time, uint64_t io_start_time, int rw) { }
677
678#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
679
680#define cfq_log(cfqd, fmt, args...)	\
681	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
682
683/* Traverses the cfq group service trees */
684#define for_each_cfqg_st(cfqg, i, j, st) \
685	for (i = 0; i <= IDLE_WORKLOAD; i++) \
686		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
687			: &cfqg->service_tree_idle; \
688			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
689			(i == IDLE_WORKLOAD && j == 0); \
690			j++, st = i < IDLE_WORKLOAD ? \
691			&cfqg->service_trees[i][j]: NULL) \
692
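/*
 * The for_each_cfqg_st() macro above walks all service trees of a group: the
 * RT and BE classes each have ASYNC, SYNC_NOIDLE and SYNC subtrees, and the
 * IDLE class has the single service_tree_idle - seven trees in total.
 */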
693static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
694	struct cfq_ttime *ttime, bool group_idle)
695{
696	unsigned long slice;
697	if (!sample_valid(ttime->ttime_samples))
698		return false;
699	if (group_idle)
700		slice = cfqd->cfq_group_idle;
701	else
702		slice = cfqd->cfq_slice_idle;
703	return ttime->ttime_mean > slice;
704}
705
706static inline bool iops_mode(struct cfq_data *cfqd)
707{
708	/*
709	 * If we are not idling on queues and the drive supports NCQ, requests
710	 * execute in parallel and measuring time is not meaningful in most
711	 * cases unless we drive shallower queue depths and that becomes a
712	 * performance bottleneck. In such cases, switch to providing fairness
713	 * in terms of number of IOs.
714	 */
715	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
716		return true;
717	else
718		return false;
719}
720
721static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
722{
723	if (cfq_class_idle(cfqq))
724		return IDLE_WORKLOAD;
725	if (cfq_class_rt(cfqq))
726		return RT_WORKLOAD;
727	return BE_WORKLOAD;
728}
729
730
731static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
732{
733	if (!cfq_cfqq_sync(cfqq))
734		return ASYNC_WORKLOAD;
735	if (!cfq_cfqq_idle_window(cfqq))
736		return SYNC_NOIDLE_WORKLOAD;
737	return SYNC_WORKLOAD;
738}
739
740static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
741					struct cfq_data *cfqd,
742					struct cfq_group *cfqg)
743{
744	if (wl == IDLE_WORKLOAD)
745		return cfqg->service_tree_idle.count;
746
747	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
748		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
749		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
750}
751
752static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
753					struct cfq_group *cfqg)
754{
755	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
756		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
757}
758
759static void cfq_dispatch_insert(struct request_queue *, struct request *);
760static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
761				       struct cfq_io_cq *cic, struct bio *bio,
762				       gfp_t gfp_mask);
763
764static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
765{
766	/* cic->icq is the first member, %NULL will convert to %NULL */
767	return container_of(icq, struct cfq_io_cq, icq);
768}
769
770static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
771					       struct io_context *ioc)
772{
773	if (ioc)
774		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
775	return NULL;
776}
777
778static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
779{
780	return cic->cfqq[is_sync];
781}
782
783static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
784				bool is_sync)
785{
786	cic->cfqq[is_sync] = cfqq;
787}
788
789static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
790{
791	return cic->icq.q->elevator->elevator_data;
792}
793
794/*
795 * We regard a request as SYNC if it's either a read or has the SYNC bit
796 * set (in which case it could also be a direct WRITE).
797 */
798static inline bool cfq_bio_sync(struct bio *bio)
799{
800	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
801}
802
803/*
804 * Schedule a run of the queue if there are requests pending and nothing in
805 * the driver will restart queueing.
806 */
807static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
808{
809	if (cfqd->busy_queues) {
810		cfq_log(cfqd, "schedule dispatch");
811		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
812	}
813}
814
815/*
816 * Scale schedule slice based on io priority. Use the sync time slice only
817 * if a queue is marked sync and has sync io queued. A sync queue with async
818 * io only should not get the full sync slice length.
819 */
820static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
821				 unsigned short prio)
822{
823	const int base_slice = cfqd->cfq_slice[sync];
824
825	WARN_ON(prio >= IOPRIO_BE_NR);
826
827	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
828}
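/*
 * Worked example for cfq_prio_slice() above (illustrative, assuming HZ=1000
 * so the default sync base_slice is 100ms, with CFQ_SLICE_SCALE = 5):
 * prio 4 (the default) gets 100ms, prio 0 gets 100 + 20*4 = 180ms, and
 * prio 7 gets 100 - 20*3 = 40ms.
 */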
829
830static inline int
831cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
832{
833	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
834}
835
836static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
837{
838	u64 d = delta << CFQ_SERVICE_SHIFT;
839
840	d = d * CFQ_WEIGHT_DEFAULT;
841	do_div(d, cfqg->weight);
842	return d;
843}
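/*
 * cfq_scale_slice() converts a service slice into a vdisktime charge that is
 * inversely proportional to the group's weight: a group with twice the
 * default weight is charged half as much vdisktime for the same slice, so it
 * gets rescheduled sooner and ends up with a proportionally larger share.
 */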
844
845static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
846{
847	s64 delta = (s64)(vdisktime - min_vdisktime);
848	if (delta > 0)
849		min_vdisktime = vdisktime;
850
851	return min_vdisktime;
852}
853
854static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
855{
856	s64 delta = (s64)(vdisktime - min_vdisktime);
857	if (delta < 0)
858		min_vdisktime = vdisktime;
859
860	return min_vdisktime;
861}
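/*
 * max_vdisktime()/min_vdisktime() compare through a signed 64-bit delta so
 * the comparison still works if the u64 virtual disk time ever wraps around.
 */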
862
863static void update_min_vdisktime(struct cfq_rb_root *st)
864{
865	struct cfq_group *cfqg;
866
867	if (st->left) {
868		cfqg = rb_entry_cfqg(st->left);
869		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
870						  cfqg->vdisktime);
871	}
872}
873
874/*
875 * Get the averaged number of queues of RT/BE priority. The average is
876 * updated with a formula that gives more weight to higher numbers, so it
877 * follows sudden increases quickly and decreases slowly.
878 */
879
880static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
881					struct cfq_group *cfqg, bool rt)
882{
883	unsigned min_q, max_q;
884	unsigned mult  = cfq_hist_divisor - 1;
885	unsigned round = cfq_hist_divisor / 2;
886	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
887
888	min_q = min(cfqg->busy_queues_avg[rt], busy);
889	max_q = max(cfqg->busy_queues_avg[rt], busy);
890	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
891		cfq_hist_divisor;
892	return cfqg->busy_queues_avg[rt];
893}
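/*
 * Example of the averaging above with cfq_hist_divisor = 4: the new average
 * is (3 * max(avg, busy) + min(avg, busy) + 2) / 4, so going from avg=2 to
 * busy=6 jumps the estimate to 5 in one step, while going from avg=6 to
 * busy=2 also only drops it to 5 - increases are tracked quickly, decreases
 * slowly.
 */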
894
895static inline unsigned
896cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
897{
898	struct cfq_rb_root *st = &cfqd->grp_service_tree;
899
900	return cfq_target_latency * cfqg->weight / st->total_weight;
901}
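/*
 * cfq_group_slice() above hands each group a share of the 300ms
 * cfq_target_latency proportional to its weight relative to the total
 * weight on the group service tree.
 */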
902
903static inline unsigned
904cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
905{
906	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
907	if (cfqd->cfq_latency) {
908		/*
909		 * interested queues (we consider only the ones with the same
910		 * priority class in the cfq group)
911		 */
912		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
913						cfq_class_rt(cfqq));
914		unsigned sync_slice = cfqd->cfq_slice[1];
915		unsigned expect_latency = sync_slice * iq;
916		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
917
918		if (expect_latency > group_slice) {
919			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
920			/* scale low_slice according to IO priority
921			 * and sync vs async */
922			unsigned low_slice =
923				min(slice, base_low_slice * slice / sync_slice);
924			/* the adapted slice value is scaled to fit all iqs
925			 * into the target latency */
926			slice = max(slice * group_slice / expect_latency,
927				    low_slice);
928		}
929	}
930	return slice;
931}
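/*
 * In cfq_scaled_cfqq_slice(), when the cfq_latency tunable is enabled and the
 * expected latency (sync slice times the number of interested queues) would
 * exceed the group's slice, each queue's slice is shrunk proportionally so
 * that all queues fit into the target latency, but never below low_slice
 * (roughly 2 * cfq_slice_idle, scaled by the queue's own slice).
 */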
932
933static inline void
934cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
935{
936	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
937
938	cfqq->slice_start = jiffies;
939	cfqq->slice_end = jiffies + slice;
940	cfqq->allocated_slice = slice;
941	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
942}
943
944/*
945 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
946 * isn't valid until the first request from the dispatch is activated
947 * and the slice time set.
948 */
949static inline bool cfq_slice_used(struct cfq_queue *cfqq)
950{
951	if (cfq_cfqq_slice_new(cfqq))
952		return false;
953	if (time_before(jiffies, cfqq->slice_end))
954		return false;
955
956	return true;
957}
958
959/*
960 * Lifted from AS - choose which of rq1 and rq2 is best served now.
961 * We choose the request that is closest to the head right now. Distance
962 * behind the head is penalized and only allowed to a certain extent.
963 */
964static struct request *
965cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
966{
967	sector_t s1, s2, d1 = 0, d2 = 0;
968	unsigned long back_max;
969#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
970#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
971	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
972
973	if (rq1 == NULL || rq1 == rq2)
974		return rq2;
975	if (rq2 == NULL)
976		return rq1;
977
978	if (rq_is_sync(rq1) != rq_is_sync(rq2))
979		return rq_is_sync(rq1) ? rq1 : rq2;
980
981	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
982		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
983
984	s1 = blk_rq_pos(rq1);
985	s2 = blk_rq_pos(rq2);
986
987	/*
988	 * by definition, 1KiB is 2 sectors
989	 */
990	back_max = cfqd->cfq_back_max * 2;
991
992	/*
993	 * Strict one way elevator _except_ in the case where we allow
994	 * short backward seeks which are biased as twice the cost of a
995	 * similar forward seek.
996	 */
997	if (s1 >= last)
998		d1 = s1 - last;
999	else if (s1 + back_max >= last)
1000		d1 = (last - s1) * cfqd->cfq_back_penalty;
1001	else
1002		wrap |= CFQ_RQ1_WRAP;
1003
1004	if (s2 >= last)
1005		d2 = s2 - last;
1006	else if (s2 + back_max >= last)
1007		d2 = (last - s2) * cfqd->cfq_back_penalty;
1008	else
1009		wrap |= CFQ_RQ2_WRAP;
1010
1011	/* Found required data */
1012
1013	/*
1014	 * By doing switch() on the bit mask "wrap" we avoid having to
1015	 * check two variables for all permutations: --> faster!
1016	 */
1017	switch (wrap) {
1018	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1019		if (d1 < d2)
1020			return rq1;
1021		else if (d2 < d1)
1022			return rq2;
1023		else {
1024			if (s1 >= s2)
1025				return rq1;
1026			else
1027				return rq2;
1028		}
1029
1030	case CFQ_RQ2_WRAP:
1031		return rq1;
1032	case CFQ_RQ1_WRAP:
1033		return rq2;
1034	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1035	default:
1036		/*
1037		 * Since both rqs are wrapped,
1038		 * start with the one that's further behind head
1039		 * (--> only *one* back seek required),
1040		 * since back seek takes more time than forward.
1041		 */
1042		if (s1 <= s2)
1043			return rq1;
1044		else
1045			return rq2;
1046	}
1047}
1048
1049/*
1050 * The below is the leftmost-node cache addon for the rbtree.
1051 */
1052static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1053{
1054	/* Service tree is empty */
1055	if (!root->count)
1056		return NULL;
1057
1058	if (!root->left)
1059		root->left = rb_first(&root->rb);
1060
1061	if (root->left)
1062		return rb_entry(root->left, struct cfq_queue, rb_node);
1063
1064	return NULL;
1065}
1066
1067static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1068{
1069	if (!root->left)
1070		root->left = rb_first(&root->rb);
1071
1072	if (root->left)
1073		return rb_entry_cfqg(root->left);
1074
1075	return NULL;
1076}
1077
1078static void rb_erase_init(struct rb_node *n, struct rb_root *root)
1079{
1080	rb_erase(n, root);
1081	RB_CLEAR_NODE(n);
1082}
1083
1084static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1085{
1086	if (root->left == n)
1087		root->left = NULL;
1088	rb_erase_init(n, &root->rb);
1089	--root->count;
1090}
1091
1092/*
1093 * would be nice to take fifo expire time into account as well
1094 */
1095static struct request *
1096cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1097		  struct request *last)
1098{
1099	struct rb_node *rbnext = rb_next(&last->rb_node);
1100	struct rb_node *rbprev = rb_prev(&last->rb_node);
1101	struct request *next = NULL, *prev = NULL;
1102
1103	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1104
1105	if (rbprev)
1106		prev = rb_entry_rq(rbprev);
1107
1108	if (rbnext)
1109		next = rb_entry_rq(rbnext);
1110	else {
1111		rbnext = rb_first(&cfqq->sort_list);
1112		if (rbnext && rbnext != &last->rb_node)
1113			next = rb_entry_rq(rbnext);
1114	}
1115
1116	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1117}
1118
1119static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
1120				      struct cfq_queue *cfqq)
1121{
1122	/*
1123	 * just an approximation, should be ok.
1124	 */
1125	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1126		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1127}
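/*
 * cfq_slice_offset() spreads queues within a group: the more sibling queues
 * the group has and the lower this queue's priority, the further to the
 * right (later) its rb_key lands in the service tree.
 */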
1128
1129static inline s64
1130cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1131{
1132	return cfqg->vdisktime - st->min_vdisktime;
1133}
1134
1135static void
1136__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1137{
1138	struct rb_node **node = &st->rb.rb_node;
1139	struct rb_node *parent = NULL;
1140	struct cfq_group *__cfqg;
1141	s64 key = cfqg_key(st, cfqg);
1142	int left = 1;
1143
1144	while (*node != NULL) {
1145		parent = *node;
1146		__cfqg = rb_entry_cfqg(parent);
1147
1148		if (key < cfqg_key(st, __cfqg))
1149			node = &parent->rb_left;
1150		else {
1151			node = &parent->rb_right;
1152			left = 0;
1153		}
1154	}
1155
1156	if (left)
1157		st->left = &cfqg->rb_node;
1158
1159	rb_link_node(&cfqg->rb_node, parent, node);
1160	rb_insert_color(&cfqg->rb_node, &st->rb);
1161}
1162
1163static void
1164cfq_update_group_weight(struct cfq_group *cfqg)
1165{
1166	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1167	if (cfqg->new_weight) {
1168		cfqg->weight = cfqg->new_weight;
1169		cfqg->new_weight = 0;
1170	}
1171}
1172
1173static void
1174cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1175{
1176	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1177
1178	cfq_update_group_weight(cfqg);
1179	__cfq_group_service_tree_add(st, cfqg);
1180	st->total_weight += cfqg->weight;
1181}
1182
1183static void
1184cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1185{
1186	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1187	struct cfq_group *__cfqg;
1188	struct rb_node *n;
1189
1190	cfqg->nr_cfqq++;
1191	if (!RB_EMPTY_NODE(&cfqg->rb_node))
1192		return;
1193
1194	/*
1195	 * Currently put the group at the end. Later implement something
1196	 * so that groups get lesser vtime based on their weights, so that
1197	 * a group does not lose everything if it was not continuously backlogged.
1198	 */
1199	n = rb_last(&st->rb);
1200	if (n) {
1201		__cfqg = rb_entry_cfqg(n);
1202		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
1203	} else
1204		cfqg->vdisktime = st->min_vdisktime;
1205	cfq_group_service_tree_add(st, cfqg);
1206}
1207
1208static void
1209cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1210{
1211	st->total_weight -= cfqg->weight;
1212	if (!RB_EMPTY_NODE(&cfqg->rb_node))
1213		cfq_rb_erase(&cfqg->rb_node, st);
1214}
1215
1216static void
1217cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1218{
1219	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1220
1221	BUG_ON(cfqg->nr_cfqq < 1);
1222	cfqg->nr_cfqq--;
1223
1224	/* If there are other cfq queues under this group, don't delete it */
1225	if (cfqg->nr_cfqq)
1226		return;
1227
1228	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1229	cfq_group_service_tree_del(st, cfqg);
1230	cfqg->saved_workload_slice = 0;
1231	cfqg_stats_update_dequeue(cfqg);
1232}
1233
1234static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1235						unsigned int *unaccounted_time)
1236{
1237	unsigned int slice_used;
1238
1239	/*
1240	 * Queue got expired before even a single request completed or
1241	 * got expired immediately after first request completion.
1242	 */
1243	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
1244		/*
1245		 * Also charge the seek time incurred to the group, otherwise
1246		 * if there are multiple queues in the group, each can dispatch
1247		 * a single request on seeky media and cause lots of seek time
1248		 * and the group will never account for it.
1249		 */
1250		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
1251					1);
1252	} else {
1253		slice_used = jiffies - cfqq->slice_start;
1254		if (slice_used > cfqq->allocated_slice) {
1255			*unaccounted_time = slice_used - cfqq->allocated_slice;
1256			slice_used = cfqq->allocated_slice;
1257		}
1258		if (time_after(cfqq->slice_start, cfqq->dispatch_start))
1259			*unaccounted_time += cfqq->slice_start -
1260					cfqq->dispatch_start;
1261	}
1262
1263	return slice_used;
1264}
1265
1266static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1267				struct cfq_queue *cfqq)
1268{
1269	struct cfq_rb_root *st = &cfqd->grp_service_tree;
1270	unsigned int used_sl, charge, unaccounted_sl = 0;
1271	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1272			- cfqg->service_tree_idle.count;
1273
1274	BUG_ON(nr_sync < 0);
1275	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1276
1277	if (iops_mode(cfqd))
1278		charge = cfqq->slice_dispatch;
1279	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1280		charge = cfqq->allocated_slice;
1281
1282	/* Can't update vdisktime while group is on service tree */
1283	cfq_group_service_tree_del(st, cfqg);
1284	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
1285	/* If a new weight was requested, update now, off tree */
1286	cfq_group_service_tree_add(st, cfqg);
1287
1288	/* This group is being expired. Save the context */
1289	if (time_after(cfqd->workload_expires, jiffies)) {
1290		cfqg->saved_workload_slice = cfqd->workload_expires
1291						- jiffies;
1292		cfqg->saved_workload = cfqd->serving_type;
1293		cfqg->saved_serving_prio = cfqd->serving_prio;
1294	} else
1295		cfqg->saved_workload_slice = 0;
1296
1297	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1298					st->min_vdisktime);
1299	cfq_log_cfqq(cfqq->cfqd, cfqq,
1300		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1301		     used_sl, cfqq->slice_dispatch, charge,
1302		     iops_mode(cfqd), cfqq->nr_sectors);
1303	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1304	cfqg_stats_set_start_empty_time(cfqg);
1305}
1306
1307/**
1308 * cfq_init_cfqg_base - initialize base part of a cfq_group
1309 * @cfqg: cfq_group to initialize
1310 *
1311 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1312 * is enabled or not.
1313 */
1314static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1315{
1316	struct cfq_rb_root *st;
1317	int i, j;
1318
1319	for_each_cfqg_st(cfqg, i, j, st)
1320		*st = CFQ_RB_ROOT;
1321	RB_CLEAR_NODE(&cfqg->rb_node);
1322
1323	cfqg->ttime.last_end_request = jiffies;
1324}
1325
1326#ifdef CONFIG_CFQ_GROUP_IOSCHED
1327static void cfq_init_blkio_group(struct blkio_group *blkg)
1328{
1329	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1330
1331	cfq_init_cfqg_base(cfqg);
1332	cfqg->weight = blkg->blkcg->cfq_weight;
1333}
1334
1335/*
1336 * Search for the cfq group the current task belongs to. The request_queue
1337 * lock must be held.
1338 */
1339static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1340						struct blkio_cgroup *blkcg)
1341{
1342	struct request_queue *q = cfqd->queue;
1343	struct cfq_group *cfqg = NULL;
1344
1345	/* avoid lookup for the common case where there's no blkio cgroup */
1346	if (blkcg == &blkio_root_cgroup) {
1347		cfqg = cfqd->root_group;
1348	} else {
1349		struct blkio_group *blkg;
1350
1351		blkg = blkg_lookup_create(blkcg, q);
1352		if (!IS_ERR(blkg))
1353			cfqg = blkg_to_cfqg(blkg);
1354	}
1355
1356	return cfqg;
1357}
1358
1359static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1360{
1361	/* Currently, all async queues are mapped to root group */
1362	if (!cfq_cfqq_sync(cfqq))
1363		cfqg = cfqq->cfqd->root_group;
1364
1365	cfqq->cfqg = cfqg;
1366	/* cfqq reference on cfqg */
1367	cfqg_get(cfqg);
1368}
1369
1370static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)
1371{
1372	struct cfq_group *cfqg = pdata;
1373
1374	if (!cfqg->dev_weight)
1375		return 0;
1376	return __blkg_prfill_u64(sf, pdata, cfqg->dev_weight);
1377}
1378
1379static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1380				    struct seq_file *sf)
1381{
1382	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
1383			  cfqg_prfill_weight_device, &blkio_policy_cfq, 0,
1384			  false);
1385	return 0;
1386}
1387
1388static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1389			    struct seq_file *sf)
1390{
1391	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->cfq_weight);
1392	return 0;
1393}
1394
1395static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1396				  const char *buf)
1397{
1398	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1399	struct blkg_conf_ctx ctx;
1400	struct cfq_group *cfqg;
1401	int ret;
1402
1403	ret = blkg_conf_prep(blkcg, &blkio_policy_cfq, buf, &ctx);
1404	if (ret)
1405		return ret;
1406
1407	ret = -EINVAL;
1408	cfqg = blkg_to_cfqg(ctx.blkg);
1409	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1410		cfqg->dev_weight = ctx.v;
1411		cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
1412		ret = 0;
1413	}
1414
1415	blkg_conf_finish(&ctx);
1416	return ret;
1417}
1418
1419static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1420{
1421	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1422	struct blkio_group *blkg;
1423	struct hlist_node *n;
1424
1425	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1426		return -EINVAL;
1427
1428	spin_lock_irq(&blkcg->lock);
1429	blkcg->cfq_weight = (unsigned int)val;
1430
1431	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1432		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1433
1434		if (cfqg && !cfqg->dev_weight)
1435			cfqg->new_weight = blkcg->cfq_weight;
1436	}
1437
1438	spin_unlock_irq(&blkcg->lock);
1439	return 0;
1440}
1441
1442static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1443			   struct seq_file *sf)
1444{
1445	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1446
1447	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkio_policy_cfq,
1448			  cft->private, false);
1449	return 0;
1450}
1451
1452static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1453			     struct seq_file *sf)
1454{
1455	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1456
1457	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkio_policy_cfq,
1458			  cft->private, true);
1459	return 0;
1460}
1461
1462#ifdef CONFIG_DEBUG_BLK_CGROUP
1463static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
1464{
1465	struct cfq_group *cfqg = pdata;
1466	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1467	u64 v = 0;
1468
1469	if (samples) {
1470		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1471		do_div(v, samples);
1472	}
1473	__blkg_prfill_u64(sf, pdata, v);
1474	return 0;
1475}
1476
1477/* print avg_queue_size */
1478static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1479				     struct seq_file *sf)
1480{
1481	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1482
1483	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1484			  &blkio_policy_cfq, 0, false);
1485	return 0;
1486}
1487#endif	/* CONFIG_DEBUG_BLK_CGROUP */
1488
1489static struct cftype cfq_blkcg_files[] = {
1490	{
1491		.name = "weight_device",
1492		.read_seq_string = cfqg_print_weight_device,
1493		.write_string = cfqg_set_weight_device,
1494		.max_write_len = 256,
1495	},
1496	{
1497		.name = "weight",
1498		.read_seq_string = cfq_print_weight,
1499		.write_u64 = cfq_set_weight,
1500	},
1501	{
1502		.name = "time",
1503		.private = offsetof(struct cfq_group, stats.time),
1504		.read_seq_string = cfqg_print_stat,
1505	},
1506	{
1507		.name = "sectors",
1508		.private = offsetof(struct cfq_group, stats.sectors),
1509		.read_seq_string = cfqg_print_stat,
1510	},
1511	{
1512		.name = "io_service_bytes",
1513		.private = offsetof(struct cfq_group, stats.service_bytes),
1514		.read_seq_string = cfqg_print_rwstat,
1515	},
1516	{
1517		.name = "io_serviced",
1518		.private = offsetof(struct cfq_group, stats.serviced),
1519		.read_seq_string = cfqg_print_rwstat,
1520	},
1521	{
1522		.name = "io_service_time",
1523		.private = offsetof(struct cfq_group, stats.service_time),
1524		.read_seq_string = cfqg_print_rwstat,
1525	},
1526	{
1527		.name = "io_wait_time",
1528		.private = offsetof(struct cfq_group, stats.wait_time),
1529		.read_seq_string = cfqg_print_rwstat,
1530	},
1531	{
1532		.name = "io_merged",
1533		.private = offsetof(struct cfq_group, stats.merged),
1534		.read_seq_string = cfqg_print_rwstat,
1535	},
1536	{
1537		.name = "io_queued",
1538		.private = offsetof(struct cfq_group, stats.queued),
1539		.read_seq_string = cfqg_print_rwstat,
1540	},
1541#ifdef CONFIG_DEBUG_BLK_CGROUP
1542	{
1543		.name = "avg_queue_size",
1544		.read_seq_string = cfqg_print_avg_queue_size,
1545	},
1546	{
1547		.name = "group_wait_time",
1548		.private = offsetof(struct cfq_group, stats.group_wait_time),
1549		.read_seq_string = cfqg_print_stat,
1550	},
1551	{
1552		.name = "idle_time",
1553		.private = offsetof(struct cfq_group, stats.idle_time),
1554		.read_seq_string = cfqg_print_stat,
1555	},
1556	{
1557		.name = "empty_time",
1558		.private = offsetof(struct cfq_group, stats.empty_time),
1559		.read_seq_string = cfqg_print_stat,
1560	},
1561	{
1562		.name = "dequeue",
1563		.private = offsetof(struct cfq_group, stats.dequeue),
1564		.read_seq_string = cfqg_print_stat,
1565	},
1566	{
1567		.name = "unaccounted_time",
1568		.private = offsetof(struct cfq_group, stats.unaccounted_time),
1569		.read_seq_string = cfqg_print_stat,
1570	},
1571#endif	/* CONFIG_DEBUG_BLK_CGROUP */
1572	{ }	/* terminate */
1573};
1574#else /* GROUP_IOSCHED */
1575static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1576						struct blkio_cgroup *blkcg)
1577{
1578	return cfqd->root_group;
1579}
1580
1581static inline void
1582cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1583	cfqq->cfqg = cfqg;
1584}
1585
1586#endif /* GROUP_IOSCHED */
1587
1588/*
1589 * The service trees hold all pending cfq_queues that have requests
1590 * waiting to be processed. They are sorted in the order in which
1591 * we will service the queues.
1592 */
1593static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1594				 bool add_front)
1595{
1596	struct rb_node **p, *parent;
1597	struct cfq_queue *__cfqq;
1598	unsigned long rb_key;
1599	struct cfq_rb_root *service_tree;
1600	int left;
1601	int new_cfqq = 1;
1602
1603	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1604						cfqq_type(cfqq));
1605	if (cfq_class_idle(cfqq)) {
1606		rb_key = CFQ_IDLE_DELAY;
1607		parent = rb_last(&service_tree->rb);
1608		if (parent && parent != &cfqq->rb_node) {
1609			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1610			rb_key += __cfqq->rb_key;
1611		} else
1612			rb_key += jiffies;
1613	} else if (!add_front) {
1614		/*
1615		 * Get our rb key offset. Subtract any residual slice
1616		 * value carried from last service. A negative resid
1617		 * count indicates slice overrun, and this should position
1618		 * the next service time further away in the tree.
1619		 */
1620		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1621		rb_key -= cfqq->slice_resid;
1622		cfqq->slice_resid = 0;
1623	} else {
1624		rb_key = -HZ;
1625		__cfqq = cfq_rb_first(service_tree);
1626		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1627	}
1628
1629	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1630		new_cfqq = 0;
1631		/*
1632		 * same position, nothing more to do
1633		 */
1634		if (rb_key == cfqq->rb_key &&
1635		    cfqq->service_tree == service_tree)
1636			return;
1637
1638		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1639		cfqq->service_tree = NULL;
1640	}
1641
1642	left = 1;
1643	parent = NULL;
1644	cfqq->service_tree = service_tree;
1645	p = &service_tree->rb.rb_node;
1646	while (*p) {
1647		struct rb_node **n;
1648
1649		parent = *p;
1650		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1651
1652		/*
1653		 * sort by key, which represents service time.
1654		 */
1655		if (time_before(rb_key, __cfqq->rb_key))
1656			n = &(*p)->rb_left;
1657		else {
1658			n = &(*p)->rb_right;
1659			left = 0;
1660		}
1661
1662		p = n;
1663	}
1664
1665	if (left)
1666		service_tree->left = &cfqq->rb_node;
1667
1668	cfqq->rb_key = rb_key;
1669	rb_link_node(&cfqq->rb_node, parent, p);
1670	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1671	service_tree->count++;
1672	if (add_front || !new_cfqq)
1673		return;
1674	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1675}
1676
1677static struct cfq_queue *
1678cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1679		     sector_t sector, struct rb_node **ret_parent,
1680		     struct rb_node ***rb_link)
1681{
1682	struct rb_node **p, *parent;
1683	struct cfq_queue *cfqq = NULL;
1684
1685	parent = NULL;
1686	p = &root->rb_node;
1687	while (*p) {
1688		struct rb_node **n;
1689
1690		parent = *p;
1691		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1692
1693		/*
1694		 * Sort strictly based on sector.  Smallest to the left,
1695		 * largest to the right.
1696		 */
1697		if (sector > blk_rq_pos(cfqq->next_rq))
1698			n = &(*p)->rb_right;
1699		else if (sector < blk_rq_pos(cfqq->next_rq))
1700			n = &(*p)->rb_left;
1701		else
1702			break;
1703		p = n;
1704		cfqq = NULL;
1705	}
1706
1707	*ret_parent = parent;
1708	if (rb_link)
1709		*rb_link = p;
1710	return cfqq;
1711}
1712
1713static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1714{
1715	struct rb_node **p, *parent;
1716	struct cfq_queue *__cfqq;
1717
1718	if (cfqq->p_root) {
1719		rb_erase(&cfqq->p_node, cfqq->p_root);
1720		cfqq->p_root = NULL;
1721	}
1722
1723	if (cfq_class_idle(cfqq))
1724		return;
1725	if (!cfqq->next_rq)
1726		return;
1727
1728	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1729	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1730				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1731	if (!__cfqq) {
1732		rb_link_node(&cfqq->p_node, parent, p);
1733		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1734	} else
1735		cfqq->p_root = NULL;
1736}
1737
1738/*
1739 * Update cfqq's position in the service tree.
1740 */
1741static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1742{
1743	/*
1744	 * Resorting requires the cfqq to be on the RR list already.
1745	 */
1746	if (cfq_cfqq_on_rr(cfqq)) {
1747		cfq_service_tree_add(cfqd, cfqq, 0);
1748		cfq_prio_tree_add(cfqd, cfqq);
1749	}
1750}
1751
1752/*
1753 * add to busy list of queues for service, trying to be fair in ordering
1754 * the pending list according to last request service
1755 */
1756static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1757{
1758	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1759	BUG_ON(cfq_cfqq_on_rr(cfqq));
1760	cfq_mark_cfqq_on_rr(cfqq);
1761	cfqd->busy_queues++;
1762	if (cfq_cfqq_sync(cfqq))
1763		cfqd->busy_sync_queues++;
1764
1765	cfq_resort_rr_list(cfqd, cfqq);
1766}
1767
1768/*
1769 * Called when the cfqq no longer has requests pending, remove it from
1770 * the service tree.
1771 */
1772static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1773{
1774	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1775	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1776	cfq_clear_cfqq_on_rr(cfqq);
1777
1778	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1779		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1780		cfqq->service_tree = NULL;
1781	}
1782	if (cfqq->p_root) {
1783		rb_erase(&cfqq->p_node, cfqq->p_root);
1784		cfqq->p_root = NULL;
1785	}
1786
1787	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1788	BUG_ON(!cfqd->busy_queues);
1789	cfqd->busy_queues--;
1790	if (cfq_cfqq_sync(cfqq))
1791		cfqd->busy_sync_queues--;
1792}
1793
1794/*
1795 * rb tree support functions
1796 */
1797static void cfq_del_rq_rb(struct request *rq)
1798{
1799	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1800	const int sync = rq_is_sync(rq);
1801
1802	BUG_ON(!cfqq->queued[sync]);
1803	cfqq->queued[sync]--;
1804
1805	elv_rb_del(&cfqq->sort_list, rq);
1806
1807	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1808		/*
1809		 * Queue will be deleted from service tree when we actually
1810		 * expire it later. Right now just remove it from prio tree
1811		 * as it is empty.
1812		 */
1813		if (cfqq->p_root) {
1814			rb_erase(&cfqq->p_node, cfqq->p_root);
1815			cfqq->p_root = NULL;
1816		}
1817	}
1818}
1819
1820static void cfq_add_rq_rb(struct request *rq)
1821{
1822	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1823	struct cfq_data *cfqd = cfqq->cfqd;
1824	struct request *prev;
1825
1826	cfqq->queued[rq_is_sync(rq)]++;
1827
1828	elv_rb_add(&cfqq->sort_list, rq);
1829
1830	if (!cfq_cfqq_on_rr(cfqq))
1831		cfq_add_cfqq_rr(cfqd, cfqq);
1832
1833	/*
1834	 * check if this request is a better next-serve candidate
1835	 */
1836	prev = cfqq->next_rq;
1837	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1838
1839	/*
1840	 * adjust priority tree position, if ->next_rq changes
1841	 */
1842	if (prev != cfqq->next_rq)
1843		cfq_prio_tree_add(cfqd, cfqq);
1844
1845	BUG_ON(!cfqq->next_rq);
1846}
1847
1848static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1849{
1850	elv_rb_del(&cfqq->sort_list, rq);
1851	cfqq->queued[rq_is_sync(rq)]--;
1852	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
1853	cfq_add_rq_rb(rq);
1854	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
1855				 rq->cmd_flags);
1856}
1857
1858static struct request *
1859cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1860{
1861	struct task_struct *tsk = current;
1862	struct cfq_io_cq *cic;
1863	struct cfq_queue *cfqq;
1864
1865	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1866	if (!cic)
1867		return NULL;
1868
1869	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1870	if (cfqq) {
1871		sector_t sector = bio->bi_sector + bio_sectors(bio);
1872
1873		return elv_rb_find(&cfqq->sort_list, sector);
1874	}
1875
1876	return NULL;
1877}
1878
1879static void cfq_activate_request(struct request_queue *q, struct request *rq)
1880{
1881	struct cfq_data *cfqd = q->elevator->elevator_data;
1882
1883	cfqd->rq_in_driver++;
1884	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1885						cfqd->rq_in_driver);
1886
1887	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1888}
1889
1890static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1891{
1892	struct cfq_data *cfqd = q->elevator->elevator_data;
1893
1894	WARN_ON(!cfqd->rq_in_driver);
1895	cfqd->rq_in_driver--;
1896	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1897						cfqd->rq_in_driver);
1898}
1899
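/*
 * Remove rq from the scheduler entirely: pick a new next_rq if this was
 * it, drop the request from the fifo and the sort tree, and update the
 * queued and prio_pending accounting.
 */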
1900static void cfq_remove_request(struct request *rq)
1901{
1902	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1903
1904	if (cfqq->next_rq == rq)
1905		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1906
1907	list_del_init(&rq->queuelist);
1908	cfq_del_rq_rb(rq);
1909
1910	cfqq->cfqd->rq_queued--;
1911	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
1912	if (rq->cmd_flags & REQ_PRIO) {
1913		WARN_ON(!cfqq->prio_pending);
1914		cfqq->prio_pending--;
1915	}
1916}
1917
1918static int cfq_merge(struct request_queue *q, struct request **req,
1919		     struct bio *bio)
1920{
1921	struct cfq_data *cfqd = q->elevator->elevator_data;
1922	struct request *__rq;
1923
1924	__rq = cfq_find_rq_fmerge(cfqd, bio);
1925	if (__rq && elv_rq_merge_ok(__rq, bio)) {
1926		*req = __rq;
1927		return ELEVATOR_FRONT_MERGE;
1928	}
1929
1930	return ELEVATOR_NO_MERGE;
1931}
1932
1933static void cfq_merged_request(struct request_queue *q, struct request *req,
1934			       int type)
1935{
1936	if (type == ELEVATOR_FRONT_MERGE) {
1937		struct cfq_queue *cfqq = RQ_CFQQ(req);
1938
1939		cfq_reposition_rq_rb(cfqq, req);
1940	}
1941}
1942
1943static void cfq_bio_merged(struct request_queue *q, struct request *req,
1944				struct bio *bio)
1945{
1946	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
1947}
1948
1949static void
1950cfq_merged_requests(struct request_queue *q, struct request *rq,
1951		    struct request *next)
1952{
1953	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1954	struct cfq_data *cfqd = q->elevator->elevator_data;
1955
1956	/*
1957	 * reposition in fifo if next is older than rq
1958	 */
1959	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1960	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1961		list_move(&rq->queuelist, &next->queuelist);
1962		rq_set_fifo_time(rq, rq_fifo_time(next));
1963	}
1964
1965	if (cfqq->next_rq == next)
1966		cfqq->next_rq = rq;
1967	cfq_remove_request(next);
1968	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
1969
1970	cfqq = RQ_CFQQ(next);
1971	/*
1972	 * All requests of this queue were merged into other queues; delete it
1973	 * from the service tree. If it's the active_queue,
1974	 * cfq_dispatch_requests() will choose to expire it or idle
1975	 */
1976	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
1977	    cfqq != cfqd->active_queue)
1978		cfq_del_cfqq_rr(cfqd, cfqq);
1979}
1980
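/*
 * Decide whether bio may be merged into rq: never merge a sync bio into
 * an async request, and otherwise only allow it if the bio would be
 * queued on the same cfqq that rq already belongs to.
 */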
1981static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1982			   struct bio *bio)
1983{
1984	struct cfq_data *cfqd = q->elevator->elevator_data;
1985	struct cfq_io_cq *cic;
1986	struct cfq_queue *cfqq;
1987
1988	/*
1989	 * Disallow merge of a sync bio into an async request.
1990	 */
1991	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1992		return false;
1993
1994	/*
1995	 * Lookup the cfqq that this bio will be queued with and allow
1996	 * merge only if rq is queued there.
1997	 */
1998	cic = cfq_cic_lookup(cfqd, current->io_context);
1999	if (!cic)
2000		return false;
2001
2002	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2003	return cfqq == RQ_CFQQ(rq);
2004}
2005
2006static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2007{
2008	del_timer(&cfqd->idle_slice_timer);
2009	cfqg_stats_update_idle_time(cfqq->cfqg);
2010}
2011
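/*
 * Install cfqq as the active queue, after resetting its per-slice
 * accounting and clearing the state flags left over from its last slice.
 */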
2012static void __cfq_set_active_queue(struct cfq_data *cfqd,
2013				   struct cfq_queue *cfqq)
2014{
2015	if (cfqq) {
2016		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
2017				cfqd->serving_prio, cfqd->serving_type);
2018		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2019		cfqq->slice_start = 0;
2020		cfqq->dispatch_start = jiffies;
2021		cfqq->allocated_slice = 0;
2022		cfqq->slice_end = 0;
2023		cfqq->slice_dispatch = 0;
2024		cfqq->nr_sectors = 0;
2025
2026		cfq_clear_cfqq_wait_request(cfqq);
2027		cfq_clear_cfqq_must_dispatch(cfqq);
2028		cfq_clear_cfqq_must_alloc_slice(cfqq);
2029		cfq_clear_cfqq_fifo_expire(cfqq);
2030		cfq_mark_cfqq_slice_new(cfqq);
2031
2032		cfq_del_timer(cfqd, cfqq);
2033	}
2034
2035	cfqd->active_queue = cfqq;
2036}
2037
2038/*
2039 * current cfqq expired its slice (or was too idle), select new one
2040 */
2041static void
2042__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2043		    bool timed_out)
2044{
2045	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2046
2047	if (cfq_cfqq_wait_request(cfqq))
2048		cfq_del_timer(cfqd, cfqq);
2049
2050	cfq_clear_cfqq_wait_request(cfqq);
2051	cfq_clear_cfqq_wait_busy(cfqq);
2052
2053	/*
2054	 * If this cfqq is shared between multiple processes, check to
2055	 * make sure that those processes are still issuing I/Os within
2056	 * the mean seek distance.  If not, it may be time to break the
2057	 * queues apart again.
2058	 */
2059	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2060		cfq_mark_cfqq_split_coop(cfqq);
2061
2062	/*
2063	 * store what was left of this slice, if the queue idled/timed out
2064	 */
2065	if (timed_out) {
2066		if (cfq_cfqq_slice_new(cfqq))
2067			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2068		else
2069			cfqq->slice_resid = cfqq->slice_end - jiffies;
2070		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
2071	}
2072
2073	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2074
2075	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2076		cfq_del_cfqq_rr(cfqd, cfqq);
2077
2078	cfq_resort_rr_list(cfqd, cfqq);
2079
2080	if (cfqq == cfqd->active_queue)
2081		cfqd->active_queue = NULL;
2082
2083	if (cfqd->active_cic) {
2084		put_io_context(cfqd->active_cic->icq.ioc);
2085		cfqd->active_cic = NULL;
2086	}
2087}
2088
2089static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2090{
2091	struct cfq_queue *cfqq = cfqd->active_queue;
2092
2093	if (cfqq)
2094		__cfq_slice_expired(cfqd, cfqq, timed_out);
2095}
2096
2097/*
2098 * Get next queue for service. Unless we have a queue preemption,
2099 * we'll simply select the first cfqq in the service tree.
2100 */
2101static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2102{
2103	struct cfq_rb_root *service_tree =
2104		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
2105					cfqd->serving_type);
2106
2107	if (!cfqd->rq_queued)
2108		return NULL;
2109
2110	/* There is nothing to dispatch */
2111	if (!service_tree)
2112		return NULL;
2113	if (RB_EMPTY_ROOT(&service_tree->rb))
2114		return NULL;
2115	return cfq_rb_first(service_tree);
2116}
2117
2118static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2119{
2120	struct cfq_group *cfqg;
2121	struct cfq_queue *cfqq;
2122	int i, j;
2123	struct cfq_rb_root *st;
2124
2125	if (!cfqd->rq_queued)
2126		return NULL;
2127
2128	cfqg = cfq_get_next_cfqg(cfqd);
2129	if (!cfqg)
2130		return NULL;
2131
2132	for_each_cfqg_st(cfqg, i, j, st)
2133		if ((cfqq = cfq_rb_first(st)) != NULL)
2134			return cfqq;
2135	return NULL;
2136}
2137
2138/*
2139 * Get and set a new active queue for service.
2140 */
2141static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2142					      struct cfq_queue *cfqq)
2143{
2144	if (!cfqq)
2145		cfqq = cfq_get_next_queue(cfqd);
2146
2147	__cfq_set_active_queue(cfqd, cfqq);
2148	return cfqq;
2149}
2150
2151static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2152					  struct request *rq)
2153{
2154	if (blk_rq_pos(rq) >= cfqd->last_position)
2155		return blk_rq_pos(rq) - cfqd->last_position;
2156	else
2157		return cfqd->last_position - blk_rq_pos(rq);
2158}
2159
2160static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2161			       struct request *rq)
2162{
2163	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2164}
2165
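/*
 * Search the priority tree for a queue whose next request lies close to
 * cfqd->last_position. Returns NULL if no such queue exists.
 */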
2166static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2167				    struct cfq_queue *cur_cfqq)
2168{
2169	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2170	struct rb_node *parent, *node;
2171	struct cfq_queue *__cfqq;
2172	sector_t sector = cfqd->last_position;
2173
2174	if (RB_EMPTY_ROOT(root))
2175		return NULL;
2176
2177	/*
2178	 * First, if we find a request starting at the end of the last
2179	 * request, choose it.
2180	 */
2181	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2182	if (__cfqq)
2183		return __cfqq;
2184
2185	/*
2186	 * If the exact sector wasn't found, the parent of the NULL leaf
2187	 * will contain the closest sector.
2188	 */
2189	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
2190	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2191		return __cfqq;
2192
2193	if (blk_rq_pos(__cfqq->next_rq) < sector)
2194		node = rb_next(&__cfqq->p_node);
2195	else
2196		node = rb_prev(&__cfqq->p_node);
2197	if (!node)
2198		return NULL;
2199
2200	__cfqq = rb_entry(node, struct cfq_queue, p_node);
2201	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2202		return __cfqq;
2203
2204	return NULL;
2205}
2206
2207/*
2208 * cfqd - obvious
2209 * cur_cfqq - passed in so that we don't decide that the current queue is
2210 * 	      closely cooperating with itself.
2211 *
2212 * So, basically we're assuming that cur_cfqq has dispatched at least
2213 * one request, and that cfqd->last_position reflects a position on the disk
2214 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2215 * assumption.
2216 */
2217static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2218					      struct cfq_queue *cur_cfqq)
2219{
2220	struct cfq_queue *cfqq;
2221
2222	if (cfq_class_idle(cur_cfqq))
2223		return NULL;
2224	if (!cfq_cfqq_sync(cur_cfqq))
2225		return NULL;
2226	if (CFQQ_SEEKY(cur_cfqq))
2227		return NULL;
2228
2229	/*
2230	 * Don't search priority tree if it's the only queue in the group.
2231	 */
2232	if (cur_cfqq->cfqg->nr_cfqq == 1)
2233		return NULL;
2234
2235	/*
2236	 * We should notice if some of the queues are cooperating, e.g.,
2237	 * working closely on the same area of the disk. In that case,
2238	 * we can group them together and not waste time idling.
2239	 */
2240	cfqq = cfqq_close(cfqd, cur_cfqq);
2241	if (!cfqq)
2242		return NULL;
2243
2244	/* If new queue belongs to different cfq_group, don't choose it */
2245	if (cur_cfqq->cfqg != cfqq->cfqg)
2246		return NULL;
2247
2248	/*
2249	 * It only makes sense to merge sync queues.
2250	 */
2251	if (!cfq_cfqq_sync(cfqq))
2252		return NULL;
2253	if (CFQQ_SEEKY(cfqq))
2254		return NULL;
2255
2256	/*
2257	 * Do not merge queues of different priority classes
2258	 */
2259	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2260		return NULL;
2261
2262	return cfqq;
2263}
2264
2265/*
2266 * Determine whether we should enforce idle window for this queue.
2267 */
2268
2269static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2270{
2271	enum wl_prio_t prio = cfqq_prio(cfqq);
2272	struct cfq_rb_root *service_tree = cfqq->service_tree;
2273
2274	BUG_ON(!service_tree);
2275	BUG_ON(!service_tree->count);
2276
2277	if (!cfqd->cfq_slice_idle)
2278		return false;
2279
2280	/* We never do for idle class queues. */
2281	if (prio == IDLE_WORKLOAD)
2282		return false;
2283
2284	/* We do for queues that were marked with idle window flag. */
2285	if (cfq_cfqq_idle_window(cfqq) &&
2286	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2287		return true;
2288
2289	/*
2290	 * Otherwise, we do only if they are the last ones
2291	 * in their service tree.
2292	 */
2293	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
2294	   !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
2295		return true;
2296	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
2297			service_tree->count);
2298	return false;
2299}
2300
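/*
 * Arm the idle slice timer so we briefly wait for another request from
 * the active queue (or, if queue idling is disabled, from its group)
 * instead of dispatching elsewhere and seeking away. Idling is skipped
 * on non-rotational devices that support queuing.
 */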
2301static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2302{
2303	struct cfq_queue *cfqq = cfqd->active_queue;
2304	struct cfq_io_cq *cic;
2305	unsigned long sl, group_idle = 0;
2306
2307	/*
2308	 * SSD device without seek penalty, disable idling. But only do so
2309	 * for devices that support queuing, otherwise we still have a problem
2310	 * with sync vs async workloads.
2311	 */
2312	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2313		return;
2314
2315	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2316	WARN_ON(cfq_cfqq_slice_new(cfqq));
2317
2318	/*
2319	 * idle is disabled, either manually or by past process history
2320	 */
2321	if (!cfq_should_idle(cfqd, cfqq)) {
2322		/* no queue idling. Check for group idling */
2323		if (cfqd->cfq_group_idle)
2324			group_idle = cfqd->cfq_group_idle;
2325		else
2326			return;
2327	}
2328
2329	/*
2330	 * still active requests from this queue, don't idle
2331	 */
2332	if (cfqq->dispatched)
2333		return;
2334
2335	/*
2336	 * task has exited, don't wait
2337	 */
2338	cic = cfqd->active_cic;
2339	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2340		return;
2341
2342	/*
2343	 * If our average think time is larger than the remaining time
2344	 * slice, then don't idle. This avoids overrunning the allotted
2345	 * time slice.
2346	 */
2347	if (sample_valid(cic->ttime.ttime_samples) &&
2348	    (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2349		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2350			     cic->ttime.ttime_mean);
2351		return;
2352	}
2353
2354	/* There are other queues in the group, don't do group idle */
2355	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2356		return;
2357
2358	cfq_mark_cfqq_wait_request(cfqq);
2359
2360	if (group_idle)
2361		sl = cfqd->cfq_group_idle;
2362	else
2363		sl = cfqd->cfq_slice_idle;
2364
2365	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2366	cfqg_stats_set_start_idle_time(cfqq->cfqg);
2367	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2368			group_idle ? 1 : 0);
2369}
2370
2371/*
2372 * Move request from internal lists to the request queue dispatch list.
2373 */
2374static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2375{
2376	struct cfq_data *cfqd = q->elevator->elevator_data;
2377	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2378
2379	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2380
2381	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2382	cfq_remove_request(rq);
2383	cfqq->dispatched++;
2384	(RQ_CFQG(rq))->dispatched++;
2385	elv_dispatch_sort(q, rq);
2386
2387	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2388	cfqq->nr_sectors += blk_rq_sectors(rq);
2389	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2390}
2391
2392/*
2393 * return expired entry, or NULL to just start from scratch in rbtree
2394 */
2395static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2396{
2397	struct request *rq = NULL;
2398
2399	if (cfq_cfqq_fifo_expire(cfqq))
2400		return NULL;
2401
2402	cfq_mark_cfqq_fifo_expire(cfqq);
2403
2404	if (list_empty(&cfqq->fifo))
2405		return NULL;
2406
2407	rq = rq_entry_fifo(cfqq->fifo.next);
2408	if (time_before(jiffies, rq_fifo_time(rq)))
2409		rq = NULL;
2410
2411	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2412	return rq;
2413}
2414
2415static inline int
2416cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2417{
2418	const int base_rq = cfqd->cfq_slice_async_rq;
2419
2420	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2421
2422	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2423}
2424
2425/*
2426 * Must be called with the queue_lock held.
2427 */
2428static int cfqq_process_refs(struct cfq_queue *cfqq)
2429{
2430	int process_refs, io_refs;
2431
2432	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2433	process_refs = cfqq->ref - io_refs;
2434	BUG_ON(process_refs < 0);
2435	return process_refs;
2436}
2437
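/*
 * Set up a future merge between cfqq and new_cfqq, in the direction of
 * the queue with more process references. The actual merge happens
 * later, in process context, via cfq_merge_cfqqs().
 */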
2438static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2439{
2440	int process_refs, new_process_refs;
2441	struct cfq_queue *__cfqq;
2442
2443	/*
2444	 * If there are no process references on the new_cfqq, then it is
2445	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2446	 * chain may have dropped their last reference (not just their
2447	 * last process reference).
2448	 */
2449	if (!cfqq_process_refs(new_cfqq))
2450		return;
2451
2452	/* Avoid a circular list and skip interim queue merges */
2453	while ((__cfqq = new_cfqq->new_cfqq)) {
2454		if (__cfqq == cfqq)
2455			return;
2456		new_cfqq = __cfqq;
2457	}
2458
2459	process_refs = cfqq_process_refs(cfqq);
2460	new_process_refs = cfqq_process_refs(new_cfqq);
2461	/*
2462	 * If the process for the cfqq has gone away, there is no
2463	 * sense in merging the queues.
2464	 */
2465	if (process_refs == 0 || new_process_refs == 0)
2466		return;
2467
2468	/*
2469	 * Merge in the direction of the lesser amount of work.
2470	 */
2471	if (new_process_refs >= process_refs) {
2472		cfqq->new_cfqq = new_cfqq;
2473		new_cfqq->ref += process_refs;
2474	} else {
2475		new_cfqq->new_cfqq = cfqq;
2476		cfqq->ref += new_process_refs;
2477	}
2478}
2479
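/*
 * Pick the workload type (ASYNC, SYNC_NOIDLE or SYNC) to serve next for
 * the given priority class: the service tree whose first queue has the
 * lowest rb_key, i.e. the one that has been waiting longest.
 */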
2480static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2481				struct cfq_group *cfqg, enum wl_prio_t prio)
2482{
2483	struct cfq_queue *queue;
2484	int i;
2485	bool key_valid = false;
2486	unsigned long lowest_key = 0;
2487	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2488
2489	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2490		/* select the one with lowest rb_key */
2491		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2492		if (queue &&
2493		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
2494			lowest_key = queue->rb_key;
2495			cur_best = i;
2496			key_valid = true;
2497		}
2498	}
2499
2500	return cur_best;
2501}
2502
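/*
 * Select the priority class (RT > BE > IDLE) and workload type to serve
 * from this group next, and compute how long that workload may run
 * before it expires.
 */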
2503static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2504{
2505	unsigned slice;
2506	unsigned count;
2507	struct cfq_rb_root *st;
2508	unsigned group_slice;
2509	enum wl_prio_t original_prio = cfqd->serving_prio;
2510
2511	/* Choose next priority. RT > BE > IDLE */
2512	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2513		cfqd->serving_prio = RT_WORKLOAD;
2514	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2515		cfqd->serving_prio = BE_WORKLOAD;
2516	else {
2517		cfqd->serving_prio = IDLE_WORKLOAD;
2518		cfqd->workload_expires = jiffies + 1;
2519		return;
2520	}
2521
2522	if (original_prio != cfqd->serving_prio)
2523		goto new_workload;
2524
2525	/*
2526	 * For RT and BE, we have to choose also the type
2527	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2528	 * expiration time
2529	 */
2530	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2531	count = st->count;
2532
2533	/*
2534	 * check workload expiration, and that we still have other queues ready
2535	 */
2536	if (count && !time_after(jiffies, cfqd->workload_expires))
2537		return;
2538
2539new_workload:
2540	/* otherwise select new workload type */
2541	cfqd->serving_type =
2542		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2543	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2544	count = st->count;
2545
2546	/*
2547	 * the workload slice is computed as a fraction of target latency
2548	 * proportional to the number of queues in that workload, over
2549	 * all the queues in the same priority class
2550	 */
2551	group_slice = cfq_group_slice(cfqd, cfqg);
2552
2553	slice = group_slice * count /
2554		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2555		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2556
2557	if (cfqd->serving_type == ASYNC_WORKLOAD) {
2558		unsigned int tmp;
2559
2560		/*
2561		 * Async queues are currently system wide. Just taking the
2562		 * proportion of queues within the same group will lead to a higher
2563		 * async ratio system wide, as the root group generally has a
2564		 * higher weight. A more accurate approach would be to calculate
2565		 * the system wide async/sync ratio.
2566		 */
2567		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2568		tmp = tmp/cfqd->busy_queues;
2569		slice = min_t(unsigned, slice, tmp);
2570
2571		/* async workload slice is scaled down according to
2572		 * the sync/async slice ratio. */
2573		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2574	} else
2575		/* sync workload slice is at least 2 * cfq_slice_idle */
2576		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2577
2578	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2579	cfq_log(cfqd, "workload slice:%d", slice);
2580	cfqd->workload_expires = jiffies + slice;
2581}
2582
2583static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2584{
2585	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2586	struct cfq_group *cfqg;
2587
2588	if (RB_EMPTY_ROOT(&st->rb))
2589		return NULL;
2590	cfqg = cfq_rb_first_group(st);
2591	update_min_vdisktime(st);
2592	return cfqg;
2593}
2594
2595static void cfq_choose_cfqg(struct cfq_data *cfqd)
2596{
2597	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2598
2599	cfqd->serving_group = cfqg;
2600
2601	/* Restore the workload type data */
2602	if (cfqg->saved_workload_slice) {
2603		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2604		cfqd->serving_type = cfqg->saved_workload;
2605		cfqd->serving_prio = cfqg->saved_serving_prio;
2606	} else
2607		cfqd->workload_expires = jiffies - 1;
2608
2609	choose_service_tree(cfqd, cfqg);
2610}
2611
2612/*
2613 * Select a queue for service. If we have a current active queue,
2614 * check whether to continue servicing it, or retrieve and set a new one.
2615 */
2616static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2617{
2618	struct cfq_queue *cfqq, *new_cfqq = NULL;
2619
2620	cfqq = cfqd->active_queue;
2621	if (!cfqq)
2622		goto new_queue;
2623
2624	if (!cfqd->rq_queued)
2625		return NULL;
2626
2627	/*
2628	 * We were waiting for group to get backlogged. Expire the queue
2629	 */
2630	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2631		goto expire;
2632
2633	/*
2634	 * The active queue has run out of time, expire it and select new.
2635	 */
2636	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2637		/*
2638		 * If slice had not expired at the completion of last request
2639		 * we might not have turned on wait_busy flag. Don't expire
2640		 * the queue yet. Allow the group to get backlogged.
2641		 *
2642		 * The very fact that we have used the slice, that means we
2643		 * have been idling all along on this queue and it should be
2644		 * ok to wait for this request to complete.
2645		 */
2646		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2647		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2648			cfqq = NULL;
2649			goto keep_queue;
2650		} else
2651			goto check_group_idle;
2652	}
2653
2654	/*
2655	 * The active queue has requests and isn't expired, allow it to
2656	 * dispatch.
2657	 */
2658	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2659		goto keep_queue;
2660
2661	/*
2662	 * If another queue has a request waiting within our mean seek
2663	 * distance, let it run.  The expire code will check for close
2664	 * cooperators and put the close queue at the front of the service
2665	 * tree.  If possible, merge the expiring queue with the new cfqq.
2666	 */
2667	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2668	if (new_cfqq) {
2669		if (!cfqq->new_cfqq)
2670			cfq_setup_merge(cfqq, new_cfqq);
2671		goto expire;
2672	}
2673
2674	/*
2675	 * No requests pending. If the active queue still has requests in
2676	 * flight or is idling for a new request, allow either of these
2677	 * conditions to happen (or time out) before selecting a new queue.
2678	 */
2679	if (timer_pending(&cfqd->idle_slice_timer)) {
2680		cfqq = NULL;
2681		goto keep_queue;
2682	}
2683
2684	/*
2685	 * This is a deep seek queue, but the device is much faster than
2686	 * the queue can deliver, don't idle
2687	 */
2688	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2689	    (cfq_cfqq_slice_new(cfqq) ||
2690	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2691		cfq_clear_cfqq_deep(cfqq);
2692		cfq_clear_cfqq_idle_window(cfqq);
2693	}
2694
2695	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2696		cfqq = NULL;
2697		goto keep_queue;
2698	}
2699
2700	/*
2701	 * If group idle is enabled and there are requests dispatched from
2702	 * this group, wait for requests to complete.
2703	 */
2704check_group_idle:
2705	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
2706	    cfqq->cfqg->dispatched &&
2707	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
2708		cfqq = NULL;
2709		goto keep_queue;
2710	}
2711
2712expire:
2713	cfq_slice_expired(cfqd, 0);
2714new_queue:
2715	/*
2716	 * Current queue expired. Check if we have to switch to a new
2717	 * service tree
2718	 */
2719	if (!new_cfqq)
2720		cfq_choose_cfqg(cfqd);
2721
2722	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2723keep_queue:
2724	return cfqq;
2725}
2726
2727static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2728{
2729	int dispatched = 0;
2730
2731	while (cfqq->next_rq) {
2732		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2733		dispatched++;
2734	}
2735
2736	BUG_ON(!list_empty(&cfqq->fifo));
2737
2738	/* By default cfqq is not expired if it is empty. Do it explicitly */
2739	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2740	return dispatched;
2741}
2742
2743/*
2744 * Drain our current requests. Used for barriers and when switching
2745 * io schedulers on-the-fly.
2746 */
2747static int cfq_forced_dispatch(struct cfq_data *cfqd)
2748{
2749	struct cfq_queue *cfqq;
2750	int dispatched = 0;
2751
2752	/* Expire the timeslice of the current active queue first */
2753	cfq_slice_expired(cfqd, 0);
2754	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2755		__cfq_set_active_queue(cfqd, cfqq);
2756		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2757	}
2758
2759	BUG_ON(cfqd->busy_queues);
2760
2761	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2762	return dispatched;
2763}
2764
2765static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2766	struct cfq_queue *cfqq)
2767{
2768	/* the queue hasn't finished any request, can't estimate */
2769	if (cfq_cfqq_slice_new(cfqq))
2770		return true;
2771	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2772		cfqq->slice_end))
2773		return true;
2774
2775	return false;
2776}
2777
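/*
 * Decide whether cfqq may dispatch another request right now, based on
 * sync/async draining rules and the per-queue dispatch depth limits.
 */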
2778static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2779{
2780	unsigned int max_dispatch;
2781
2782	/*
2783	 * Drain async requests before we start sync IO
2784	 */
2785	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2786		return false;
2787
2788	/*
2789	 * If this is an async queue and we have sync IO in flight, let it wait
2790	 */
2791	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2792		return false;
2793
2794	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2795	if (cfq_class_idle(cfqq))
2796		max_dispatch = 1;
2797
2798	/*
2799	 * Does this cfqq already have too much IO in flight?
2800	 */
2801	if (cfqq->dispatched >= max_dispatch) {
2802		bool promote_sync = false;
2803		/*
2804		 * idle queue must always only have a single IO in flight
2805		 */
2806		if (cfq_class_idle(cfqq))
2807			return false;
2808
2809		/*
2810		 * If there is only one sync queue, we can ignore the async
2811		 * queues here and give the sync queue no dispatch limit,
2812		 * since a sync queue can preempt an async queue anyway;
2813		 * limiting the sync queue doesn't make sense. This is
2814		 * useful for the aiostress test.
2815		 */
2816		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2817			promote_sync = true;
2818
2819		/*
2820		 * We have other queues, don't allow more IO from this one
2821		 */
2822		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2823				!promote_sync)
2824			return false;
2825
2826		/*
2827		 * Sole queue user, no limit
2828		 */
2829		if (cfqd->busy_queues == 1 || promote_sync)
2830			max_dispatch = -1;
2831		else
2832			/*
2833			 * Normally we start throttling cfqq when cfq_quantum/2
2834			 * requests have been dispatched. But we can drive
2835			 * deeper queue depths at the beginning of slice
2836			 * subject to the upper limit of cfq_quantum.
2837			 */
2838			max_dispatch = cfqd->cfq_quantum;
2839	}
2840
2841	/*
2842	 * Async queues must wait a bit before being allowed dispatch.
2843	 * We also ramp up the dispatch depth gradually for async IO,
2844	 * based on the last sync IO we serviced
2845	 */
2846	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2847		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2848		unsigned int depth;
2849
2850		depth = last_sync / cfqd->cfq_slice[1];
2851		if (!depth && !cfqq->dispatched)
2852			depth = 1;
2853		if (depth < max_dispatch)
2854			max_dispatch = depth;
2855	}
2856
2857	/*
2858	 * If we're below the current max, allow a dispatch
2859	 */
2860	return cfqq->dispatched < max_dispatch;
2861}
2862
2863/*
2864 * Dispatch a request from cfqq, moving it to the request queue
2865 * dispatch list.
2866 */
2867static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2868{
2869	struct request *rq;
2870
2871	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2872
2873	if (!cfq_may_dispatch(cfqd, cfqq))
2874		return false;
2875
2876	/*
2877	 * follow expired path, else get first next available
2878	 */
2879	rq = cfq_check_fifo(cfqq);
2880	if (!rq)
2881		rq = cfqq->next_rq;
2882
2883	/*
2884	 * insert request into driver dispatch list
2885	 */
2886	cfq_dispatch_insert(cfqd->queue, rq);
2887
2888	if (!cfqd->active_cic) {
2889		struct cfq_io_cq *cic = RQ_CIC(rq);
2890
2891		atomic_long_inc(&cic->icq.ioc->refcount);
2892		cfqd->active_cic = cic;
2893	}
2894
2895	return true;
2896}
2897
2898/*
2899 * Find the cfqq that we need to service and move a request from that to the
2900 * dispatch list
2901 */
2902static int cfq_dispatch_requests(struct request_queue *q, int force)
2903{
2904	struct cfq_data *cfqd = q->elevator->elevator_data;
2905	struct cfq_queue *cfqq;
2906
2907	if (!cfqd->busy_queues)
2908		return 0;
2909
2910	if (unlikely(force))
2911		return cfq_forced_dispatch(cfqd);
2912
2913	cfqq = cfq_select_queue(cfqd);
2914	if (!cfqq)
2915		return 0;
2916
2917	/*
2918	 * Dispatch a request from this cfqq, if it is allowed
2919	 */
2920	if (!cfq_dispatch_request(cfqd, cfqq))
2921		return 0;
2922
2923	cfqq->slice_dispatch++;
2924	cfq_clear_cfqq_must_dispatch(cfqq);
2925
2926	/*
2927	 * expire an async queue immediately if it has used up its slice. idle
2928	 * queues always expire after 1 dispatch round.
2929	 */
2930	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2931	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2932	    cfq_class_idle(cfqq))) {
2933		cfqq->slice_end = jiffies + 1;
2934		cfq_slice_expired(cfqd, 0);
2935	}
2936
2937	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2938	return 1;
2939}
2940
2941/*
2942 * task holds one reference to the queue, dropped when task exits. each rq
2943 * in-flight on this queue also holds a reference, dropped when rq is freed.
2944 *
2945 * Each cfq queue took a reference on the parent group. Drop it now.
2946 * queue lock must be held here.
2947 */
2948static void cfq_put_queue(struct cfq_queue *cfqq)
2949{
2950	struct cfq_data *cfqd = cfqq->cfqd;
2951	struct cfq_group *cfqg;
2952
2953	BUG_ON(cfqq->ref <= 0);
2954
2955	cfqq->ref--;
2956	if (cfqq->ref)
2957		return;
2958
2959	cfq_log_cfqq(cfqd, cfqq, "put_queue");
2960	BUG_ON(rb_first(&cfqq->sort_list));
2961	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2962	cfqg = cfqq->cfqg;
2963
2964	if (unlikely(cfqd->active_queue == cfqq)) {
2965		__cfq_slice_expired(cfqd, cfqq, 0);
2966		cfq_schedule_dispatch(cfqd);
2967	}
2968
2969	BUG_ON(cfq_cfqq_on_rr(cfqq));
2970	kmem_cache_free(cfq_pool, cfqq);
2971	cfqg_put(cfqg);
2972}
2973
2974static void cfq_put_cooperator(struct cfq_queue *cfqq)
2975{
2976	struct cfq_queue *__cfqq, *next;
2977
2978	/*
2979	 * If this queue was scheduled to merge with another queue, be
2980	 * sure to drop the reference taken on that queue (and others in
2981	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2982	 */
2983	__cfqq = cfqq->new_cfqq;
2984	while (__cfqq) {
2985		if (__cfqq == cfqq) {
2986			WARN(1, "cfqq->new_cfqq loop detected\n");
2987			break;
2988		}
2989		next = __cfqq->new_cfqq;
2990		cfq_put_queue(__cfqq);
2991		__cfqq = next;
2992	}
2993}
2994
2995static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2996{
2997	if (unlikely(cfqq == cfqd->active_queue)) {
2998		__cfq_slice_expired(cfqd, cfqq, 0);
2999		cfq_schedule_dispatch(cfqd);
3000	}
3001
3002	cfq_put_cooperator(cfqq);
3003
3004	cfq_put_queue(cfqq);
3005}
3006
3007static void cfq_init_icq(struct io_cq *icq)
3008{
3009	struct cfq_io_cq *cic = icq_to_cic(icq);
3010
3011	cic->ttime.last_end_request = jiffies;
3012}
3013
3014static void cfq_exit_icq(struct io_cq *icq)
3015{
3016	struct cfq_io_cq *cic = icq_to_cic(icq);
3017	struct cfq_data *cfqd = cic_to_cfqd(cic);
3018
3019	if (cic->cfqq[BLK_RW_ASYNC]) {
3020		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
3021		cic->cfqq[BLK_RW_ASYNC] = NULL;
3022	}
3023
3024	if (cic->cfqq[BLK_RW_SYNC]) {
3025		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
3026		cic->cfqq[BLK_RW_SYNC] = NULL;
3027	}
3028}
3029
3030static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3031{
3032	struct task_struct *tsk = current;
3033	int ioprio_class;
3034
3035	if (!cfq_cfqq_prio_changed(cfqq))
3036		return;
3037
3038	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3039	switch (ioprio_class) {
3040	default:
3041		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3042	case IOPRIO_CLASS_NONE:
3043		/*
3044		 * no prio set, inherit CPU scheduling settings
3045		 */
3046		cfqq->ioprio = task_nice_ioprio(tsk);
3047		cfqq->ioprio_class = task_nice_ioclass(tsk);
3048		break;
3049	case IOPRIO_CLASS_RT:
3050		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3051		cfqq->ioprio_class = IOPRIO_CLASS_RT;
3052		break;
3053	case IOPRIO_CLASS_BE:
3054		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3055		cfqq->ioprio_class = IOPRIO_CLASS_BE;
3056		break;
3057	case IOPRIO_CLASS_IDLE:
3058		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3059		cfqq->ioprio = 7;
3060		cfq_clear_cfqq_idle_window(cfqq);
3061		break;
3062	}
3063
3064	/*
3065	 * keep track of original prio settings in case we have to temporarily
3066	 * elevate the priority of this queue
3067	 */
3068	cfqq->org_ioprio = cfqq->ioprio;
3069	cfq_clear_cfqq_prio_changed(cfqq);
3070}
3071
3072static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3073{
3074	int ioprio = cic->icq.ioc->ioprio;
3075	struct cfq_data *cfqd = cic_to_cfqd(cic);
3076	struct cfq_queue *cfqq;
3077
3078	/*
3079	 * Check whether ioprio has changed.  The condition may trigger
3080	 * spuriously on a newly created cic but there's no harm.
3081	 */
3082	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3083		return;
3084
3085	cfqq = cic->cfqq[BLK_RW_ASYNC];
3086	if (cfqq) {
3087		struct cfq_queue *new_cfqq;
3088		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
3089					 GFP_ATOMIC);
3090		if (new_cfqq) {
3091			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3092			cfq_put_queue(cfqq);
3093		}
3094	}
3095
3096	cfqq = cic->cfqq[BLK_RW_SYNC];
3097	if (cfqq)
3098		cfq_mark_cfqq_prio_changed(cfqq);
3099
3100	cic->ioprio = ioprio;
3101}
3102
3103static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3104			  pid_t pid, bool is_sync)
3105{
3106	RB_CLEAR_NODE(&cfqq->rb_node);
3107	RB_CLEAR_NODE(&cfqq->p_node);
3108	INIT_LIST_HEAD(&cfqq->fifo);
3109
3110	cfqq->ref = 0;
3111	cfqq->cfqd = cfqd;
3112
3113	cfq_mark_cfqq_prio_changed(cfqq);
3114
3115	if (is_sync) {
3116		if (!cfq_class_idle(cfqq))
3117			cfq_mark_cfqq_idle_window(cfqq);
3118		cfq_mark_cfqq_sync(cfqq);
3119	}
3120	cfqq->pid = pid;
3121}
3122
3123#ifdef CONFIG_CFQ_GROUP_IOSCHED
3124static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3125{
3126	struct cfq_data *cfqd = cic_to_cfqd(cic);
3127	struct cfq_queue *sync_cfqq;
3128	uint64_t id;
3129
3130	rcu_read_lock();
3131	id = bio_blkio_cgroup(bio)->id;
3132	rcu_read_unlock();
3133
3134	/*
3135	 * Check whether blkcg has changed.  The condition may trigger
3136	 * spuriously on a newly created cic but there's no harm.
3137	 */
3138	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3139		return;
3140
3141	sync_cfqq = cic_to_cfqq(cic, 1);
3142	if (sync_cfqq) {
3143		/*
3144		 * Drop reference to sync queue. A new sync queue will be
3145		 * assigned in new group upon arrival of a fresh request.
3146		 */
3147		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
3148		cic_set_cfqq(cic, NULL, 1);
3149		cfq_put_queue(sync_cfqq);
3150	}
3151
3152	cic->blkcg_id = id;
3153}
3154#else
3155static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3156#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3157
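/*
 * Look up the cfqq for this io context, allocating a new one if there
 * is none (or if we previously fell back to the oom_cfqq). The queue
 * lock may be dropped for a blocking allocation; if allocation fails
 * we fall back to the embedded oom_cfqq.
 */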
3158static struct cfq_queue *
3159cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3160		     struct bio *bio, gfp_t gfp_mask)
3161{
3162	struct blkio_cgroup *blkcg;
3163	struct cfq_queue *cfqq, *new_cfqq = NULL;
3164	struct cfq_group *cfqg;
3165
3166retry:
3167	rcu_read_lock();
3168
3169	blkcg = bio_blkio_cgroup(bio);
3170	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3171	cfqq = cic_to_cfqq(cic, is_sync);
3172
3173	/*
3174	 * Always try a new alloc if we fell back to the OOM cfqq
3175	 * originally, since it should just be a temporary situation.
3176	 */
3177	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3178		cfqq = NULL;
3179		if (new_cfqq) {
3180			cfqq = new_cfqq;
3181			new_cfqq = NULL;
3182		} else if (gfp_mask & __GFP_WAIT) {
3183			rcu_read_unlock();
3184			spin_unlock_irq(cfqd->queue->queue_lock);
3185			new_cfqq = kmem_cache_alloc_node(cfq_pool,
3186					gfp_mask | __GFP_ZERO,
3187					cfqd->queue->node);
3188			spin_lock_irq(cfqd->queue->queue_lock);
3189			if (new_cfqq)
3190				goto retry;
3191		} else {
3192			cfqq = kmem_cache_alloc_node(cfq_pool,
3193					gfp_mask | __GFP_ZERO,
3194					cfqd->queue->node);
3195		}
3196
3197		if (cfqq) {
3198			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3199			cfq_init_prio_data(cfqq, cic);
3200			cfq_link_cfqq_cfqg(cfqq, cfqg);
3201			cfq_log_cfqq(cfqd, cfqq, "alloced");
3202		} else
3203			cfqq = &cfqd->oom_cfqq;
3204	}
3205
3206	if (new_cfqq)
3207		kmem_cache_free(cfq_pool, new_cfqq);
3208
3209	rcu_read_unlock();
3210	return cfqq;
3211}
3212
3213static struct cfq_queue **
3214cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3215{
3216	switch (ioprio_class) {
3217	case IOPRIO_CLASS_RT:
3218		return &cfqd->async_cfqq[0][ioprio];
3219	case IOPRIO_CLASS_NONE:
3220		ioprio = IOPRIO_NORM;
3221		/* fall through */
3222	case IOPRIO_CLASS_BE:
3223		return &cfqd->async_cfqq[1][ioprio];
3224	case IOPRIO_CLASS_IDLE:
3225		return &cfqd->async_idle_cfqq;
3226	default:
3227		BUG();
3228	}
3229}
3230
3231static struct cfq_queue *
3232cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3233	      struct bio *bio, gfp_t gfp_mask)
3234{
3235	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3236	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3237	struct cfq_queue **async_cfqq = NULL;
3238	struct cfq_queue *cfqq = NULL;
3239
3240	if (!is_sync) {
3241		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
3242		cfqq = *async_cfqq;
3243	}
3244
3245	if (!cfqq)
3246		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
3247
3248	/*
3249	 * pin the queue now that it's allocated, scheduler exit will prune it
3250	 */
3251	if (!is_sync && !(*async_cfqq)) {
3252		cfqq->ref++;
3253		*async_cfqq = cfqq;
3254	}
3255
3256	cfqq->ref++;
3257	return cfqq;
3258}
3259
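/*
 * Update an exponentially weighted average of the "think time", i.e.
 * the delay between the completion of one request and the arrival of
 * the next, capped at twice the configured idle slice.
 */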
3260static void
3261__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3262{
3263	unsigned long elapsed = jiffies - ttime->last_end_request;
3264	elapsed = min(elapsed, 2UL * slice_idle);
3265
3266	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3267	ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
3268	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
3269}
3270
3271static void
3272cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3273			struct cfq_io_cq *cic)
3274{
3275	if (cfq_cfqq_sync(cfqq)) {
3276		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3277		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3278			cfqd->cfq_slice_idle);
3279	}
3280#ifdef CONFIG_CFQ_GROUP_IOSCHED
3281	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3282#endif
3283}
3284
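/*
 * Track how seeky this queue is: record a hit in seek_history whenever
 * the new request is far from the previous one (or, on non-rotational
 * devices, whenever the request is small).
 */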
3285static void
3286cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3287		       struct request *rq)
3288{
3289	sector_t sdist = 0;
3290	sector_t n_sec = blk_rq_sectors(rq);
3291	if (cfqq->last_request_pos) {
3292		if (cfqq->last_request_pos < blk_rq_pos(rq))
3293			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3294		else
3295			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3296	}
3297
3298	cfqq->seek_history <<= 1;
3299	if (blk_queue_nonrot(cfqd->queue))
3300		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3301	else
3302		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3303}
3304
3305/*
3306 * Disable idle window if the process thinks too long or seeks so much that
3307 * it doesn't matter
3308 */
3309static void
3310cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3311		       struct cfq_io_cq *cic)
3312{
3313	int old_idle, enable_idle;
3314
3315	/*
3316	 * Don't idle for async or idle io prio class
3317	 */
3318	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3319		return;
3320
3321	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3322
3323	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3324		cfq_mark_cfqq_deep(cfqq);
3325
3326	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3327		enable_idle = 0;
3328	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3329		 !cfqd->cfq_slice_idle ||
3330		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3331		enable_idle = 0;
3332	else if (sample_valid(cic->ttime.ttime_samples)) {
3333		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3334			enable_idle = 0;
3335		else
3336			enable_idle = 1;
3337	}
3338
3339	if (old_idle != enable_idle) {
3340		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3341		if (enable_idle)
3342			cfq_mark_cfqq_idle_window(cfqq);
3343		else
3344			cfq_clear_cfqq_idle_window(cfqq);
3345	}
3346}
3347
3348/*
3349 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3350 * Check if new_cfqq should preempt the currently active queue. Return false
3351 * if not or if we aren't sure; returning true causes a preempt.
3352static bool
3353cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3354		   struct request *rq)
3355{
3356	struct cfq_queue *cfqq;
3357
3358	cfqq = cfqd->active_queue;
3359	if (!cfqq)
3360		return false;
3361
3362	if (cfq_class_idle(new_cfqq))
3363		return false;
3364
3365	if (cfq_class_idle(cfqq))
3366		return true;
3367
3368	/*
3369	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3370	 */
3371	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3372		return false;
3373
3374	/*
3375	 * if the new request is sync, but the currently running queue is
3376	 * not, let the sync request have priority.
3377	 */
3378	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3379		return true;
3380
3381	if (new_cfqq->cfqg != cfqq->cfqg)
3382		return false;
3383
3384	if (cfq_slice_used(cfqq))
3385		return true;
3386
3387	/* Allow preemption only if we are idling on sync-noidle tree */
3388	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3389	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3390	    new_cfqq->service_tree->count == 2 &&
3391	    RB_EMPTY_ROOT(&cfqq->sort_list))
3392		return true;
3393
3394	/*
3395	 * So both queues are sync. Let the new request get disk time if
3396	 * it's a metadata request and the current queue is doing regular IO.
3397	 */
3398	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3399		return true;
3400
3401	/*
3402	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3403	 */
3404	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3405		return true;
3406
3407	/* An idle queue should not be idle now for some reason */
3408	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3409		return true;
3410
3411	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3412		return false;
3413
3414	/*
3415	 * if this request is as good as one we would expect from the
3416	 * current cfqq, let it preempt
3417	 */
3418	if (cfq_rq_close(cfqd, cfqq, rq))
3419		return true;
3420
3421	return false;
3422}
3423
3424/*
3425 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3426 * let it have half of its nominal slice.
3427 */
3428static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3429{
3430	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3431
3432	cfq_log_cfqq(cfqd, cfqq, "preempt");
3433	cfq_slice_expired(cfqd, 1);
3434
3435	/*
3436	 * workload type is changed, don't save slice, otherwise preempt
3437	 * doesn't happen
3438	 */
3439	if (old_type != cfqq_type(cfqq))
3440		cfqq->cfqg->saved_workload_slice = 0;
3441
3442	/*
3443	 * Put the new queue at the front of the current list,
3444	 * so we know that it will be selected next.
3445	 */
3446	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3447
3448	cfq_service_tree_add(cfqd, cfqq, 1);
3449
3450	cfqq->slice_end = 0;
3451	cfq_mark_cfqq_slice_new(cfqq);
3452}
3453
3454/*
3455 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3456 * something we should do about it
3457 */
3458static void
3459cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3460		struct request *rq)
3461{
3462	struct cfq_io_cq *cic = RQ_CIC(rq);
3463
3464	cfqd->rq_queued++;
3465	if (rq->cmd_flags & REQ_PRIO)
3466		cfqq->prio_pending++;
3467
3468	cfq_update_io_thinktime(cfqd, cfqq, cic);
3469	cfq_update_io_seektime(cfqd, cfqq, rq);
3470	cfq_update_idle_window(cfqd, cfqq, cic);
3471
3472	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3473
3474	if (cfqq == cfqd->active_queue) {
3475		/*
3476		 * Remember that we saw a request from this process, but
3477		 * don't start queuing just yet. Otherwise we risk seeing lots
3478		 * of tiny requests, because we disrupt the normal plugging
3479		 * and merging. If the request is already larger than a single
3480		 * page, let it rip immediately. For that case we assume that
3481		 * merging is already done. Ditto for a busy system that
3482		 * has other work pending, don't risk delaying until the
3483		 * idle timer unplug to continue working.
3484		 */
3485		if (cfq_cfqq_wait_request(cfqq)) {
3486			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3487			    cfqd->busy_queues > 1) {
3488				cfq_del_timer(cfqd, cfqq);
3489				cfq_clear_cfqq_wait_request(cfqq);
3490				__blk_run_queue(cfqd->queue);
3491			} else {
3492				cfqg_stats_update_idle_time(cfqq->cfqg);
3493				cfq_mark_cfqq_must_dispatch(cfqq);
3494			}
3495		}
3496	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3497		/*
3498		 * not the active queue - expire current slice if it is
3499		 * idle and has expired its mean thinktime or this new queue
3500		 * has some old slice time left and is of higher priority or
3501		 * this new queue is RT and the current one is BE
3502		 */
3503		cfq_preempt_queue(cfqd, cfqq);
3504		__blk_run_queue(cfqd->queue);
3505	}
3506}
3507
3508static void cfq_insert_request(struct request_queue *q, struct request *rq)
3509{
3510	struct cfq_data *cfqd = q->elevator->elevator_data;
3511	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3512
3513	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3514	cfq_init_prio_data(cfqq, RQ_CIC(rq));
3515
3516	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3517	list_add_tail(&rq->queuelist, &cfqq->fifo);
3518	cfq_add_rq_rb(rq);
3519	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
3520				 rq->cmd_flags);
3521	cfq_rq_enqueued(cfqd, cfqq, rq);
3522}
3523
3524/*
3525 * Update hw_tag based on peak queue depth over 50 samples under
3526 * sufficient load.
3527 */
3528static void cfq_update_hw_tag(struct cfq_data *cfqd)
3529{
3530	struct cfq_queue *cfqq = cfqd->active_queue;
3531
3532	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3533		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3534
3535	if (cfqd->hw_tag == 1)
3536		return;
3537
3538	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3539	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3540		return;
3541
3542	/*
3543	 * If the active queue doesn't have enough requests and can idle, cfq might
3544	 * not dispatch sufficient requests to hardware. Don't zero hw_tag in this
3545	 * case
3546	 */
3547	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3548	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3549	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3550		return;
3551
3552	if (cfqd->hw_tag_samples++ < 50)
3553		return;
3554
3555	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3556		cfqd->hw_tag = 1;
3557	else
3558		cfqd->hw_tag = 0;
3559}
3560
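/*
 * Decide whether the just-emptied active queue should be kept "wait
 * busy" a little longer: only for the sole queue in its group, and only
 * when its slice is already used up or will be before the process'
 * next request is expected.
 */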
3561static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3562{
3563	struct cfq_io_cq *cic = cfqd->active_cic;
3564
3565	/* If the queue already has requests, don't wait */
3566	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3567		return false;
3568
3569	/* If there are other queues in the group, don't wait */
3570	if (cfqq->cfqg->nr_cfqq > 1)
3571		return false;
3572
3573	/* the only queue in the group, but think time is big */
3574	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
3575		return false;
3576
3577	if (cfq_slice_used(cfqq))
3578		return true;
3579
3580	/* if slice left is less than think time, wait busy */
3581	if (cic && sample_valid(cic->ttime.ttime_samples)
3582	    && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3583		return true;
3584
3585	/*
3586	 * If think time is less than a jiffy, then ttime_mean=0 and the check
3587	 * above will not be true. It might happen that the slice has not expired yet
3588	 * but will expire soon (4-5 ns) during select_queue(). To cover the
3589	 * case where think time is less than a jiffy, mark the queue wait
3590	 * busy if only 1 jiffy is left in the slice.
3591	 */
3592	if (cfqq->slice_end - jiffies == 1)
3593		return true;
3594
3595	return false;
3596}
3597
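/*
 * A request has completed: update driver/dispatch accounting and think
 * time statistics, then decide whether the active queue should be
 * expired, marked wait-busy, or left idling for its next request.
 */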
3598static void cfq_completed_request(struct request_queue *q, struct request *rq)
3599{
3600	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3601	struct cfq_data *cfqd = cfqq->cfqd;
3602	const int sync = rq_is_sync(rq);
3603	unsigned long now;
3604
3605	now = jiffies;
3606	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3607		     !!(rq->cmd_flags & REQ_NOIDLE));
3608
3609	cfq_update_hw_tag(cfqd);
3610
3611	WARN_ON(!cfqd->rq_in_driver);
3612	WARN_ON(!cfqq->dispatched);
3613	cfqd->rq_in_driver--;
3614	cfqq->dispatched--;
3615	(RQ_CFQG(rq))->dispatched--;
3616	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
3617				     rq_io_start_time_ns(rq), rq->cmd_flags);
3618
3619	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3620
3621	if (sync) {
3622		struct cfq_rb_root *service_tree;
3623
3624		RQ_CIC(rq)->ttime.last_end_request = now;
3625
3626		if (cfq_cfqq_on_rr(cfqq))
3627			service_tree = cfqq->service_tree;
3628		else
3629			service_tree = service_tree_for(cfqq->cfqg,
3630				cfqq_prio(cfqq), cfqq_type(cfqq));
3631		service_tree->ttime.last_end_request = now;
3632		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3633			cfqd->last_delayed_sync = now;
3634	}
3635
3636#ifdef CONFIG_CFQ_GROUP_IOSCHED
3637	cfqq->cfqg->ttime.last_end_request = now;
3638#endif
3639
3640	/*
3641	 * If this is the active queue, check if it needs to be expired,
3642	 * or if we want to idle in case it has no pending requests.
3643	 */
3644	if (cfqd->active_queue == cfqq) {
3645		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3646
3647		if (cfq_cfqq_slice_new(cfqq)) {
3648			cfq_set_prio_slice(cfqd, cfqq);
3649			cfq_clear_cfqq_slice_new(cfqq);
3650		}
3651
3652		/*
3653		 * Should we wait for the next request to come in before we expire
3654		 * the queue?
3655		 */
3656		if (cfq_should_wait_busy(cfqd, cfqq)) {
3657			unsigned long extend_sl = cfqd->cfq_slice_idle;
3658			if (!cfqd->cfq_slice_idle)
3659				extend_sl = cfqd->cfq_group_idle;
3660			cfqq->slice_end = jiffies + extend_sl;
3661			cfq_mark_cfqq_wait_busy(cfqq);
3662			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3663		}
3664
3665		/*
3666		 * Idling is not enabled on:
3667		 * - expired queues
3668		 * - idle-priority queues
3669		 * - async queues
3670		 * - queues with still some requests queued
3671		 * - when there is a close cooperator
3672		 */
3673		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3674			cfq_slice_expired(cfqd, 1);
3675		else if (sync && cfqq_empty &&
3676			 !cfq_close_cooperator(cfqd, cfqq)) {
3677			cfq_arm_slice_timer(cfqd);
3678		}
3679	}
3680
3681	if (!cfqd->rq_in_driver)
3682		cfq_schedule_dispatch(cfqd);
3683}
3684
3685static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3686{
3687	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3688		cfq_mark_cfqq_must_alloc_slice(cfqq);
3689		return ELV_MQUEUE_MUST;
3690	}
3691
3692	return ELV_MQUEUE_MAY;
3693}
3694
3695static int cfq_may_queue(struct request_queue *q, int rw)
3696{
3697	struct cfq_data *cfqd = q->elevator->elevator_data;
3698	struct task_struct *tsk = current;
3699	struct cfq_io_cq *cic;
3700	struct cfq_queue *cfqq;
3701
3702	/*
3703	 * don't force setup of a queue from here, as a call to may_queue
3704	 * does not necessarily imply that a request actually will be queued.
3705	 * so just lookup a possibly existing queue, or return 'may queue'
3706	 * if that fails
3707	 */
3708	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3709	if (!cic)
3710		return ELV_MQUEUE_MAY;
3711
3712	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3713	if (cfqq) {
3714		cfq_init_prio_data(cfqq, cic);
3715
3716		return __cfq_may_queue(cfqq);
3717	}
3718
3719	return ELV_MQUEUE_MAY;
3720}
3721
3722/*
3723 * queue lock held here
3724 */
3725static void cfq_put_request(struct request *rq)
3726{
3727	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3728
3729	if (cfqq) {
3730		const int rw = rq_data_dir(rq);
3731
3732		BUG_ON(!cfqq->allocated[rw]);
3733		cfqq->allocated[rw]--;
3734
3735		/* Put down rq reference on cfqg */
3736		cfqg_put(RQ_CFQG(rq));
3737		rq->elv.priv[0] = NULL;
3738		rq->elv.priv[1] = NULL;
3739
3740		cfq_put_queue(cfqq);
3741	}
3742}
3743
3744static struct cfq_queue *
3745cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3746		struct cfq_queue *cfqq)
3747{
3748	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3749	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3750	cfq_mark_cfqq_coop(cfqq->new_cfqq);
3751	cfq_put_queue(cfqq);
3752	return cic_to_cfqq(cic, 1);
3753}
3754
3755/*
3756 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3757 * was the last process referring to said cfqq.
3758 */
3759static struct cfq_queue *
3760split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3761{
3762	if (cfqq_process_refs(cfqq) == 1) {
3763		cfqq->pid = current->pid;
3764		cfq_clear_cfqq_coop(cfqq);
3765		cfq_clear_cfqq_split_coop(cfqq);
3766		return cfqq;
3767	}
3768
3769	cic_set_cfqq(cic, NULL, 1);
3770
3771	cfq_put_cooperator(cfqq);
3772
3773	cfq_put_queue(cfqq);
3774	return NULL;
3775}
3776/*
3777 * Allocate cfq data structures associated with this request.
3778 */
3779static int
3780cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
3781		gfp_t gfp_mask)
3782{
3783	struct cfq_data *cfqd = q->elevator->elevator_data;
3784	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3785	const int rw = rq_data_dir(rq);
3786	const bool is_sync = rq_is_sync(rq);
3787	struct cfq_queue *cfqq;
3788
3789	might_sleep_if(gfp_mask & __GFP_WAIT);
3790
3791	spin_lock_irq(q->queue_lock);
3792
3793	check_ioprio_changed(cic, bio);
3794	check_blkcg_changed(cic, bio);
3795new_queue:
3796	cfqq = cic_to_cfqq(cic, is_sync);
3797	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3798		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
3799		cic_set_cfqq(cic, cfqq, is_sync);
3800	} else {
3801		/*
3802		 * If the queue was seeky for too long, break it apart.
3803		 */
3804		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3805			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3806			cfqq = split_cfqq(cic, cfqq);
3807			if (!cfqq)
3808				goto new_queue;
3809		}
3810
3811		/*
3812		 * Check to see if this queue is scheduled to merge with
3813		 * another, closely cooperating queue.  The merging of
3814		 * queues happens here as it must be done in process context.
3815		 * The reference on new_cfqq was taken in merge_cfqqs.
3816		 */
3817		if (cfqq->new_cfqq)
3818			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3819	}
3820
3821	cfqq->allocated[rw]++;
3822
3823	cfqq->ref++;
3824	cfqg_get(cfqq->cfqg);
3825	rq->elv.priv[0] = cfqq;
3826	rq->elv.priv[1] = cfqq->cfqg;
3827	spin_unlock_irq(q->queue_lock);
3828	return 0;
3829}
3830
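/*
 * Work handler scheduled via cfq_schedule_dispatch(): re-runs the
 * request queue so that pending requests are dispatched.
 */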
3831static void cfq_kick_queue(struct work_struct *work)
3832{
3833	struct cfq_data *cfqd =
3834		container_of(work, struct cfq_data, unplug_work);
3835	struct request_queue *q = cfqd->queue;
3836
3837	spin_lock_irq(q->queue_lock);
3838	__blk_run_queue(cfqd->queue);
3839	spin_unlock_irq(q->queue_lock);
3840}
3841
3842/*
3843 * Timer running if the active_queue is currently idling inside its time slice
3844 */
3845static void cfq_idle_slice_timer(unsigned long data)
3846{
3847	struct cfq_data *cfqd = (struct cfq_data *) data;
3848	struct cfq_queue *cfqq;
3849	unsigned long flags;
3850	int timed_out = 1;
3851
3852	cfq_log(cfqd, "idle timer fired");
3853
3854	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3855
3856	cfqq = cfqd->active_queue;
3857	if (cfqq) {
3858		timed_out = 0;
3859
3860		/*
3861		 * We saw a request before the queue expired; let it through.
3862		 */
3863		if (cfq_cfqq_must_dispatch(cfqq))
3864			goto out_kick;
3865
3866		/*
3867		 * expired
3868		 */
3869		if (cfq_slice_used(cfqq))
3870			goto expire;
3871
3872		/*
3873		 * only expire and reinvoke request handler, if there are
3874		 * other queues with pending requests
3875		 */
3876		if (!cfqd->busy_queues)
3877			goto out_cont;
3878
3879		/*
3880		 * not expired and it has a request pending, let it dispatch
3881		 */
3882		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3883			goto out_kick;
3884
3885		/*
3886		 * The "deep" flag is cleared only when idling did not succeed
3887		 */
3888		cfq_clear_cfqq_deep(cfqq);
3889	}
3890expire:
3891	cfq_slice_expired(cfqd, timed_out);
3892out_kick:
3893	cfq_schedule_dispatch(cfqd);
3894out_cont:
3895	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3896}
3897
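/*
 * Cancel the idle slice timer and flush any pending unplug work.
 */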
3898static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3899{
3900	del_timer_sync(&cfqd->idle_slice_timer);
3901	cancel_work_sync(&cfqd->unplug_work);
3902}
3903
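/*
 * Drop the cached references to the per-priority async queues and to
 * the async idle queue.
 */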
3904static void cfq_put_async_queues(struct cfq_data *cfqd)
3905{
3906	int i;
3907
3908	for (i = 0; i < IOPRIO_BE_NR; i++) {
3909		if (cfqd->async_cfqq[0][i])
3910			cfq_put_queue(cfqd->async_cfqq[0][i]);
3911		if (cfqd->async_cfqq[1][i])
3912			cfq_put_queue(cfqd->async_cfqq[1][i]);
3913	}
3914
3915	if (cfqd->async_idle_cfqq)
3916		cfq_put_queue(cfqd->async_idle_cfqq);
3917}
3918
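/*
 * Elevator exit hook: expires the active queue, drops the async queue
 * references, deactivates the blkcg policy and frees cfqd.
 */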
3919static void cfq_exit_queue(struct elevator_queue *e)
3920{
3921	struct cfq_data *cfqd = e->elevator_data;
3922	struct request_queue *q = cfqd->queue;
3923
3924	cfq_shutdown_timer_wq(cfqd);
3925
3926	spin_lock_irq(q->queue_lock);
3927
3928	if (cfqd->active_queue)
3929		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3930
3931	cfq_put_async_queues(cfqd);
3932
3933	spin_unlock_irq(q->queue_lock);
3934
3935	cfq_shutdown_timer_wq(cfqd);
3936
3937#ifndef CONFIG_CFQ_GROUP_IOSCHED
3938	kfree(cfqd->root_group);
3939#endif
3940	blkcg_deactivate_policy(q, &blkio_policy_cfq);
3941	kfree(cfqd);
3942}
3943
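/*
 * Elevator init hook: allocates the per-queue cfq_data and sets up the
 * root group, the fallback oom_cfqq and the default tunables.
 */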
3944static int cfq_init_queue(struct request_queue *q)
3945{
3946	struct cfq_data *cfqd;
3947	struct blkio_group *blkg __maybe_unused;
3948	int i, ret;
3949
3950	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3951	if (!cfqd)
3952		return -ENOMEM;
3953
3954	cfqd->queue = q;
3955	q->elevator->elevator_data = cfqd;
3956
3957	/* Init root service tree */
3958	cfqd->grp_service_tree = CFQ_RB_ROOT;
3959
3960	/* Init root group and prefer root group over other groups by default */
3961#ifdef CONFIG_CFQ_GROUP_IOSCHED
3962	ret = blkcg_activate_policy(q, &blkio_policy_cfq);
3963	if (ret)
3964		goto out_free;
3965
3966	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
3967#else
3968	ret = -ENOMEM;
3969	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
3970					GFP_KERNEL, cfqd->queue->node);
3971	if (!cfqd->root_group)
3972		goto out_free;
3973
3974	cfq_init_cfqg_base(cfqd->root_group);
3975#endif
3976	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
3977
3978	/*
3979	 * Not strictly needed (since RB_ROOT just clears the node and we
3980	 * zeroed cfqd on alloc), but better be safe in case someone decides
3981	 * to add magic to the rb code
3982	 */
3983	for (i = 0; i < CFQ_PRIO_LISTS; i++)
3984		cfqd->prio_trees[i] = RB_ROOT;
3985
3986	/*
3987	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3988	 * Grab a permanent reference to it, so that the normal code flow
3989	 * will not attempt to free it.  oom_cfqq is linked to root_group
3990	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
3991	 * the reference from linking right away.
3992	 */
3993	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3994	cfqd->oom_cfqq.ref++;
3995
3996	spin_lock_irq(q->queue_lock);
3997	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
3998	cfqg_put(cfqd->root_group);
3999	spin_unlock_irq(q->queue_lock);
4000
4001	init_timer(&cfqd->idle_slice_timer);
4002	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4003	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
4004
4005	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4006
4007	cfqd->cfq_quantum = cfq_quantum;
4008	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4009	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4010	cfqd->cfq_back_max = cfq_back_max;
4011	cfqd->cfq_back_penalty = cfq_back_penalty;
4012	cfqd->cfq_slice[0] = cfq_slice_async;
4013	cfqd->cfq_slice[1] = cfq_slice_sync;
4014	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4015	cfqd->cfq_slice_idle = cfq_slice_idle;
4016	cfqd->cfq_group_idle = cfq_group_idle;
4017	cfqd->cfq_latency = 1;
4018	cfqd->hw_tag = -1;
4019	/*
4020	 * We optimistically assume sync ops weren't delayed in the last
4021	 * second, in order to allow a larger depth for async operations.
4022	 */
4023	cfqd->last_delayed_sync = jiffies - HZ;
4024	return 0;
4025
4026out_free:
4027	kfree(cfqd);
4028	return ret;
4029}
4030
4031/*
4032 * sysfs parts below -->
4033 */
4034static ssize_t
4035cfq_var_show(unsigned int var, char *page)
4036{
4037	return sprintf(page, "%d\n", var);
4038}
4039
4040static ssize_t
4041cfq_var_store(unsigned int *var, const char *page, size_t count)
4042{
4043	char *p = (char *) page;
4044
4045	*var = simple_strtoul(p, &p, 10);
4046	return count;
4047}
4048
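/*
 * SHOW_FUNCTION/STORE_FUNCTION generate one show and one store method
 * per tunable; they are exposed through cfq_attrs under
 * /sys/block/<dev>/queue/iosched/, e.g.:
 *
 *	echo 8 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Tunables with __CONV set are kept in jiffies internally but shown
 * and stored in milliseconds via sysfs.
 */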
4049#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
4050static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
4051{									\
4052	struct cfq_data *cfqd = e->elevator_data;			\
4053	unsigned int __data = __VAR;					\
4054	if (__CONV)							\
4055		__data = jiffies_to_msecs(__data);			\
4056	return cfq_var_show(__data, (page));				\
4057}
4058SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4059SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4060SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4061SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4062SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4063SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4064SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4065SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4066SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4067SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4068SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4069#undef SHOW_FUNCTION
4070
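/*
 * Store methods clamp the parsed value to the [MIN, MAX] range given
 * below before applying it.
 */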
4071#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
4072static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
4073{									\
4074	struct cfq_data *cfqd = e->elevator_data;			\
4075	unsigned int __data;						\
4076	int ret = cfq_var_store(&__data, (page), count);		\
4077	if (__data < (MIN))						\
4078		__data = (MIN);						\
4079	else if (__data > (MAX))					\
4080		__data = (MAX);						\
4081	if (__CONV)							\
4082		*(__PTR) = msecs_to_jiffies(__data);			\
4083	else								\
4084		*(__PTR) = __data;					\
4085	return ret;							\
4086}
4087STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4088STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4089		UINT_MAX, 1);
4090STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4091		UINT_MAX, 1);
4092STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4093STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4094		UINT_MAX, 0);
4095STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4096STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4097STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4098STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4099STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4100		UINT_MAX, 0);
4101STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4102#undef STORE_FUNCTION
4103
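/*
 * CFQ_ATTR(name) binds cfq_<name>_show/cfq_<name>_store to a sysfs
 * attribute that is world-readable and root-writable.
 */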
4104#define CFQ_ATTR(name) \
4105	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4106
4107static struct elv_fs_entry cfq_attrs[] = {
4108	CFQ_ATTR(quantum),
4109	CFQ_ATTR(fifo_expire_sync),
4110	CFQ_ATTR(fifo_expire_async),
4111	CFQ_ATTR(back_seek_max),
4112	CFQ_ATTR(back_seek_penalty),
4113	CFQ_ATTR(slice_sync),
4114	CFQ_ATTR(slice_async),
4115	CFQ_ATTR(slice_async_rq),
4116	CFQ_ATTR(slice_idle),
4117	CFQ_ATTR(group_idle),
4118	CFQ_ATTR(low_latency),
4119	__ATTR_NULL
4120};
4121
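/*
 * The elevator type registered with the block layer; it maps the
 * generic elevator hooks onto the cfq_* implementations above.
 */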
4122static struct elevator_type iosched_cfq = {
4123	.ops = {
4124		.elevator_merge_fn = 		cfq_merge,
4125		.elevator_merged_fn =		cfq_merged_request,
4126		.elevator_merge_req_fn =	cfq_merged_requests,
4127		.elevator_allow_merge_fn =	cfq_allow_merge,
4128		.elevator_bio_merged_fn =	cfq_bio_merged,
4129		.elevator_dispatch_fn =		cfq_dispatch_requests,
4130		.elevator_add_req_fn =		cfq_insert_request,
4131		.elevator_activate_req_fn =	cfq_activate_request,
4132		.elevator_deactivate_req_fn =	cfq_deactivate_request,
4133		.elevator_completed_req_fn =	cfq_completed_request,
4134		.elevator_former_req_fn =	elv_rb_former_request,
4135		.elevator_latter_req_fn =	elv_rb_latter_request,
4136		.elevator_init_icq_fn =		cfq_init_icq,
4137		.elevator_exit_icq_fn =		cfq_exit_icq,
4138		.elevator_set_req_fn =		cfq_set_request,
4139		.elevator_put_req_fn =		cfq_put_request,
4140		.elevator_may_queue_fn =	cfq_may_queue,
4141		.elevator_init_fn =		cfq_init_queue,
4142		.elevator_exit_fn =		cfq_exit_queue,
4143	},
4144	.icq_size	=	sizeof(struct cfq_io_cq),
4145	.icq_align	=	__alignof__(struct cfq_io_cq),
4146	.elevator_attrs =	cfq_attrs,
4147	.elevator_name	=	"cfq",
4148	.elevator_owner =	THIS_MODULE,
4149};
4150
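/*
 * blkcg policy descriptor for group scheduling: tells the blkcg core
 * how much per-group policy data to allocate (struct cfq_group), how
 * to initialize it, and which cgroup files to create.
 */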
4151#ifdef CONFIG_CFQ_GROUP_IOSCHED
4152static struct blkio_policy_type blkio_policy_cfq = {
4153	.ops = {
4154		.blkio_init_group_fn =		cfq_init_blkio_group,
4155		.blkio_reset_group_stats_fn =	cfqg_stats_reset,
4156	},
4157	.pdata_size = sizeof(struct cfq_group),
4158	.cftypes = cfq_blkcg_files,
4159};
4160#endif
4161
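/*
 * Module init: make sure the jiffies-based defaults are non-zero on
 * low HZ configurations, register the blkcg policy, create the
 * cfq_queue slab cache and register the elevator, unwinding in reverse
 * order on failure.
 */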
4162static int __init cfq_init(void)
4163{
4164	int ret;
4165
4166	/*
4167	 * could be 0 on HZ < 1000 setups
4168	 */
4169	if (!cfq_slice_async)
4170		cfq_slice_async = 1;
4171	if (!cfq_slice_idle)
4172		cfq_slice_idle = 1;
4173
4174#ifdef CONFIG_CFQ_GROUP_IOSCHED
4175	if (!cfq_group_idle)
4176		cfq_group_idle = 1;
4177#else
4178	cfq_group_idle = 0;
4179#endif
4180
4181	ret = blkio_policy_register(&blkio_policy_cfq);
4182	if (ret)
4183		return ret;
4184
4185	cfq_pool = KMEM_CACHE(cfq_queue, 0);
4186	if (!cfq_pool)
4187		goto err_pol_unreg;
4188
4189	ret = elv_register(&iosched_cfq);
4190	if (ret)
4191		goto err_free_pool;
4192
4193	return 0;
4194
4195err_free_pool:
4196	kmem_cache_destroy(cfq_pool);
4197err_pol_unreg:
4198	blkio_policy_unregister(&blkio_policy_cfq);
4199	return ret;
4200}
4201
4202static void __exit cfq_exit(void)
4203{
4204	blkio_policy_unregister(&blkio_policy_cfq);
4205	elv_unregister(&iosched_cfq);
4206	kmem_cache_destroy(cfq_pool);
4207}
4208
4209module_init(cfq_init);
4210module_exit(cfq_exit);
4211
4212MODULE_AUTHOR("Jens Axboe");
4213MODULE_LICENSE("GPL");
4214MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4215