cfq-iosched.c revision 0871714e08fed7ba66cadad11b2e4f85a9dc9b96
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/rbtree.h>
13#include <linux/ioprio.h>
14
15/*
16 * tunables
17 */
18static const int cfq_quantum = 4;		/* max queue in one round of service */
19static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
20static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
21static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
22
23static const int cfq_slice_sync = HZ / 10;
24static int cfq_slice_async = HZ / 25;
25static const int cfq_slice_async_rq = 2;
26static int cfq_slice_idle = HZ / 125;
27
28/*
29 * offset from end of service tree
30 */
31#define CFQ_IDLE_DELAY		(HZ / 5)
32
33/*
34 * below this threshold, we consider thinktime immediate
35 */
36#define CFQ_MIN_TT		(2)
37
38#define CFQ_SLICE_SCALE		(5)
39
40#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
41#define RQ_CFQQ(rq)		((rq)->elevator_private2)
42
43static struct kmem_cache *cfq_pool;
44static struct kmem_cache *cfq_ioc_pool;
45
46static DEFINE_PER_CPU(unsigned long, ioc_count);
47static struct completion *ioc_gone;
48
49#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
50#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
51#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
52
53#define ASYNC			(0)
54#define SYNC			(1)
55
56#define sample_valid(samples)	((samples) > 80)
57
58/*
59 * Most of our rbtree usage is for sorting with min extraction, so
60 * if we cache the leftmost node we don't have to walk down the tree
61 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
62 * move this into the elevator for the rq sorting as well.
63 */
64struct cfq_rb_root {
65	struct rb_root rb;
66	struct rb_node *left;
67};
68#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }
69
70/*
71 * Per block device queue structure
72 */
73struct cfq_data {
74	struct request_queue *queue;
75
76	/*
77	 * rr list of queues with requests and the count of them
78	 */
79	struct cfq_rb_root service_tree;
80	unsigned int busy_queues;
81
82	int rq_in_driver;
83	int sync_flight;
84	int hw_tag;
85
86	/*
87	 * idle window management
88	 */
89	struct timer_list idle_slice_timer;
90	struct work_struct unplug_work;
91
92	struct cfq_queue *active_queue;
93	struct cfq_io_context *active_cic;
94
95	/*
96	 * async queue for each priority case
97	 */
98	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
99	struct cfq_queue *async_idle_cfqq;
100
101	sector_t last_position;
102	unsigned long last_end_request;
103
104	/*
105	 * tunables, see top of file
106	 */
107	unsigned int cfq_quantum;
108	unsigned int cfq_fifo_expire[2];
109	unsigned int cfq_back_penalty;
110	unsigned int cfq_back_max;
111	unsigned int cfq_slice[2];
112	unsigned int cfq_slice_async_rq;
113	unsigned int cfq_slice_idle;
114
115	struct list_head cic_list;
116};
117
118/*
119 * Per process-grouping structure
120 */
121struct cfq_queue {
122	/* reference count */
123	atomic_t ref;
124	/* parent cfq_data */
125	struct cfq_data *cfqd;
126	/* service_tree member */
127	struct rb_node rb_node;
128	/* service_tree key */
129	unsigned long rb_key;
130	/* sorted list of pending requests */
131	struct rb_root sort_list;
132	/* if fifo isn't expired, next request to serve */
133	struct request *next_rq;
134	/* requests queued in sort_list */
135	int queued[2];
136	/* currently allocated requests */
137	int allocated[2];
138	/* pending metadata requests */
139	int meta_pending;
140	/* fifo list of requests in sort_list */
141	struct list_head fifo;
142
143	unsigned long slice_end;
144	long slice_resid;
145
146	/* number of requests that are on the dispatch list or inside driver */
147	int dispatched;
148
149	/* io prio of this group */
150	unsigned short ioprio, org_ioprio;
151	unsigned short ioprio_class, org_ioprio_class;
152
153	/* various state flags, see below */
154	unsigned int flags;
155};
156
157enum cfqq_state_flags {
158	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
159	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
160	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
161	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
162	CFQ_CFQQ_FLAG_must_dispatch,	/* must dispatch, even if expired */
163	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
164	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
165	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
166	CFQ_CFQQ_FLAG_queue_new,	/* queue never been serviced */
167	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
168	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
169};
170
171#define CFQ_CFQQ_FNS(name)						\
172static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
173{									\
174	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
175}									\
176static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
177{									\
178	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
179}									\
180static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
181{									\
182	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
183}
184
185CFQ_CFQQ_FNS(on_rr);
186CFQ_CFQQ_FNS(wait_request);
187CFQ_CFQQ_FNS(must_alloc);
188CFQ_CFQQ_FNS(must_alloc_slice);
189CFQ_CFQQ_FNS(must_dispatch);
190CFQ_CFQQ_FNS(fifo_expire);
191CFQ_CFQQ_FNS(idle_window);
192CFQ_CFQQ_FNS(prio_changed);
193CFQ_CFQQ_FNS(queue_new);
194CFQ_CFQQ_FNS(slice_new);
195CFQ_CFQQ_FNS(sync);
196#undef CFQ_CFQQ_FNS
197
198static void cfq_dispatch_insert(struct request_queue *, struct request *);
199static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
200				       struct io_context *, gfp_t);
201static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
202						struct io_context *);
203
204static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
205					    int is_sync)
206{
207	return cic->cfqq[!!is_sync];
208}
209
210static inline void cic_set_cfqq(struct cfq_io_context *cic,
211				struct cfq_queue *cfqq, int is_sync)
212{
213	cic->cfqq[!!is_sync] = cfqq;
214}
215
216/*
217 * We regard a request as SYNC, if it's either a read or has the SYNC bit
218 * set (in which case it could also be direct WRITE).
219 */
220static inline int cfq_bio_sync(struct bio *bio)
221{
222	if (bio_data_dir(bio) == READ || bio_sync(bio))
223		return 1;
224
225	return 0;
226}
227
228/*
229 * scheduler run of queue, if there are requests pending and no one in the
230 * driver that will restart queueing
231 */
232static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
233{
234	if (cfqd->busy_queues)
235		kblockd_schedule_work(&cfqd->unplug_work);
236}
237
238static int cfq_queue_empty(struct request_queue *q)
239{
240	struct cfq_data *cfqd = q->elevator->elevator_data;
241
242	return !cfqd->busy_queues;
243}
244
245/*
246 * Scale schedule slice based on io priority. Use the sync time slice only
247 * if a queue is marked sync and has sync io queued. A sync queue with async
248 * io only, should not get full sync slice length.
249 */
250static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
251				 unsigned short prio)
252{
253	const int base_slice = cfqd->cfq_slice[sync];
254
255	WARN_ON(prio >= IOPRIO_BE_NR);
256
257	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
258}
259
260static inline int
261cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
262{
263	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
264}
265
266static inline void
267cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
268{
269	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
270}
271
272/*
273 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
274 * isn't valid until the first request from the dispatch is activated
275 * and the slice time set.
276 */
277static inline int cfq_slice_used(struct cfq_queue *cfqq)
278{
279	if (cfq_cfqq_slice_new(cfqq))
280		return 0;
281	if (time_before(jiffies, cfqq->slice_end))
282		return 0;
283
284	return 1;
285}
286
287/*
288 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
289 * We choose the request that is closest to the head right now. Distance
290 * behind the head is penalized and only allowed to a certain extent.
291 */
292static struct request *
293cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
294{
295	sector_t last, s1, s2, d1 = 0, d2 = 0;
296	unsigned long back_max;
297#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
298#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
299	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
300
301	if (rq1 == NULL || rq1 == rq2)
302		return rq2;
303	if (rq2 == NULL)
304		return rq1;
305
306	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
307		return rq1;
308	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
309		return rq2;
310	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
311		return rq1;
312	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
313		return rq2;
314
315	s1 = rq1->sector;
316	s2 = rq2->sector;
317
318	last = cfqd->last_position;
319
320	/*
321	 * by definition, 1KiB is 2 sectors
322	 */
323	back_max = cfqd->cfq_back_max * 2;
324
325	/*
326	 * Strict one way elevator _except_ in the case where we allow
327	 * short backward seeks which are biased as twice the cost of a
328	 * similar forward seek.
329	 */
330	if (s1 >= last)
331		d1 = s1 - last;
332	else if (s1 + back_max >= last)
333		d1 = (last - s1) * cfqd->cfq_back_penalty;
334	else
335		wrap |= CFQ_RQ1_WRAP;
336
337	if (s2 >= last)
338		d2 = s2 - last;
339	else if (s2 + back_max >= last)
340		d2 = (last - s2) * cfqd->cfq_back_penalty;
341	else
342		wrap |= CFQ_RQ2_WRAP;
343
344	/* Found required data */
345
346	/*
347	 * By doing switch() on the bit mask "wrap" we avoid having to
348	 * check two variables for all permutations: --> faster!
349	 */
350	switch (wrap) {
351	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
352		if (d1 < d2)
353			return rq1;
354		else if (d2 < d1)
355			return rq2;
356		else {
357			if (s1 >= s2)
358				return rq1;
359			else
360				return rq2;
361		}
362
363	case CFQ_RQ2_WRAP:
364		return rq1;
365	case CFQ_RQ1_WRAP:
366		return rq2;
367	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
368	default:
369		/*
370		 * Since both rqs are wrapped,
371		 * start with the one that's further behind head
372		 * (--> only *one* back seek required),
373		 * since back seek takes more time than forward.
374		 */
375		if (s1 <= s2)
376			return rq1;
377		else
378			return rq2;
379	}
380}
381
382/*
383 * The below is leftmost cache rbtree addon
384 */
385static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
386{
387	if (!root->left)
388		root->left = rb_first(&root->rb);
389
390	if (root->left)
391		return rb_entry(root->left, struct cfq_queue, rb_node);
392
393	return NULL;
394}
395
396static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
397{
398	if (root->left == n)
399		root->left = NULL;
400
401	rb_erase(n, &root->rb);
402	RB_CLEAR_NODE(n);
403}
404
405/*
406 * would be nice to take fifo expire time into account as well
407 */
408static struct request *
409cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
410		  struct request *last)
411{
412	struct rb_node *rbnext = rb_next(&last->rb_node);
413	struct rb_node *rbprev = rb_prev(&last->rb_node);
414	struct request *next = NULL, *prev = NULL;
415
416	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
417
418	if (rbprev)
419		prev = rb_entry_rq(rbprev);
420
421	if (rbnext)
422		next = rb_entry_rq(rbnext);
423	else {
424		rbnext = rb_first(&cfqq->sort_list);
425		if (rbnext && rbnext != &last->rb_node)
426			next = rb_entry_rq(rbnext);
427	}
428
429	return cfq_choose_req(cfqd, next, prev);
430}
431
432static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
433				      struct cfq_queue *cfqq)
434{
435	/*
436	 * just an approximation, should be ok.
437	 */
438	return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
439		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
440}
441
442/*
443 * The cfqd->service_tree holds all pending cfq_queue's that have
444 * requests waiting to be processed. It is sorted in the order that
445 * we will service the queues.
446 */
447static void cfq_service_tree_add(struct cfq_data *cfqd,
448				    struct cfq_queue *cfqq, int add_front)
449{
450	struct rb_node **p, *parent;
451	struct cfq_queue *__cfqq;
452	unsigned long rb_key;
453	int left;
454
455	if (cfq_class_idle(cfqq)) {
456		rb_key = CFQ_IDLE_DELAY;
457		parent = rb_last(&cfqd->service_tree.rb);
458		if (parent && parent != &cfqq->rb_node) {
459			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
460			rb_key += __cfqq->rb_key;
461		} else
462			rb_key += jiffies;
463	} else if (!add_front) {
464		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
465		rb_key += cfqq->slice_resid;
466		cfqq->slice_resid = 0;
467	} else
468		rb_key = 0;
469
470	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
471		/*
472		 * same position, nothing more to do
473		 */
474		if (rb_key == cfqq->rb_key)
475			return;
476
477		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
478	}
479
480	left = 1;
481	parent = NULL;
482	p = &cfqd->service_tree.rb.rb_node;
483	while (*p) {
484		struct rb_node **n;
485
486		parent = *p;
487		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
488
489		/*
490		 * sort RT queues first, we always want to give
491		 * preference to them. IDLE queues goes to the back.
492		 * after that, sort on the next service time.
493		 */
494		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
495			n = &(*p)->rb_left;
496		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
497			n = &(*p)->rb_right;
498		else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
499			n = &(*p)->rb_left;
500		else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
501			n = &(*p)->rb_right;
502		else if (rb_key < __cfqq->rb_key)
503			n = &(*p)->rb_left;
504		else
505			n = &(*p)->rb_right;
506
507		if (n == &(*p)->rb_right)
508			left = 0;
509
510		p = n;
511	}
512
513	if (left)
514		cfqd->service_tree.left = &cfqq->rb_node;
515
516	cfqq->rb_key = rb_key;
517	rb_link_node(&cfqq->rb_node, parent, p);
518	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
519}
520
521/*
522 * Update cfqq's position in the service tree.
523 */
524static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
525{
526	/*
527	 * Resorting requires the cfqq to be on the RR list already.
528	 */
529	if (cfq_cfqq_on_rr(cfqq))
530		cfq_service_tree_add(cfqd, cfqq, 0);
531}
532
533/*
534 * add to busy list of queues for service, trying to be fair in ordering
535 * the pending list according to last request service
536 */
537static inline void
538cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
539{
540	BUG_ON(cfq_cfqq_on_rr(cfqq));
541	cfq_mark_cfqq_on_rr(cfqq);
542	cfqd->busy_queues++;
543
544	cfq_resort_rr_list(cfqd, cfqq);
545}
546
547/*
548 * Called when the cfqq no longer has requests pending, remove it from
549 * the service tree.
550 */
551static inline void
552cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
553{
554	BUG_ON(!cfq_cfqq_on_rr(cfqq));
555	cfq_clear_cfqq_on_rr(cfqq);
556
557	if (!RB_EMPTY_NODE(&cfqq->rb_node))
558		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
559
560	BUG_ON(!cfqd->busy_queues);
561	cfqd->busy_queues--;
562}
563
564/*
565 * rb tree support functions
566 */
567static inline void cfq_del_rq_rb(struct request *rq)
568{
569	struct cfq_queue *cfqq = RQ_CFQQ(rq);
570	struct cfq_data *cfqd = cfqq->cfqd;
571	const int sync = rq_is_sync(rq);
572
573	BUG_ON(!cfqq->queued[sync]);
574	cfqq->queued[sync]--;
575
576	elv_rb_del(&cfqq->sort_list, rq);
577
578	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
579		cfq_del_cfqq_rr(cfqd, cfqq);
580}
581
582static void cfq_add_rq_rb(struct request *rq)
583{
584	struct cfq_queue *cfqq = RQ_CFQQ(rq);
585	struct cfq_data *cfqd = cfqq->cfqd;
586	struct request *__alias;
587
588	cfqq->queued[rq_is_sync(rq)]++;
589
590	/*
591	 * looks a little odd, but the first insert might return an alias.
592	 * if that happens, put the alias on the dispatch list
593	 */
594	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
595		cfq_dispatch_insert(cfqd->queue, __alias);
596
597	if (!cfq_cfqq_on_rr(cfqq))
598		cfq_add_cfqq_rr(cfqd, cfqq);
599
600	/*
601	 * check if this request is a better next-serve candidate
602	 */
603	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
604	BUG_ON(!cfqq->next_rq);
605}
606
607static inline void
608cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
609{
610	elv_rb_del(&cfqq->sort_list, rq);
611	cfqq->queued[rq_is_sync(rq)]--;
612	cfq_add_rq_rb(rq);
613}
614
615static struct request *
616cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
617{
618	struct task_struct *tsk = current;
619	struct cfq_io_context *cic;
620	struct cfq_queue *cfqq;
621
622	cic = cfq_cic_lookup(cfqd, tsk->io_context);
623	if (!cic)
624		return NULL;
625
626	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
627	if (cfqq) {
628		sector_t sector = bio->bi_sector + bio_sectors(bio);
629
630		return elv_rb_find(&cfqq->sort_list, sector);
631	}
632
633	return NULL;
634}
635
636static void cfq_activate_request(struct request_queue *q, struct request *rq)
637{
638	struct cfq_data *cfqd = q->elevator->elevator_data;
639
640	cfqd->rq_in_driver++;
641
642	/*
643	 * If the depth is larger 1, it really could be queueing. But lets
644	 * make the mark a little higher - idling could still be good for
645	 * low queueing, and a low queueing number could also just indicate
646	 * a SCSI mid layer like behaviour where limit+1 is often seen.
647	 */
648	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
649		cfqd->hw_tag = 1;
650
651	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
652}
653
654static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
655{
656	struct cfq_data *cfqd = q->elevator->elevator_data;
657
658	WARN_ON(!cfqd->rq_in_driver);
659	cfqd->rq_in_driver--;
660}
661
662static void cfq_remove_request(struct request *rq)
663{
664	struct cfq_queue *cfqq = RQ_CFQQ(rq);
665
666	if (cfqq->next_rq == rq)
667		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
668
669	list_del_init(&rq->queuelist);
670	cfq_del_rq_rb(rq);
671
672	if (rq_is_meta(rq)) {
673		WARN_ON(!cfqq->meta_pending);
674		cfqq->meta_pending--;
675	}
676}
677
678static int cfq_merge(struct request_queue *q, struct request **req,
679		     struct bio *bio)
680{
681	struct cfq_data *cfqd = q->elevator->elevator_data;
682	struct request *__rq;
683
684	__rq = cfq_find_rq_fmerge(cfqd, bio);
685	if (__rq && elv_rq_merge_ok(__rq, bio)) {
686		*req = __rq;
687		return ELEVATOR_FRONT_MERGE;
688	}
689
690	return ELEVATOR_NO_MERGE;
691}
692
693static void cfq_merged_request(struct request_queue *q, struct request *req,
694			       int type)
695{
696	if (type == ELEVATOR_FRONT_MERGE) {
697		struct cfq_queue *cfqq = RQ_CFQQ(req);
698
699		cfq_reposition_rq_rb(cfqq, req);
700	}
701}
702
703static void
704cfq_merged_requests(struct request_queue *q, struct request *rq,
705		    struct request *next)
706{
707	/*
708	 * reposition in fifo if next is older than rq
709	 */
710	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
711	    time_before(next->start_time, rq->start_time))
712		list_move(&rq->queuelist, &next->queuelist);
713
714	cfq_remove_request(next);
715}
716
717static int cfq_allow_merge(struct request_queue *q, struct request *rq,
718			   struct bio *bio)
719{
720	struct cfq_data *cfqd = q->elevator->elevator_data;
721	struct cfq_io_context *cic;
722	struct cfq_queue *cfqq;
723
724	/*
725	 * Disallow merge of a sync bio into an async request.
726	 */
727	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
728		return 0;
729
730	/*
731	 * Lookup the cfqq that this bio will be queued with. Allow
732	 * merge only if rq is queued there.
733	 */
734	cic = cfq_cic_lookup(cfqd, current->io_context);
735	if (!cic)
736		return 0;
737
738	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
739	if (cfqq == RQ_CFQQ(rq))
740		return 1;
741
742	return 0;
743}
744
745static inline void
746__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
747{
748	if (cfqq) {
749		cfqq->slice_end = 0;
750		cfq_clear_cfqq_must_alloc_slice(cfqq);
751		cfq_clear_cfqq_fifo_expire(cfqq);
752		cfq_mark_cfqq_slice_new(cfqq);
753		cfq_clear_cfqq_queue_new(cfqq);
754	}
755
756	cfqd->active_queue = cfqq;
757}
758
759/*
760 * current cfqq expired its slice (or was too idle), select new one
761 */
762static void
763__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
764		    int timed_out)
765{
766	if (cfq_cfqq_wait_request(cfqq))
767		del_timer(&cfqd->idle_slice_timer);
768
769	cfq_clear_cfqq_must_dispatch(cfqq);
770	cfq_clear_cfqq_wait_request(cfqq);
771
772	/*
773	 * store what was left of this slice, if the queue idled/timed out
774	 */
775	if (timed_out && !cfq_cfqq_slice_new(cfqq))
776		cfqq->slice_resid = cfqq->slice_end - jiffies;
777
778	cfq_resort_rr_list(cfqd, cfqq);
779
780	if (cfqq == cfqd->active_queue)
781		cfqd->active_queue = NULL;
782
783	if (cfqd->active_cic) {
784		put_io_context(cfqd->active_cic->ioc);
785		cfqd->active_cic = NULL;
786	}
787}
788
789static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
790{
791	struct cfq_queue *cfqq = cfqd->active_queue;
792
793	if (cfqq)
794		__cfq_slice_expired(cfqd, cfqq, timed_out);
795}
796
797/*
798 * Get next queue for service. Unless we have a queue preemption,
799 * we'll simply select the first cfqq in the service tree.
800 */
801static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
802{
803	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
804		return NULL;
805
806	return cfq_rb_first(&cfqd->service_tree);
807}
808
809/*
810 * Get and set a new active queue for service.
811 */
812static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
813{
814	struct cfq_queue *cfqq;
815
816	cfqq = cfq_get_next_queue(cfqd);
817	__cfq_set_active_queue(cfqd, cfqq);
818	return cfqq;
819}
820
821static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
822					  struct request *rq)
823{
824	if (rq->sector >= cfqd->last_position)
825		return rq->sector - cfqd->last_position;
826	else
827		return cfqd->last_position - rq->sector;
828}
829
830static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
831{
832	struct cfq_io_context *cic = cfqd->active_cic;
833
834	if (!sample_valid(cic->seek_samples))
835		return 0;
836
837	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
838}
839
840static int cfq_close_cooperator(struct cfq_data *cfq_data,
841				struct cfq_queue *cfqq)
842{
843	/*
844	 * We should notice if some of the queues are cooperating, eg
845	 * working closely on the same area of the disk. In that case,
846	 * we can group them together and don't waste time idling.
847	 */
848	return 0;
849}
850
851#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
852
853static void cfq_arm_slice_timer(struct cfq_data *cfqd)
854{
855	struct cfq_queue *cfqq = cfqd->active_queue;
856	struct cfq_io_context *cic;
857	unsigned long sl;
858
859	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
860	WARN_ON(cfq_cfqq_slice_new(cfqq));
861
862	/*
863	 * idle is disabled, either manually or by past process history
864	 */
865	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
866		return;
867
868	/*
869	 * task has exited, don't wait
870	 */
871	cic = cfqd->active_cic;
872	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
873		return;
874
875	/*
876	 * See if this prio level has a good candidate
877	 */
878	if (cfq_close_cooperator(cfqd, cfqq) &&
879	    (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
880		return;
881
882	cfq_mark_cfqq_must_dispatch(cfqq);
883	cfq_mark_cfqq_wait_request(cfqq);
884
885	/*
886	 * we don't want to idle for seeks, but we do want to allow
887	 * fair distribution of slice time for a process doing back-to-back
888	 * seeks. so allow a little bit of time for him to submit a new rq
889	 */
890	sl = cfqd->cfq_slice_idle;
891	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
892		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
893
894	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
895}
896
897/*
898 * Move request from internal lists to the request queue dispatch list.
899 */
900static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
901{
902	struct cfq_data *cfqd = q->elevator->elevator_data;
903	struct cfq_queue *cfqq = RQ_CFQQ(rq);
904
905	cfq_remove_request(rq);
906	cfqq->dispatched++;
907	elv_dispatch_sort(q, rq);
908
909	if (cfq_cfqq_sync(cfqq))
910		cfqd->sync_flight++;
911}
912
913/*
914 * return expired entry, or NULL to just start from scratch in rbtree
915 */
916static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
917{
918	struct cfq_data *cfqd = cfqq->cfqd;
919	struct request *rq;
920	int fifo;
921
922	if (cfq_cfqq_fifo_expire(cfqq))
923		return NULL;
924
925	cfq_mark_cfqq_fifo_expire(cfqq);
926
927	if (list_empty(&cfqq->fifo))
928		return NULL;
929
930	fifo = cfq_cfqq_sync(cfqq);
931	rq = rq_entry_fifo(cfqq->fifo.next);
932
933	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
934		return NULL;
935
936	return rq;
937}
938
939static inline int
940cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
941{
942	const int base_rq = cfqd->cfq_slice_async_rq;
943
944	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
945
946	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
947}
948
949/*
950 * Select a queue for service. If we have a current active queue,
951 * check whether to continue servicing it, or retrieve and set a new one.
952 */
953static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
954{
955	struct cfq_queue *cfqq;
956
957	cfqq = cfqd->active_queue;
958	if (!cfqq)
959		goto new_queue;
960
961	/*
962	 * The active queue has run out of time, expire it and select new.
963	 */
964	if (cfq_slice_used(cfqq))
965		goto expire;
966
967	/*
968	 * The active queue has requests and isn't expired, allow it to
969	 * dispatch.
970	 */
971	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
972		goto keep_queue;
973
974	/*
975	 * No requests pending. If the active queue still has requests in
976	 * flight or is idling for a new request, allow either of these
977	 * conditions to happen (or time out) before selecting a new queue.
978	 */
979	if (timer_pending(&cfqd->idle_slice_timer) ||
980	    (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
981		cfqq = NULL;
982		goto keep_queue;
983	}
984
985expire:
986	cfq_slice_expired(cfqd, 0);
987new_queue:
988	cfqq = cfq_set_active_queue(cfqd);
989keep_queue:
990	return cfqq;
991}
992
993/*
994 * Dispatch some requests from cfqq, moving them to the request queue
995 * dispatch list.
996 */
997static int
998__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
999			int max_dispatch)
1000{
1001	int dispatched = 0;
1002
1003	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1004
1005	do {
1006		struct request *rq;
1007
1008		/*
1009		 * follow expired path, else get first next available
1010		 */
1011		if ((rq = cfq_check_fifo(cfqq)) == NULL)
1012			rq = cfqq->next_rq;
1013
1014		/*
1015		 * finally, insert request into driver dispatch list
1016		 */
1017		cfq_dispatch_insert(cfqd->queue, rq);
1018
1019		dispatched++;
1020
1021		if (!cfqd->active_cic) {
1022			atomic_inc(&RQ_CIC(rq)->ioc->refcount);
1023			cfqd->active_cic = RQ_CIC(rq);
1024		}
1025
1026		if (RB_EMPTY_ROOT(&cfqq->sort_list))
1027			break;
1028
1029	} while (dispatched < max_dispatch);
1030
1031	/*
1032	 * expire an async queue immediately if it has used up its slice. idle
1033	 * queue always expire after 1 dispatch round.
1034	 */
1035	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1036	    dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1037	    cfq_class_idle(cfqq))) {
1038		cfqq->slice_end = jiffies + 1;
1039		cfq_slice_expired(cfqd, 0);
1040	}
1041
1042	return dispatched;
1043}
1044
1045static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1046{
1047	int dispatched = 0;
1048
1049	while (cfqq->next_rq) {
1050		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1051		dispatched++;
1052	}
1053
1054	BUG_ON(!list_empty(&cfqq->fifo));
1055	return dispatched;
1056}
1057
1058/*
1059 * Drain our current requests. Used for barriers and when switching
1060 * io schedulers on-the-fly.
1061 */
1062static int cfq_forced_dispatch(struct cfq_data *cfqd)
1063{
1064	struct cfq_queue *cfqq;
1065	int dispatched = 0;
1066
1067	while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
1068		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1069
1070	cfq_slice_expired(cfqd, 0);
1071
1072	BUG_ON(cfqd->busy_queues);
1073
1074	return dispatched;
1075}
1076
1077static int cfq_dispatch_requests(struct request_queue *q, int force)
1078{
1079	struct cfq_data *cfqd = q->elevator->elevator_data;
1080	struct cfq_queue *cfqq;
1081	int dispatched;
1082
1083	if (!cfqd->busy_queues)
1084		return 0;
1085
1086	if (unlikely(force))
1087		return cfq_forced_dispatch(cfqd);
1088
1089	dispatched = 0;
1090	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
1091		int max_dispatch;
1092
1093		max_dispatch = cfqd->cfq_quantum;
1094		if (cfq_class_idle(cfqq))
1095			max_dispatch = 1;
1096
1097		if (cfqq->dispatched >= max_dispatch) {
1098			if (cfqd->busy_queues > 1)
1099				break;
1100			if (cfqq->dispatched >= 4 * max_dispatch)
1101				break;
1102		}
1103
1104		if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
1105			break;
1106
1107		cfq_clear_cfqq_must_dispatch(cfqq);
1108		cfq_clear_cfqq_wait_request(cfqq);
1109		del_timer(&cfqd->idle_slice_timer);
1110
1111		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1112	}
1113
1114	return dispatched;
1115}
1116
1117/*
1118 * task holds one reference to the queue, dropped when task exits. each rq
1119 * in-flight on this queue also holds a reference, dropped when rq is freed.
1120 *
1121 * queue lock must be held here.
1122 */
1123static void cfq_put_queue(struct cfq_queue *cfqq)
1124{
1125	struct cfq_data *cfqd = cfqq->cfqd;
1126
1127	BUG_ON(atomic_read(&cfqq->ref) <= 0);
1128
1129	if (!atomic_dec_and_test(&cfqq->ref))
1130		return;
1131
1132	BUG_ON(rb_first(&cfqq->sort_list));
1133	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1134	BUG_ON(cfq_cfqq_on_rr(cfqq));
1135
1136	if (unlikely(cfqd->active_queue == cfqq)) {
1137		__cfq_slice_expired(cfqd, cfqq, 0);
1138		cfq_schedule_dispatch(cfqd);
1139	}
1140
1141	kmem_cache_free(cfq_pool, cfqq);
1142}
1143
1144/*
1145 * Call func for each cic attached to this ioc. Returns number of cic's seen.
1146 */
1147#define CIC_GANG_NR	16
1148static unsigned int
1149call_for_each_cic(struct io_context *ioc,
1150		  void (*func)(struct io_context *, struct cfq_io_context *))
1151{
1152	struct cfq_io_context *cics[CIC_GANG_NR];
1153	unsigned long index = 0;
1154	unsigned int called = 0;
1155	int nr;
1156
1157	rcu_read_lock();
1158
1159	do {
1160		int i;
1161
1162		/*
1163		 * Perhaps there's a better way - this just gang lookups from
1164		 * 0 to the end, restarting after each CIC_GANG_NR from the
1165		 * last key + 1.
1166		 */
1167		nr = radix_tree_gang_lookup(&ioc->radix_root, (void **) cics,
1168						index, CIC_GANG_NR);
1169		if (!nr)
1170			break;
1171
1172		called += nr;
1173		index = 1 + (unsigned long) cics[nr - 1]->key;
1174
1175		for (i = 0; i < nr; i++)
1176			func(ioc, cics[i]);
1177	} while (nr == CIC_GANG_NR);
1178
1179	rcu_read_unlock();
1180
1181	return called;
1182}
1183
1184static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
1185{
1186	unsigned long flags;
1187
1188	BUG_ON(!cic->dead_key);
1189
1190	spin_lock_irqsave(&ioc->lock, flags);
1191	radix_tree_delete(&ioc->radix_root, cic->dead_key);
1192	spin_unlock_irqrestore(&ioc->lock, flags);
1193
1194	kmem_cache_free(cfq_ioc_pool, cic);
1195}
1196
1197static void cfq_free_io_context(struct io_context *ioc)
1198{
1199	int freed;
1200
1201	/*
1202	 * ioc->refcount is zero here, so no more cic's are allowed to be
1203	 * linked into this ioc. So it should be ok to iterate over the known
1204	 * list, we will see all cic's since no new ones are added.
1205	 */
1206	freed = call_for_each_cic(ioc, cic_free_func);
1207
1208	elv_ioc_count_mod(ioc_count, -freed);
1209
1210	if (ioc_gone && !elv_ioc_count_read(ioc_count))
1211		complete(ioc_gone);
1212}
1213
1214static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1215{
1216	if (unlikely(cfqq == cfqd->active_queue)) {
1217		__cfq_slice_expired(cfqd, cfqq, 0);
1218		cfq_schedule_dispatch(cfqd);
1219	}
1220
1221	cfq_put_queue(cfqq);
1222}
1223
1224static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1225					 struct cfq_io_context *cic)
1226{
1227	list_del_init(&cic->queue_list);
1228
1229	/*
1230	 * Make sure key == NULL is seen for dead queues
1231	 */
1232	smp_wmb();
1233	cic->dead_key = (unsigned long) cic->key;
1234	cic->key = NULL;
1235
1236	if (cic->cfqq[ASYNC]) {
1237		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
1238		cic->cfqq[ASYNC] = NULL;
1239	}
1240
1241	if (cic->cfqq[SYNC]) {
1242		cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
1243		cic->cfqq[SYNC] = NULL;
1244	}
1245}
1246
1247static void cfq_exit_single_io_context(struct io_context *ioc,
1248				       struct cfq_io_context *cic)
1249{
1250	struct cfq_data *cfqd = cic->key;
1251
1252	if (cfqd) {
1253		struct request_queue *q = cfqd->queue;
1254		unsigned long flags;
1255
1256		spin_lock_irqsave(q->queue_lock, flags);
1257		__cfq_exit_single_io_context(cfqd, cic);
1258		spin_unlock_irqrestore(q->queue_lock, flags);
1259	}
1260}
1261
1262/*
1263 * The process that ioc belongs to has exited, we need to clean up
1264 * and put the internal structures we have that belongs to that process.
1265 */
1266static void cfq_exit_io_context(struct io_context *ioc)
1267{
1268	rcu_assign_pointer(ioc->ioc_data, NULL);
1269	call_for_each_cic(ioc, cfq_exit_single_io_context);
1270}
1271
1272static struct cfq_io_context *
1273cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1274{
1275	struct cfq_io_context *cic;
1276
1277	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
1278							cfqd->queue->node);
1279	if (cic) {
1280		cic->last_end_request = jiffies;
1281		INIT_LIST_HEAD(&cic->queue_list);
1282		cic->dtor = cfq_free_io_context;
1283		cic->exit = cfq_exit_io_context;
1284		elv_ioc_count_inc(ioc_count);
1285	}
1286
1287	return cic;
1288}
1289
1290static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
1291{
1292	struct task_struct *tsk = current;
1293	int ioprio_class;
1294
1295	if (!cfq_cfqq_prio_changed(cfqq))
1296		return;
1297
1298	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
1299	switch (ioprio_class) {
1300		default:
1301			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1302		case IOPRIO_CLASS_NONE:
1303			/*
1304			 * no prio set, place us in the middle of the BE classes
1305			 */
1306			cfqq->ioprio = task_nice_ioprio(tsk);
1307			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1308			break;
1309		case IOPRIO_CLASS_RT:
1310			cfqq->ioprio = task_ioprio(ioc);
1311			cfqq->ioprio_class = IOPRIO_CLASS_RT;
1312			break;
1313		case IOPRIO_CLASS_BE:
1314			cfqq->ioprio = task_ioprio(ioc);
1315			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1316			break;
1317		case IOPRIO_CLASS_IDLE:
1318			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1319			cfqq->ioprio = 7;
1320			cfq_clear_cfqq_idle_window(cfqq);
1321			break;
1322	}
1323
1324	/*
1325	 * keep track of original prio settings in case we have to temporarily
1326	 * elevate the priority of this queue
1327	 */
1328	cfqq->org_ioprio = cfqq->ioprio;
1329	cfqq->org_ioprio_class = cfqq->ioprio_class;
1330	cfq_clear_cfqq_prio_changed(cfqq);
1331}
1332
1333static inline void changed_ioprio(struct io_context *ioc,
1334				  struct cfq_io_context *cic)
1335{
1336	struct cfq_data *cfqd = cic->key;
1337	struct cfq_queue *cfqq;
1338	unsigned long flags;
1339
1340	if (unlikely(!cfqd))
1341		return;
1342
1343	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1344
1345	cfqq = cic->cfqq[ASYNC];
1346	if (cfqq) {
1347		struct cfq_queue *new_cfqq;
1348		new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC);
1349		if (new_cfqq) {
1350			cic->cfqq[ASYNC] = new_cfqq;
1351			cfq_put_queue(cfqq);
1352		}
1353	}
1354
1355	cfqq = cic->cfqq[SYNC];
1356	if (cfqq)
1357		cfq_mark_cfqq_prio_changed(cfqq);
1358
1359	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1360}
1361
1362static void cfq_ioc_set_ioprio(struct io_context *ioc)
1363{
1364	call_for_each_cic(ioc, changed_ioprio);
1365	ioc->ioprio_changed = 0;
1366}
1367
1368static struct cfq_queue *
1369cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
1370		     struct io_context *ioc, gfp_t gfp_mask)
1371{
1372	struct cfq_queue *cfqq, *new_cfqq = NULL;
1373	struct cfq_io_context *cic;
1374
1375retry:
1376	cic = cfq_cic_lookup(cfqd, ioc);
1377	/* cic always exists here */
1378	cfqq = cic_to_cfqq(cic, is_sync);
1379
1380	if (!cfqq) {
1381		if (new_cfqq) {
1382			cfqq = new_cfqq;
1383			new_cfqq = NULL;
1384		} else if (gfp_mask & __GFP_WAIT) {
1385			/*
1386			 * Inform the allocator of the fact that we will
1387			 * just repeat this allocation if it fails, to allow
1388			 * the allocator to do whatever it needs to attempt to
1389			 * free memory.
1390			 */
1391			spin_unlock_irq(cfqd->queue->queue_lock);
1392			new_cfqq = kmem_cache_alloc_node(cfq_pool,
1393					gfp_mask | __GFP_NOFAIL | __GFP_ZERO,
1394					cfqd->queue->node);
1395			spin_lock_irq(cfqd->queue->queue_lock);
1396			goto retry;
1397		} else {
1398			cfqq = kmem_cache_alloc_node(cfq_pool,
1399					gfp_mask | __GFP_ZERO,
1400					cfqd->queue->node);
1401			if (!cfqq)
1402				goto out;
1403		}
1404
1405		RB_CLEAR_NODE(&cfqq->rb_node);
1406		INIT_LIST_HEAD(&cfqq->fifo);
1407
1408		atomic_set(&cfqq->ref, 0);
1409		cfqq->cfqd = cfqd;
1410
1411		cfq_mark_cfqq_prio_changed(cfqq);
1412		cfq_mark_cfqq_queue_new(cfqq);
1413
1414		cfq_init_prio_data(cfqq, ioc);
1415
1416		if (is_sync) {
1417			if (!cfq_class_idle(cfqq))
1418				cfq_mark_cfqq_idle_window(cfqq);
1419			cfq_mark_cfqq_sync(cfqq);
1420		}
1421	}
1422
1423	if (new_cfqq)
1424		kmem_cache_free(cfq_pool, new_cfqq);
1425
1426out:
1427	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1428	return cfqq;
1429}
1430
1431static struct cfq_queue **
1432cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
1433{
1434	switch(ioprio_class) {
1435	case IOPRIO_CLASS_RT:
1436		return &cfqd->async_cfqq[0][ioprio];
1437	case IOPRIO_CLASS_BE:
1438		return &cfqd->async_cfqq[1][ioprio];
1439	case IOPRIO_CLASS_IDLE:
1440		return &cfqd->async_idle_cfqq;
1441	default:
1442		BUG();
1443	}
1444}
1445
1446static struct cfq_queue *
1447cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
1448	      gfp_t gfp_mask)
1449{
1450	const int ioprio = task_ioprio(ioc);
1451	const int ioprio_class = task_ioprio_class(ioc);
1452	struct cfq_queue **async_cfqq = NULL;
1453	struct cfq_queue *cfqq = NULL;
1454
1455	if (!is_sync) {
1456		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
1457		cfqq = *async_cfqq;
1458	}
1459
1460	if (!cfqq) {
1461		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
1462		if (!cfqq)
1463			return NULL;
1464	}
1465
1466	/*
1467	 * pin the queue now that it's allocated, scheduler exit will prune it
1468	 */
1469	if (!is_sync && !(*async_cfqq)) {
1470		atomic_inc(&cfqq->ref);
1471		*async_cfqq = cfqq;
1472	}
1473
1474	atomic_inc(&cfqq->ref);
1475	return cfqq;
1476}
1477
1478static void cfq_cic_free(struct cfq_io_context *cic)
1479{
1480	kmem_cache_free(cfq_ioc_pool, cic);
1481	elv_ioc_count_dec(ioc_count);
1482
1483	if (ioc_gone && !elv_ioc_count_read(ioc_count))
1484		complete(ioc_gone);
1485}
1486
1487/*
1488 * We drop cfq io contexts lazily, so we may find a dead one.
1489 */
1490static void
1491cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
1492		  struct cfq_io_context *cic)
1493{
1494	unsigned long flags;
1495
1496	WARN_ON(!list_empty(&cic->queue_list));
1497
1498	spin_lock_irqsave(&ioc->lock, flags);
1499
1500	if (ioc->ioc_data == cic)
1501		rcu_assign_pointer(ioc->ioc_data, NULL);
1502
1503	radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
1504	spin_unlock_irqrestore(&ioc->lock, flags);
1505
1506	cfq_cic_free(cic);
1507}
1508
1509static struct cfq_io_context *
1510cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1511{
1512	struct cfq_io_context *cic;
1513	void *k;
1514
1515	if (unlikely(!ioc))
1516		return NULL;
1517
1518	/*
1519	 * we maintain a last-hit cache, to avoid browsing over the tree
1520	 */
1521	cic = rcu_dereference(ioc->ioc_data);
1522	if (cic && cic->key == cfqd)
1523		return cic;
1524
1525	do {
1526		rcu_read_lock();
1527		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
1528		rcu_read_unlock();
1529		if (!cic)
1530			break;
1531		/* ->key must be copied to avoid race with cfq_exit_queue() */
1532		k = cic->key;
1533		if (unlikely(!k)) {
1534			cfq_drop_dead_cic(cfqd, ioc, cic);
1535			continue;
1536		}
1537
1538		rcu_assign_pointer(ioc->ioc_data, cic);
1539		break;
1540	} while (1);
1541
1542	return cic;
1543}
1544
1545/*
1546 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
1547 * the process specific cfq io context when entered from the block layer.
1548 * Also adds the cic to a per-cfqd list, used when this queue is removed.
1549 */
1550static inline int
1551cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1552	     struct cfq_io_context *cic, gfp_t gfp_mask)
1553{
1554	unsigned long flags;
1555	int ret;
1556
1557	ret = radix_tree_preload(gfp_mask);
1558	if (!ret) {
1559		cic->ioc = ioc;
1560		cic->key = cfqd;
1561
1562		spin_lock_irqsave(&ioc->lock, flags);
1563		ret = radix_tree_insert(&ioc->radix_root,
1564						(unsigned long) cfqd, cic);
1565		spin_unlock_irqrestore(&ioc->lock, flags);
1566
1567		radix_tree_preload_end();
1568
1569		if (!ret) {
1570			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1571			list_add(&cic->queue_list, &cfqd->cic_list);
1572			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1573		}
1574	}
1575
1576	if (ret)
1577		printk(KERN_ERR "cfq: cic link failed!\n");
1578
1579	return ret;
1580}
1581
1582/*
1583 * Setup general io context and cfq io context. There can be several cfq
1584 * io contexts per general io context, if this process is doing io to more
1585 * than one device managed by cfq.
1586 */
1587static struct cfq_io_context *
1588cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1589{
1590	struct io_context *ioc = NULL;
1591	struct cfq_io_context *cic;
1592
1593	might_sleep_if(gfp_mask & __GFP_WAIT);
1594
1595	ioc = get_io_context(gfp_mask, cfqd->queue->node);
1596	if (!ioc)
1597		return NULL;
1598
1599	cic = cfq_cic_lookup(cfqd, ioc);
1600	if (cic)
1601		goto out;
1602
1603	cic = cfq_alloc_io_context(cfqd, gfp_mask);
1604	if (cic == NULL)
1605		goto err;
1606
1607	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
1608		goto err_free;
1609
1610out:
1611	smp_read_barrier_depends();
1612	if (unlikely(ioc->ioprio_changed))
1613		cfq_ioc_set_ioprio(ioc);
1614
1615	return cic;
1616err_free:
1617	cfq_cic_free(cic);
1618err:
1619	put_io_context(ioc);
1620	return NULL;
1621}
1622
1623static void
1624cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1625{
1626	unsigned long elapsed = jiffies - cic->last_end_request;
1627	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1628
1629	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1630	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1631	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1632}
1633
1634static void
1635cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1636		       struct request *rq)
1637{
1638	sector_t sdist;
1639	u64 total;
1640
1641	if (cic->last_request_pos < rq->sector)
1642		sdist = rq->sector - cic->last_request_pos;
1643	else
1644		sdist = cic->last_request_pos - rq->sector;
1645
1646	/*
1647	 * Don't allow the seek distance to get too large from the
1648	 * odd fragment, pagein, etc
1649	 */
1650	if (cic->seek_samples <= 60) /* second&third seek */
1651		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1652	else
1653		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
1654
1655	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1656	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1657	total = cic->seek_total + (cic->seek_samples/2);
1658	do_div(total, cic->seek_samples);
1659	cic->seek_mean = (sector_t)total;
1660}
1661
1662/*
1663 * Disable idle window if the process thinks too long or seeks so much that
1664 * it doesn't matter
1665 */
1666static void
1667cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1668		       struct cfq_io_context *cic)
1669{
1670	int enable_idle;
1671
1672	/*
1673	 * Don't idle for async or idle io prio class
1674	 */
1675	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
1676		return;
1677
1678	enable_idle = cfq_cfqq_idle_window(cfqq);
1679
1680	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
1681	    (cfqd->hw_tag && CIC_SEEKY(cic)))
1682		enable_idle = 0;
1683	else if (sample_valid(cic->ttime_samples)) {
1684		if (cic->ttime_mean > cfqd->cfq_slice_idle)
1685			enable_idle = 0;
1686		else
1687			enable_idle = 1;
1688	}
1689
1690	if (enable_idle)
1691		cfq_mark_cfqq_idle_window(cfqq);
1692	else
1693		cfq_clear_cfqq_idle_window(cfqq);
1694}
1695
1696/*
1697 * Check if new_cfqq should preempt the currently active queue. Return 0 for
1698 * no or if we aren't sure, a 1 will cause a preempt.
1699 */
1700static int
1701cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1702		   struct request *rq)
1703{
1704	struct cfq_queue *cfqq;
1705
1706	cfqq = cfqd->active_queue;
1707	if (!cfqq)
1708		return 0;
1709
1710	if (cfq_slice_used(cfqq))
1711		return 1;
1712
1713	if (cfq_class_idle(new_cfqq))
1714		return 0;
1715
1716	if (cfq_class_idle(cfqq))
1717		return 1;
1718
1719	/*
1720	 * if the new request is sync, but the currently running queue is
1721	 * not, let the sync request have priority.
1722	 */
1723	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
1724		return 1;
1725
1726	/*
1727	 * So both queues are sync. Let the new request get disk time if
1728	 * it's a metadata request and the current queue is doing regular IO.
1729	 */
1730	if (rq_is_meta(rq) && !cfqq->meta_pending)
1731		return 1;
1732
1733	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
1734		return 0;
1735
1736	/*
1737	 * if this request is as-good as one we would expect from the
1738	 * current cfqq, let it preempt
1739	 */
1740	if (cfq_rq_close(cfqd, rq))
1741		return 1;
1742
1743	return 0;
1744}
1745
1746/*
1747 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1748 * let it have half of its nominal slice.
1749 */
1750static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1751{
1752	cfq_slice_expired(cfqd, 1);
1753
1754	/*
1755	 * Put the new queue at the front of the of the current list,
1756	 * so we know that it will be selected next.
1757	 */
1758	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1759
1760	cfq_service_tree_add(cfqd, cfqq, 1);
1761
1762	cfqq->slice_end = 0;
1763	cfq_mark_cfqq_slice_new(cfqq);
1764}
1765
1766/*
1767 * Called when a new fs request (rq) is added (to cfqq). Check if there's
1768 * something we should do about it
1769 */
1770static void
1771cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1772		struct request *rq)
1773{
1774	struct cfq_io_context *cic = RQ_CIC(rq);
1775
1776	if (rq_is_meta(rq))
1777		cfqq->meta_pending++;
1778
1779	cfq_update_io_thinktime(cfqd, cic);
1780	cfq_update_io_seektime(cfqd, cic, rq);
1781	cfq_update_idle_window(cfqd, cfqq, cic);
1782
1783	cic->last_request_pos = rq->sector + rq->nr_sectors;
1784
1785	if (cfqq == cfqd->active_queue) {
1786		/*
1787		 * if we are waiting for a request for this queue, let it rip
1788		 * immediately and flag that we must not expire this queue
1789		 * just now
1790		 */
1791		if (cfq_cfqq_wait_request(cfqq)) {
1792			cfq_mark_cfqq_must_dispatch(cfqq);
1793			del_timer(&cfqd->idle_slice_timer);
1794			blk_start_queueing(cfqd->queue);
1795		}
1796	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1797		/*
1798		 * not the active queue - expire current slice if it is
1799		 * idle and has expired it's mean thinktime or this new queue
1800		 * has some old slice time left and is of higher priority
1801		 */
1802		cfq_preempt_queue(cfqd, cfqq);
1803		cfq_mark_cfqq_must_dispatch(cfqq);
1804		blk_start_queueing(cfqd->queue);
1805	}
1806}
1807
1808static void cfq_insert_request(struct request_queue *q, struct request *rq)
1809{
1810	struct cfq_data *cfqd = q->elevator->elevator_data;
1811	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1812
1813	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
1814
1815	cfq_add_rq_rb(rq);
1816
1817	list_add_tail(&rq->queuelist, &cfqq->fifo);
1818
1819	cfq_rq_enqueued(cfqd, cfqq, rq);
1820}
1821
1822static void cfq_completed_request(struct request_queue *q, struct request *rq)
1823{
1824	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1825	struct cfq_data *cfqd = cfqq->cfqd;
1826	const int sync = rq_is_sync(rq);
1827	unsigned long now;
1828
1829	now = jiffies;
1830
1831	WARN_ON(!cfqd->rq_in_driver);
1832	WARN_ON(!cfqq->dispatched);
1833	cfqd->rq_in_driver--;
1834	cfqq->dispatched--;
1835
1836	if (cfq_cfqq_sync(cfqq))
1837		cfqd->sync_flight--;
1838
1839	if (!cfq_class_idle(cfqq))
1840		cfqd->last_end_request = now;
1841
1842	if (sync)
1843		RQ_CIC(rq)->last_end_request = now;
1844
1845	/*
1846	 * If this is the active queue, check if it needs to be expired,
1847	 * or if we want to idle in case it has no pending requests.
1848	 */
1849	if (cfqd->active_queue == cfqq) {
1850		if (cfq_cfqq_slice_new(cfqq)) {
1851			cfq_set_prio_slice(cfqd, cfqq);
1852			cfq_clear_cfqq_slice_new(cfqq);
1853		}
1854		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
1855			cfq_slice_expired(cfqd, 1);
1856		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
1857			cfq_arm_slice_timer(cfqd);
1858	}
1859
1860	if (!cfqd->rq_in_driver)
1861		cfq_schedule_dispatch(cfqd);
1862}
1863
1864/*
1865 * we temporarily boost lower priority queues if they are holding fs exclusive
1866 * resources. they are boosted to normal prio (CLASS_BE/4)
1867 */
1868static void cfq_prio_boost(struct cfq_queue *cfqq)
1869{
1870	if (has_fs_excl()) {
1871		/*
1872		 * boost idle prio on transactions that would lock out other
1873		 * users of the filesystem
1874		 */
1875		if (cfq_class_idle(cfqq))
1876			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1877		if (cfqq->ioprio > IOPRIO_NORM)
1878			cfqq->ioprio = IOPRIO_NORM;
1879	} else {
1880		/*
1881		 * check if we need to unboost the queue
1882		 */
1883		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1884			cfqq->ioprio_class = cfqq->org_ioprio_class;
1885		if (cfqq->ioprio != cfqq->org_ioprio)
1886			cfqq->ioprio = cfqq->org_ioprio;
1887	}
1888}
1889
1890static inline int __cfq_may_queue(struct cfq_queue *cfqq)
1891{
1892	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1893	    !cfq_cfqq_must_alloc_slice(cfqq)) {
1894		cfq_mark_cfqq_must_alloc_slice(cfqq);
1895		return ELV_MQUEUE_MUST;
1896	}
1897
1898	return ELV_MQUEUE_MAY;
1899}
1900
1901static int cfq_may_queue(struct request_queue *q, int rw)
1902{
1903	struct cfq_data *cfqd = q->elevator->elevator_data;
1904	struct task_struct *tsk = current;
1905	struct cfq_io_context *cic;
1906	struct cfq_queue *cfqq;
1907
1908	/*
1909	 * don't force setup of a queue from here, as a call to may_queue
1910	 * does not necessarily imply that a request actually will be queued.
1911	 * so just lookup a possibly existing queue, or return 'may queue'
1912	 * if that fails
1913	 */
1914	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1915	if (!cic)
1916		return ELV_MQUEUE_MAY;
1917
1918	cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC);
1919	if (cfqq) {
1920		cfq_init_prio_data(cfqq, cic->ioc);
1921		cfq_prio_boost(cfqq);
1922
1923		return __cfq_may_queue(cfqq);
1924	}
1925
1926	return ELV_MQUEUE_MAY;
1927}
1928
1929/*
1930 * queue lock held here
1931 */
1932static void cfq_put_request(struct request *rq)
1933{
1934	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1935
1936	if (cfqq) {
1937		const int rw = rq_data_dir(rq);
1938
1939		BUG_ON(!cfqq->allocated[rw]);
1940		cfqq->allocated[rw]--;
1941
1942		put_io_context(RQ_CIC(rq)->ioc);
1943
1944		rq->elevator_private = NULL;
1945		rq->elevator_private2 = NULL;
1946
1947		cfq_put_queue(cfqq);
1948	}
1949}
1950
1951/*
1952 * Allocate cfq data structures associated with this request.
1953 */
1954static int
1955cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
1956{
1957	struct cfq_data *cfqd = q->elevator->elevator_data;
1958	struct cfq_io_context *cic;
1959	const int rw = rq_data_dir(rq);
1960	const int is_sync = rq_is_sync(rq);
1961	struct cfq_queue *cfqq;
1962	unsigned long flags;
1963
1964	might_sleep_if(gfp_mask & __GFP_WAIT);
1965
1966	cic = cfq_get_io_context(cfqd, gfp_mask);
1967
1968	spin_lock_irqsave(q->queue_lock, flags);
1969
1970	if (!cic)
1971		goto queue_fail;
1972
1973	cfqq = cic_to_cfqq(cic, is_sync);
1974	if (!cfqq) {
1975		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
1976
1977		if (!cfqq)
1978			goto queue_fail;
1979
1980		cic_set_cfqq(cic, cfqq, is_sync);
1981	}
1982
1983	cfqq->allocated[rw]++;
1984	cfq_clear_cfqq_must_alloc(cfqq);
1985	atomic_inc(&cfqq->ref);
1986
1987	spin_unlock_irqrestore(q->queue_lock, flags);
1988
1989	rq->elevator_private = cic;
1990	rq->elevator_private2 = cfqq;
1991	return 0;
1992
1993queue_fail:
1994	if (cic)
1995		put_io_context(cic->ioc);
1996
1997	cfq_schedule_dispatch(cfqd);
1998	spin_unlock_irqrestore(q->queue_lock, flags);
1999	return 1;
2000}
2001
2002static void cfq_kick_queue(struct work_struct *work)
2003{
2004	struct cfq_data *cfqd =
2005		container_of(work, struct cfq_data, unplug_work);
2006	struct request_queue *q = cfqd->queue;
2007	unsigned long flags;
2008
2009	spin_lock_irqsave(q->queue_lock, flags);
2010	blk_start_queueing(q);
2011	spin_unlock_irqrestore(q->queue_lock, flags);
2012}
2013
2014/*
2015 * Timer running if the active_queue is currently idling inside its time slice
2016 */
2017static void cfq_idle_slice_timer(unsigned long data)
2018{
2019	struct cfq_data *cfqd = (struct cfq_data *) data;
2020	struct cfq_queue *cfqq;
2021	unsigned long flags;
2022	int timed_out = 1;
2023
2024	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2025
2026	if ((cfqq = cfqd->active_queue) != NULL) {
2027		timed_out = 0;
2028
2029		/*
2030		 * expired
2031		 */
2032		if (cfq_slice_used(cfqq))
2033			goto expire;
2034
2035		/*
2036		 * only expire and reinvoke request handler, if there are
2037		 * other queues with pending requests
2038		 */
2039		if (!cfqd->busy_queues)
2040			goto out_cont;
2041
2042		/*
2043		 * not expired and it has a request pending, let it dispatch
2044		 */
2045		if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
2046			cfq_mark_cfqq_must_dispatch(cfqq);
2047			goto out_kick;
2048		}
2049	}
2050expire:
2051	cfq_slice_expired(cfqd, timed_out);
2052out_kick:
2053	cfq_schedule_dispatch(cfqd);
2054out_cont:
2055	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2056}
2057
2058static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2059{
2060	del_timer_sync(&cfqd->idle_slice_timer);
2061	kblockd_flush_work(&cfqd->unplug_work);
2062}
2063
2064static void cfq_put_async_queues(struct cfq_data *cfqd)
2065{
2066	int i;
2067
2068	for (i = 0; i < IOPRIO_BE_NR; i++) {
2069		if (cfqd->async_cfqq[0][i])
2070			cfq_put_queue(cfqd->async_cfqq[0][i]);
2071		if (cfqd->async_cfqq[1][i])
2072			cfq_put_queue(cfqd->async_cfqq[1][i]);
2073	}
2074
2075	if (cfqd->async_idle_cfqq)
2076		cfq_put_queue(cfqd->async_idle_cfqq);
2077}
2078
2079static void cfq_exit_queue(elevator_t *e)
2080{
2081	struct cfq_data *cfqd = e->elevator_data;
2082	struct request_queue *q = cfqd->queue;
2083
2084	cfq_shutdown_timer_wq(cfqd);
2085
2086	spin_lock_irq(q->queue_lock);
2087
2088	if (cfqd->active_queue)
2089		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2090
2091	while (!list_empty(&cfqd->cic_list)) {
2092		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2093							struct cfq_io_context,
2094							queue_list);
2095
2096		__cfq_exit_single_io_context(cfqd, cic);
2097	}
2098
2099	cfq_put_async_queues(cfqd);
2100
2101	spin_unlock_irq(q->queue_lock);
2102
2103	cfq_shutdown_timer_wq(cfqd);
2104
2105	kfree(cfqd);
2106}
2107
2108static void *cfq_init_queue(struct request_queue *q)
2109{
2110	struct cfq_data *cfqd;
2111
2112	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2113	if (!cfqd)
2114		return NULL;
2115
2116	cfqd->service_tree = CFQ_RB_ROOT;
2117	INIT_LIST_HEAD(&cfqd->cic_list);
2118
2119	cfqd->queue = q;
2120
2121	init_timer(&cfqd->idle_slice_timer);
2122	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2123	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2124
2125	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
2126
2127	cfqd->last_end_request = jiffies;
2128	cfqd->cfq_quantum = cfq_quantum;
2129	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2130	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2131	cfqd->cfq_back_max = cfq_back_max;
2132	cfqd->cfq_back_penalty = cfq_back_penalty;
2133	cfqd->cfq_slice[0] = cfq_slice_async;
2134	cfqd->cfq_slice[1] = cfq_slice_sync;
2135	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2136	cfqd->cfq_slice_idle = cfq_slice_idle;
2137
2138	return cfqd;
2139}
2140
2141static void cfq_slab_kill(void)
2142{
2143	if (cfq_pool)
2144		kmem_cache_destroy(cfq_pool);
2145	if (cfq_ioc_pool)
2146		kmem_cache_destroy(cfq_ioc_pool);
2147}
2148
2149static int __init cfq_slab_setup(void)
2150{
2151	cfq_pool = KMEM_CACHE(cfq_queue, 0);
2152	if (!cfq_pool)
2153		goto fail;
2154
2155	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, SLAB_DESTROY_BY_RCU);
2156	if (!cfq_ioc_pool)
2157		goto fail;
2158
2159	return 0;
2160fail:
2161	cfq_slab_kill();
2162	return -ENOMEM;
2163}
2164
2165/*
2166 * sysfs parts below -->
2167 */
2168static ssize_t
2169cfq_var_show(unsigned int var, char *page)
2170{
2171	return sprintf(page, "%d\n", var);
2172}
2173
2174static ssize_t
2175cfq_var_store(unsigned int *var, const char *page, size_t count)
2176{
2177	char *p = (char *) page;
2178
2179	*var = simple_strtoul(p, &p, 10);
2180	return count;
2181}
2182
2183#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
2184static ssize_t __FUNC(elevator_t *e, char *page)			\
2185{									\
2186	struct cfq_data *cfqd = e->elevator_data;			\
2187	unsigned int __data = __VAR;					\
2188	if (__CONV)							\
2189		__data = jiffies_to_msecs(__data);			\
2190	return cfq_var_show(__data, (page));				\
2191}
2192SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2193SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2194SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2195SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2196SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2197SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2198SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2199SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2200SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2201#undef SHOW_FUNCTION
2202
2203#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
2204static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
2205{									\
2206	struct cfq_data *cfqd = e->elevator_data;			\
2207	unsigned int __data;						\
2208	int ret = cfq_var_store(&__data, (page), count);		\
2209	if (__data < (MIN))						\
2210		__data = (MIN);						\
2211	else if (__data > (MAX))					\
2212		__data = (MAX);						\
2213	if (__CONV)							\
2214		*(__PTR) = msecs_to_jiffies(__data);			\
2215	else								\
2216		*(__PTR) = __data;					\
2217	return ret;							\
2218}
2219STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2220STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2221STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2222STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2223STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2224STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2225STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2226STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2227STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2228#undef STORE_FUNCTION
2229
2230#define CFQ_ATTR(name) \
2231	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2232
2233static struct elv_fs_entry cfq_attrs[] = {
2234	CFQ_ATTR(quantum),
2235	CFQ_ATTR(fifo_expire_sync),
2236	CFQ_ATTR(fifo_expire_async),
2237	CFQ_ATTR(back_seek_max),
2238	CFQ_ATTR(back_seek_penalty),
2239	CFQ_ATTR(slice_sync),
2240	CFQ_ATTR(slice_async),
2241	CFQ_ATTR(slice_async_rq),
2242	CFQ_ATTR(slice_idle),
2243	__ATTR_NULL
2244};
2245
2246static struct elevator_type iosched_cfq = {
2247	.ops = {
2248		.elevator_merge_fn = 		cfq_merge,
2249		.elevator_merged_fn =		cfq_merged_request,
2250		.elevator_merge_req_fn =	cfq_merged_requests,
2251		.elevator_allow_merge_fn =	cfq_allow_merge,
2252		.elevator_dispatch_fn =		cfq_dispatch_requests,
2253		.elevator_add_req_fn =		cfq_insert_request,
2254		.elevator_activate_req_fn =	cfq_activate_request,
2255		.elevator_deactivate_req_fn =	cfq_deactivate_request,
2256		.elevator_queue_empty_fn =	cfq_queue_empty,
2257		.elevator_completed_req_fn =	cfq_completed_request,
2258		.elevator_former_req_fn =	elv_rb_former_request,
2259		.elevator_latter_req_fn =	elv_rb_latter_request,
2260		.elevator_set_req_fn =		cfq_set_request,
2261		.elevator_put_req_fn =		cfq_put_request,
2262		.elevator_may_queue_fn =	cfq_may_queue,
2263		.elevator_init_fn =		cfq_init_queue,
2264		.elevator_exit_fn =		cfq_exit_queue,
2265		.trim =				cfq_free_io_context,
2266	},
2267	.elevator_attrs =	cfq_attrs,
2268	.elevator_name =	"cfq",
2269	.elevator_owner =	THIS_MODULE,
2270};
2271
2272static int __init cfq_init(void)
2273{
2274	/*
2275	 * could be 0 on HZ < 1000 setups
2276	 */
2277	if (!cfq_slice_async)
2278		cfq_slice_async = 1;
2279	if (!cfq_slice_idle)
2280		cfq_slice_idle = 1;
2281
2282	if (cfq_slab_setup())
2283		return -ENOMEM;
2284
2285	elv_register(&iosched_cfq);
2286
2287	return 0;
2288}
2289
2290static void __exit cfq_exit(void)
2291{
2292	DECLARE_COMPLETION_ONSTACK(all_gone);
2293	elv_unregister(&iosched_cfq);
2294	ioc_gone = &all_gone;
2295	/* ioc_gone's update must be visible before reading ioc_count */
2296	smp_wmb();
2297	if (elv_ioc_count_read(ioc_count))
2298		wait_for_completion(ioc_gone);
2299	synchronize_rcu();
2300	cfq_slab_kill();
2301}
2302
2303module_init(cfq_init);
2304module_exit(cfq_exit);
2305
2306MODULE_AUTHOR("Jens Axboe");
2307MODULE_LICENSE("GPL");
2308MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
2309