cfq-iosched.c revision 67060e37994444ee9c0bd2413c8baa6cc58e7adb
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/hash.h>
13#include <linux/rbtree.h>
14#include <linux/ioprio.h>
15
16/*
17 * tunables
18 */
19static const int cfq_quantum = 4;		/* max queue in one round of service */
20static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
21static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
22static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
23
24static const int cfq_slice_sync = HZ / 10;
25static int cfq_slice_async = HZ / 25;
26static const int cfq_slice_async_rq = 2;
27static int cfq_slice_idle = HZ / 125;
28
29/*
30 * grace period before allowing idle class to get disk access
31 */
32#define CFQ_IDLE_GRACE		(HZ / 10)
33
34/*
35 * below this threshold, we consider thinktime immediate
36 */
37#define CFQ_MIN_TT		(2)
38
39#define CFQ_SLICE_SCALE		(5)
40
41#define CFQ_KEY_ASYNC		(0)
42
43/*
44 * for the hash of cfqq inside the cfqd
45 */
46#define CFQ_QHASH_SHIFT		6
47#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
48#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
49
50#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
51
52#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
53#define RQ_CFQQ(rq)		((rq)->elevator_private2)
54
55static struct kmem_cache *cfq_pool;
56static struct kmem_cache *cfq_ioc_pool;
57
58static DEFINE_PER_CPU(unsigned long, ioc_count);
59static struct completion *ioc_gone;
60
61#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
62#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
63#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
64
65#define ASYNC			(0)
66#define SYNC			(1)
67
68#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
69
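/*
 * we only trust the think time and seek statistics once we have seen
 * enough samples
 */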
70#define sample_valid(samples)	((samples) > 80)
71
72/*
73 * Most of our rbtree usage is for sorting with min extraction, so
74 * if we cache the leftmost node we don't have to walk down the tree
75 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
76 * move this into the elevator for the rq sorting as well.
77 */
78struct cfq_rb_root {
79	struct rb_root rb;
80	struct rb_node *left;
81};
82#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }
83
84/*
85 * Per block device queue structure
86 */
87struct cfq_data {
88	request_queue_t *queue;
89
90	/*
91	 * rr list of queues with requests and the count of them
92	 */
93	struct cfq_rb_root service_tree;
94	struct list_head cur_rr;
95	unsigned int busy_queues;
96
97	/*
98	 * cfqq lookup hash
99	 */
100	struct hlist_head *cfq_hash;
101
102	int rq_in_driver;
103	int hw_tag;
104
105	/*
106	 * idle window management
107	 */
108	struct timer_list idle_slice_timer;
109	struct work_struct unplug_work;
110
111	struct cfq_queue *active_queue;
112	struct cfq_io_context *active_cic;
113	unsigned int dispatch_slice;
114
115	struct timer_list idle_class_timer;
116
117	sector_t last_position;
118	unsigned long last_end_request;
119
120	/*
121	 * tunables, see top of file
122	 */
123	unsigned int cfq_quantum;
124	unsigned int cfq_fifo_expire[2];
125	unsigned int cfq_back_penalty;
126	unsigned int cfq_back_max;
127	unsigned int cfq_slice[2];
128	unsigned int cfq_slice_async_rq;
129	unsigned int cfq_slice_idle;
130
131	struct list_head cic_list;
132
133	sector_t new_seek_mean;
134	u64 new_seek_total;
135};
136
137/*
138 * Per process-grouping structure
139 */
140struct cfq_queue {
141	/* reference count */
142	atomic_t ref;
143	/* parent cfq_data */
144	struct cfq_data *cfqd;
145	/* cfqq lookup hash */
146	struct hlist_node cfq_hash;
147	/* hash key */
148	unsigned int key;
149	/* member of the rr/busy/cur/idle cfqd list */
150	struct list_head cfq_list;
151	/* service_tree member */
152	struct rb_node rb_node;
153	/* service_tree key */
154	unsigned long rb_key;
155	/* sorted list of pending requests */
156	struct rb_root sort_list;
157	/* if fifo isn't expired, next request to serve */
158	struct request *next_rq;
159	/* requests queued in sort_list */
160	int queued[2];
161	/* currently allocated requests */
162	int allocated[2];
163	/* pending metadata requests */
164	int meta_pending;
165	/* fifo list of requests in sort_list */
166	struct list_head fifo;
167
168	unsigned long slice_end;
169	long slice_resid;
170
171	/* number of requests that are on the dispatch list or inside driver */
172	int dispatched;
173
174	/* io prio of this group */
175	unsigned short ioprio, org_ioprio;
176	unsigned short ioprio_class, org_ioprio_class;
177
178	/* various state flags, see below */
179	unsigned int flags;
180
181	sector_t last_request_pos;
182};
183
184enum cfqq_state_flags {
185	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
186	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
187	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
188	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
189	CFQ_CFQQ_FLAG_must_dispatch,	/* must dispatch, even if expired */
190	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
191	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
192	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
193	CFQ_CFQQ_FLAG_queue_new,	/* queue never been serviced */
194	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
195};
196
197#define CFQ_CFQQ_FNS(name)						\
198static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
199{									\
200	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
201}									\
202static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
203{									\
204	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
205}									\
206static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
207{									\
208	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
209}
210
211CFQ_CFQQ_FNS(on_rr);
212CFQ_CFQQ_FNS(wait_request);
213CFQ_CFQQ_FNS(must_alloc);
214CFQ_CFQQ_FNS(must_alloc_slice);
215CFQ_CFQQ_FNS(must_dispatch);
216CFQ_CFQQ_FNS(fifo_expire);
217CFQ_CFQQ_FNS(idle_window);
218CFQ_CFQQ_FNS(prio_changed);
219CFQ_CFQQ_FNS(queue_new);
220CFQ_CFQQ_FNS(slice_new);
221#undef CFQ_CFQQ_FNS
222
223static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
224static void cfq_dispatch_insert(request_queue_t *, struct request *);
225static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
226
227/*
228 * schedule a run of the queue, if there are requests pending and nothing
229 * in the driver that will restart queueing
230 */
231static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
232{
233	if (cfqd->busy_queues)
234		kblockd_schedule_work(&cfqd->unplug_work);
235}
236
237static int cfq_queue_empty(request_queue_t *q)
238{
239	struct cfq_data *cfqd = q->elevator->elevator_data;
240
241	return !cfqd->busy_queues;
242}
243
244static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
245{
246	/*
247	 * Use the per-process queue for read requests and synchronous writes
248	 */
249	if (!(rw & REQ_RW) || is_sync)
250		return task->pid;
251
252	return CFQ_KEY_ASYNC;
253}
254
255/*
256 * Scale schedule slice based on io priority. Use the sync time slice only
257 * if a queue is marked sync and has sync io queued. A sync queue with async
258 * io only should not get the full sync slice length.
259 */
260static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
261				 unsigned short prio)
262{
263	const int base_slice = cfqd->cfq_slice[sync];
264
265	WARN_ON(prio >= IOPRIO_BE_NR);
266
267	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
268}
269
270static inline int
271cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
272{
273	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
274}
275
276static inline void
277cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
278{
279	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
280}
281
282/*
283 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
284 * isn't valid until the first request from the dispatch is activated
285 * and the slice time set.
286 */
287static inline int cfq_slice_used(struct cfq_queue *cfqq)
288{
289	if (cfq_cfqq_slice_new(cfqq))
290		return 0;
291	if (time_before(jiffies, cfqq->slice_end))
292		return 0;
293
294	return 1;
295}
296
297/*
298 * Lifted from AS - choose which of rq1 and rq2 is best served now.
299 * We choose the request that is closest to the head right now. Distance
300 * behind the head is penalized and only allowed to a certain extent.
301 */
302static struct request *
303cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
304{
305	sector_t last, s1, s2, d1 = 0, d2 = 0;
306	unsigned long back_max;
307#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
308#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
309	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
310
311	if (rq1 == NULL || rq1 == rq2)
312		return rq2;
313	if (rq2 == NULL)
314		return rq1;
315
316	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
317		return rq1;
318	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
319		return rq2;
320	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
321		return rq1;
322	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
323		return rq2;
324
325	s1 = rq1->sector;
326	s2 = rq2->sector;
327
328	last = cfqd->last_position;
329
330	/*
331	 * by definition, 1KiB is 2 sectors
332	 */
333	back_max = cfqd->cfq_back_max * 2;
334
335	/*
336	 * Strict one way elevator _except_ in the case where we allow
337	 * short backward seeks which are biased as twice the cost of a
338	 * similar forward seek.
339	 */
340	if (s1 >= last)
341		d1 = s1 - last;
342	else if (s1 + back_max >= last)
343		d1 = (last - s1) * cfqd->cfq_back_penalty;
344	else
345		wrap |= CFQ_RQ1_WRAP;
346
347	if (s2 >= last)
348		d2 = s2 - last;
349	else if (s2 + back_max >= last)
350		d2 = (last - s2) * cfqd->cfq_back_penalty;
351	else
352		wrap |= CFQ_RQ2_WRAP;
353
354	/* Found required data */
355
356	/*
357	 * By doing switch() on the bit mask "wrap" we avoid having to
358	 * check two variables for all permutations: --> faster!
359	 */
360	switch (wrap) {
361	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
362		if (d1 < d2)
363			return rq1;
364		else if (d2 < d1)
365			return rq2;
366		else {
367			if (s1 >= s2)
368				return rq1;
369			else
370				return rq2;
371		}
372
373	case CFQ_RQ2_WRAP:
374		return rq1;
375	case CFQ_RQ1_WRAP:
376		return rq2;
377	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
378	default:
379		/*
380		 * Since both rqs are wrapped,
381		 * start with the one that's further behind head
382		 * (--> only *one* back seek required),
383		 * since back seek takes more time than forward.
384		 */
385		if (s1 <= s2)
386			return rq1;
387		else
388			return rq2;
389	}
390}
391
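/*
 * return the leftmost (smallest rb_key) entry of the service tree, using
 * the cached pointer when it is valid
 */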
392static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
393{
394	if (!root->left)
395		root->left = rb_first(&root->rb);
396
397	return root->left;
398}
399
400static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
401{
402	if (root->left == n)
403		root->left = NULL;
404
405	rb_erase(n, &root->rb);
406	RB_CLEAR_NODE(n);
407}
408
409/*
410 * would be nice to take fifo expire time into account as well
411 */
412static struct request *
413cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
414		  struct request *last)
415{
416	struct rb_node *rbnext = rb_next(&last->rb_node);
417	struct rb_node *rbprev = rb_prev(&last->rb_node);
418	struct request *next = NULL, *prev = NULL;
419
420	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
421
422	if (rbprev)
423		prev = rb_entry_rq(rbprev);
424
425	if (rbnext)
426		next = rb_entry_rq(rbnext);
427	else {
428		rbnext = rb_first(&cfqq->sort_list);
429		if (rbnext && rbnext != &last->rb_node)
430			next = rb_entry_rq(rbnext);
431	}
432
433	return cfq_choose_req(cfqd, next, prev);
434}
435
436static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
437				      struct cfq_queue *cfqq)
438{
439	/*
440	 * just an approximation, should be ok.
441	 */
442	return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0));
443}
444
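/*
 * insert cfqq into the service tree, ordered by class (RT first, IDLE last)
 * and then by rb_key, which approximates when the queue should be serviced
 */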
445static void cfq_service_tree_add(struct cfq_data *cfqd,
446				    struct cfq_queue *cfqq)
447{
448	struct rb_node **p = &cfqd->service_tree.rb.rb_node;
449	struct rb_node *parent = NULL;
450	unsigned long rb_key;
451	int left = 1;
452
453	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
454	rb_key += cfqq->slice_resid;
455	cfqq->slice_resid = 0;
456
457	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
458		/*
459		 * same position, nothing more to do
460		 */
461		if (rb_key == cfqq->rb_key)
462			return;
463
464		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
465	}
466
467	while (*p) {
468		struct cfq_queue *__cfqq;
469		struct rb_node **n;
470
471		parent = *p;
472		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
473
474		/*
475		 * sort RT queues first, we always want to give
476		 * preference to them. IDLE queues go to the back.
477		 * after that, sort on the next service time.
478		 */
479		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
480			n = &(*p)->rb_left;
481		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
482			n = &(*p)->rb_right;
483		else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
484			n = &(*p)->rb_left;
485		else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
486			n = &(*p)->rb_right;
487		else if (rb_key < __cfqq->rb_key)
488			n = &(*p)->rb_left;
489		else
490			n = &(*p)->rb_right;
491
492		if (n == &(*p)->rb_right)
493			left = 0;
494
495		p = n;
496	}
497
498	if (left)
499		cfqd->service_tree.left = &cfqq->rb_node;
500
501	cfqq->rb_key = rb_key;
502	rb_link_node(&cfqq->rb_node, parent, p);
503	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
504}
505
506static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
507{
508	struct cfq_data *cfqd = cfqq->cfqd;
509
510	/*
511	 * Resorting requires the cfqq to be on the RR list already.
512	 */
513	if (!cfq_cfqq_on_rr(cfqq))
514		return;
515
516	cfq_service_tree_add(cfqd, cfqq);
517}
518
519/*
520 * add to busy list of queues for service, trying to be fair in ordering
521 * the pending list according to last request service
522 */
523static inline void
524cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
525{
526	BUG_ON(cfq_cfqq_on_rr(cfqq));
527	cfq_mark_cfqq_on_rr(cfqq);
528	cfqd->busy_queues++;
529
530	cfq_resort_rr_list(cfqq, 0);
531}
532
533static inline void
534cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
535{
536	BUG_ON(!cfq_cfqq_on_rr(cfqq));
537	cfq_clear_cfqq_on_rr(cfqq);
538	list_del_init(&cfqq->cfq_list);
539
540	if (!RB_EMPTY_NODE(&cfqq->rb_node))
541		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
542
543	BUG_ON(!cfqd->busy_queues);
544	cfqd->busy_queues--;
545}
546
547/*
548 * rb tree support functions
549 */
550static inline void cfq_del_rq_rb(struct request *rq)
551{
552	struct cfq_queue *cfqq = RQ_CFQQ(rq);
553	struct cfq_data *cfqd = cfqq->cfqd;
554	const int sync = rq_is_sync(rq);
555
556	BUG_ON(!cfqq->queued[sync]);
557	cfqq->queued[sync]--;
558
559	elv_rb_del(&cfqq->sort_list, rq);
560
561	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
562		cfq_del_cfqq_rr(cfqd, cfqq);
563}
564
565static void cfq_add_rq_rb(struct request *rq)
566{
567	struct cfq_queue *cfqq = RQ_CFQQ(rq);
568	struct cfq_data *cfqd = cfqq->cfqd;
569	struct request *__alias;
570
571	cfqq->queued[rq_is_sync(rq)]++;
572
573	/*
574	 * looks a little odd, but the first insert might return an alias.
575	 * if that happens, put the alias on the dispatch list
576	 */
577	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
578		cfq_dispatch_insert(cfqd->queue, __alias);
579
580	if (!cfq_cfqq_on_rr(cfqq))
581		cfq_add_cfqq_rr(cfqd, cfqq);
582
583	/*
584	 * check if this request is a better next-serve candidate
585	 */
586	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
587	BUG_ON(!cfqq->next_rq);
588}
589
590static inline void
591cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
592{
593	elv_rb_del(&cfqq->sort_list, rq);
594	cfqq->queued[rq_is_sync(rq)]--;
595	cfq_add_rq_rb(rq);
596}
597
598static struct request *
599cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
600{
601	struct task_struct *tsk = current;
602	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio));
603	struct cfq_queue *cfqq;
604
605	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
606	if (cfqq) {
607		sector_t sector = bio->bi_sector + bio_sectors(bio);
608
609		return elv_rb_find(&cfqq->sort_list, sector);
610	}
611
612	return NULL;
613}
614
615static void cfq_activate_request(request_queue_t *q, struct request *rq)
616{
617	struct cfq_data *cfqd = q->elevator->elevator_data;
618
619	cfqd->rq_in_driver++;
620
621	/*
622	 * If the depth is larger than 1, it really could be queueing. But let's
623	 * make the mark a little higher - idling could still be good for
624	 * low queueing, and a low queueing number could also just indicate
625	 * a SCSI-mid-layer-like behaviour where limit+1 is often seen.
626	 */
627	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
628		cfqd->hw_tag = 1;
629
630	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
631}
632
633static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
634{
635	struct cfq_data *cfqd = q->elevator->elevator_data;
636
637	WARN_ON(!cfqd->rq_in_driver);
638	cfqd->rq_in_driver--;
639}
640
641static void cfq_remove_request(struct request *rq)
642{
643	struct cfq_queue *cfqq = RQ_CFQQ(rq);
644
645	if (cfqq->next_rq == rq)
646		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
647
648	list_del_init(&rq->queuelist);
649	cfq_del_rq_rb(rq);
650
651	if (rq_is_meta(rq)) {
652		WARN_ON(!cfqq->meta_pending);
653		cfqq->meta_pending--;
654	}
655}
656
657static int
658cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
659{
660	struct cfq_data *cfqd = q->elevator->elevator_data;
661	struct request *__rq;
662
663	__rq = cfq_find_rq_fmerge(cfqd, bio);
664	if (__rq && elv_rq_merge_ok(__rq, bio)) {
665		*req = __rq;
666		return ELEVATOR_FRONT_MERGE;
667	}
668
669	return ELEVATOR_NO_MERGE;
670}
671
672static void cfq_merged_request(request_queue_t *q, struct request *req,
673			       int type)
674{
675	if (type == ELEVATOR_FRONT_MERGE) {
676		struct cfq_queue *cfqq = RQ_CFQQ(req);
677
678		cfq_reposition_rq_rb(cfqq, req);
679	}
680}
681
682static void
683cfq_merged_requests(request_queue_t *q, struct request *rq,
684		    struct request *next)
685{
686	/*
687	 * reposition in fifo if next is older than rq
688	 */
689	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
690	    time_before(next->start_time, rq->start_time))
691		list_move(&rq->queuelist, &next->queuelist);
692
693	cfq_remove_request(next);
694}
695
696static int cfq_allow_merge(request_queue_t *q, struct request *rq,
697			   struct bio *bio)
698{
699	struct cfq_data *cfqd = q->elevator->elevator_data;
700	const int rw = bio_data_dir(bio);
701	struct cfq_queue *cfqq;
702	pid_t key;
703
704	/*
705	 * Disallow merge of a sync bio into an async request.
706	 */
707	if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq))
708		return 0;
709
710	/*
711	 * Look up the cfqq that this bio will be queued with. Allow
712	 * merge only if rq is queued there.
713	 */
714	key = cfq_queue_pid(current, rw, bio_sync(bio));
715	cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio);
716
717	if (cfqq == RQ_CFQQ(rq))
718		return 1;
719
720	return 0;
721}
722
723static inline void
724__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
725{
726	if (cfqq) {
727		/*
728		 * stop potential idle class queues waiting service
729		 */
730		del_timer(&cfqd->idle_class_timer);
731
732		cfqq->slice_end = 0;
733		cfq_clear_cfqq_must_alloc_slice(cfqq);
734		cfq_clear_cfqq_fifo_expire(cfqq);
735		cfq_mark_cfqq_slice_new(cfqq);
736		cfq_clear_cfqq_queue_new(cfqq);
737	}
738
739	cfqd->active_queue = cfqq;
740}
741
742/*
743 * current cfqq expired its slice (or was too idle), select new one
744 */
745static void
746__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
747		    int preempted, int timed_out)
748{
749	if (cfq_cfqq_wait_request(cfqq))
750		del_timer(&cfqd->idle_slice_timer);
751
752	cfq_clear_cfqq_must_dispatch(cfqq);
753	cfq_clear_cfqq_wait_request(cfqq);
754
755	/*
756	 * store what was left of this slice, if the queue idled out
757	 * or was preempted
758	 */
759	if (timed_out && !cfq_cfqq_slice_new(cfqq))
760		cfqq->slice_resid = cfqq->slice_end - jiffies;
761
762	cfq_resort_rr_list(cfqq, preempted);
763
764	if (cfqq == cfqd->active_queue)
765		cfqd->active_queue = NULL;
766
767	if (cfqd->active_cic) {
768		put_io_context(cfqd->active_cic->ioc);
769		cfqd->active_cic = NULL;
770	}
771
772	cfqd->dispatch_slice = 0;
773}
774
775static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
776				     int timed_out)
777{
778	struct cfq_queue *cfqq = cfqd->active_queue;
779
780	if (cfqq)
781		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
782}
783
784static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
785{
786	struct cfq_queue *cfqq = NULL;
787
788	if (!list_empty(&cfqd->cur_rr)) {
789		/*
790		 * if current list is non-empty, grab first entry.
791		 */
792		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
793	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
794		struct rb_node *n = cfq_rb_first(&cfqd->service_tree);
795		unsigned long end;
796
797		cfqq = rb_entry(n, struct cfq_queue, rb_node);
798		if (cfq_class_idle(cfqq)) {
799			/*
800			 * if we have idle queues and no rt or be queues have
801			 * pending requests, either allow immediate service if
802			 * the grace period has passed or arm the idle grace
803			 * timer
804			 */
805			end = cfqd->last_end_request + CFQ_IDLE_GRACE;
806			if (time_before(jiffies, end)) {
807				mod_timer(&cfqd->idle_class_timer, end);
808				cfqq = NULL;
809			}
810		}
811	}
812
813	return cfqq;
814}
815
816static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
817{
818	struct cfq_queue *cfqq;
819
820	cfqq = cfq_get_next_queue(cfqd);
821	__cfq_set_active_queue(cfqd, cfqq);
822	return cfqq;
823}
824
825static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
826					  struct request *rq)
827{
828	if (rq->sector >= cfqd->last_position)
829		return rq->sector - cfqd->last_position;
830	else
831		return cfqd->last_position - rq->sector;
832}
833
834static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
835{
836	struct cfq_io_context *cic = cfqd->active_cic;
837
838	if (!sample_valid(cic->seek_samples))
839		return 0;
840
841	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
842}
843
844static int cfq_close_cooperator(struct cfq_data *cfq_data,
845				struct cfq_queue *cfqq)
846{
847	/*
848	 * We should notice if some of the queues are cooperating, eg
849	 * working closely on the same area of the disk. In that case,
850	 * we can group them together and not waste time idling.
851	 */
852	return 0;
853}
854
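/*
 * a process is considered seeky once its mean seek distance exceeds 8k sectors
 */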
855#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
856
857static void cfq_arm_slice_timer(struct cfq_data *cfqd)
858{
859	struct cfq_queue *cfqq = cfqd->active_queue;
860	struct cfq_io_context *cic;
861	unsigned long sl;
862
863	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
864	WARN_ON(cfq_cfqq_slice_new(cfqq));
865
866	/*
867	 * idle is disabled, either manually or by past process history
868	 */
869	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
870		return;
871
872	/*
873	 * task has exited, don't wait
874	 */
875	cic = cfqd->active_cic;
876	if (!cic || !cic->ioc->task)
877		return;
878
879	/*
880	 * See if this prio level has a good candidate
881	 */
882	if (cfq_close_cooperator(cfqd, cfqq) &&
883	    (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
884		return;
885
886	cfq_mark_cfqq_must_dispatch(cfqq);
887	cfq_mark_cfqq_wait_request(cfqq);
888
889	/*
890	 * we don't want to idle for seeks, but we do want to allow
891	 * fair distribution of slice time for a process doing back-to-back
892	 * seeks. so allow a little bit of time for it to submit a new rq
893	 */
894	sl = cfqd->cfq_slice_idle;
895	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
896		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
897
898	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
899}
900
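/*
 * move a request from the cfqq sort list to the device dispatch list
 */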
901static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
902{
903	struct cfq_queue *cfqq = RQ_CFQQ(rq);
904
905	cfq_remove_request(rq);
906	cfqq->dispatched++;
907	elv_dispatch_sort(q, rq);
908}
909
910/*
911 * return expired entry, or NULL to just start from scratch in rbtree
912 */
913static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
914{
915	struct cfq_data *cfqd = cfqq->cfqd;
916	struct request *rq;
917	int fifo;
918
919	if (cfq_cfqq_fifo_expire(cfqq))
920		return NULL;
921
922	cfq_mark_cfqq_fifo_expire(cfqq);
923
924	if (list_empty(&cfqq->fifo))
925		return NULL;
926
927	fifo = cfq_cfqq_sync(cfqq);
928	rq = rq_entry_fifo(cfqq->fifo.next);
929
930	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
931		return NULL;
932
933	return rq;
934}
935
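/*
 * maximum number of requests allowed in one slice of async dispatch,
 * scaled by io priority (higher priority queues are allowed more)
 */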
936static inline int
937cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
938{
939	const int base_rq = cfqd->cfq_slice_async_rq;
940
941	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
942
943	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
944}
945
946/*
947 * get next queue for service
948 */
949static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
950{
951	struct cfq_queue *cfqq;
952
953	cfqq = cfqd->active_queue;
954	if (!cfqq)
955		goto new_queue;
956
957	/*
958	 * The active queue has run out of time, expire it and select new.
959	 */
960	if (cfq_slice_used(cfqq))
961		goto expire;
962
963	/*
964	 * The active queue has requests and isn't expired, allow it to
965	 * dispatch.
966	 */
967	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
968		goto keep_queue;
969
970	/*
971	 * No requests pending. If the active queue still has requests in
972	 * flight or is idling for a new request, allow either of these
973	 * conditions to happen (or time out) before selecting a new queue.
974	 */
975	if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) {
976		cfqq = NULL;
977		goto keep_queue;
978	}
979
980expire:
981	cfq_slice_expired(cfqd, 0, 0);
982new_queue:
983	cfqq = cfq_set_active_queue(cfqd);
984keep_queue:
985	return cfqq;
986}
987
988static int
989__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
990			int max_dispatch)
991{
992	int dispatched = 0;
993
994	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
995
996	do {
997		struct request *rq;
998
999		/*
1000		 * follow expired path, else get first next available
1001		 */
1002		if ((rq = cfq_check_fifo(cfqq)) == NULL)
1003			rq = cfqq->next_rq;
1004
1005		/*
1006		 * finally, insert request into driver dispatch list
1007		 */
1008		cfq_dispatch_insert(cfqd->queue, rq);
1009
1010		cfqd->dispatch_slice++;
1011		dispatched++;
1012
1013		if (!cfqd->active_cic) {
1014			atomic_inc(&RQ_CIC(rq)->ioc->refcount);
1015			cfqd->active_cic = RQ_CIC(rq);
1016		}
1017
1018		if (RB_EMPTY_ROOT(&cfqq->sort_list))
1019			break;
1020
1021	} while (dispatched < max_dispatch);
1022
1023	/*
1024	 * expire an async queue immediately if it has used up its slice. idle
1025	 * queues always expire after 1 dispatch round.
1026	 */
1027	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1028	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1029	    cfq_class_idle(cfqq))) {
1030		cfqq->slice_end = jiffies + 1;
1031		cfq_slice_expired(cfqd, 0, 0);
1032	}
1033
1034	return dispatched;
1035}
1036
1037static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1038{
1039	int dispatched = 0;
1040
1041	while (cfqq->next_rq) {
1042		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1043		dispatched++;
1044	}
1045
1046	BUG_ON(!list_empty(&cfqq->fifo));
1047	return dispatched;
1048}
1049
1050static int cfq_forced_dispatch_cfqqs(struct list_head *list)
1051{
1052	struct cfq_queue *cfqq, *next;
1053	int dispatched;
1054
1055	dispatched = 0;
1056	list_for_each_entry_safe(cfqq, next, list, cfq_list)
1057		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1058
1059	return dispatched;
1060}
1061
1062static int cfq_forced_dispatch(struct cfq_data *cfqd)
1063{
1064	int dispatched = 0;
1065	struct rb_node *n;
1066
1067	while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) {
1068		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
1069
1070		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1071	}
1072
1073	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1074
1075	cfq_slice_expired(cfqd, 0, 0);
1076
1077	BUG_ON(cfqd->busy_queues);
1078
1079	return dispatched;
1080}
1081
1082static int cfq_dispatch_requests(request_queue_t *q, int force)
1083{
1084	struct cfq_data *cfqd = q->elevator->elevator_data;
1085	struct cfq_queue *cfqq;
1086	int dispatched;
1087
1088	if (!cfqd->busy_queues)
1089		return 0;
1090
1091	if (unlikely(force))
1092		return cfq_forced_dispatch(cfqd);
1093
1094	dispatched = 0;
1095	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
1096		int max_dispatch;
1097
1098		if (cfqd->busy_queues > 1) {
1099			/*
1100			 * So we have dispatched before in this round; if the
1101			 * next queue has idling enabled (must be sync), don't
1102			 * allow it service until the previous requests have completed.
1103			 */
1104			if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) &&
1105			    dispatched)
1106				break;
1107			if (cfqq->dispatched >= cfqd->cfq_quantum)
1108				break;
1109		}
1110
1111		cfq_clear_cfqq_must_dispatch(cfqq);
1112		cfq_clear_cfqq_wait_request(cfqq);
1113		del_timer(&cfqd->idle_slice_timer);
1114
1115		max_dispatch = cfqd->cfq_quantum;
1116		if (cfq_class_idle(cfqq))
1117			max_dispatch = 1;
1118
1119		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1120	}
1121
1122	return dispatched;
1123}
1124
1125/*
1126 * task holds one reference to the queue, dropped when task exits. each rq
1127 * in-flight on this queue also holds a reference, dropped when rq is freed.
1128 *
1129 * queue lock must be held here.
1130 */
1131static void cfq_put_queue(struct cfq_queue *cfqq)
1132{
1133	struct cfq_data *cfqd = cfqq->cfqd;
1134
1135	BUG_ON(atomic_read(&cfqq->ref) <= 0);
1136
1137	if (!atomic_dec_and_test(&cfqq->ref))
1138		return;
1139
1140	BUG_ON(rb_first(&cfqq->sort_list));
1141	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1142	BUG_ON(cfq_cfqq_on_rr(cfqq));
1143
1144	if (unlikely(cfqd->active_queue == cfqq)) {
1145		__cfq_slice_expired(cfqd, cfqq, 0, 0);
1146		cfq_schedule_dispatch(cfqd);
1147	}
1148
1149	/*
1150	 * it's on the empty list and still hashed
1151	 */
1152	hlist_del(&cfqq->cfq_hash);
1153	kmem_cache_free(cfq_pool, cfqq);
1154}
1155
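/*
 * look up a cfqq in the hash by process key; a prio of 0 matches any
 * io priority
 */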
1156static struct cfq_queue *
1157__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1158		    const int hashval)
1159{
1160	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1161	struct hlist_node *entry;
1162	struct cfq_queue *__cfqq;
1163
1164	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1165		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1166
1167		if (__cfqq->key == key && (__p == prio || !prio))
1168			return __cfqq;
1169	}
1170
1171	return NULL;
1172}
1173
1174static struct cfq_queue *
1175cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1176{
1177	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1178}
1179
1180static void cfq_free_io_context(struct io_context *ioc)
1181{
1182	struct cfq_io_context *__cic;
1183	struct rb_node *n;
1184	int freed = 0;
1185
1186	while ((n = rb_first(&ioc->cic_root)) != NULL) {
1187		__cic = rb_entry(n, struct cfq_io_context, rb_node);
1188		rb_erase(&__cic->rb_node, &ioc->cic_root);
1189		kmem_cache_free(cfq_ioc_pool, __cic);
1190		freed++;
1191	}
1192
1193	elv_ioc_count_mod(ioc_count, -freed);
1194
1195	if (ioc_gone && !elv_ioc_count_read(ioc_count))
1196		complete(ioc_gone);
1197}
1198
1199static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1200{
1201	if (unlikely(cfqq == cfqd->active_queue)) {
1202		__cfq_slice_expired(cfqd, cfqq, 0, 0);
1203		cfq_schedule_dispatch(cfqd);
1204	}
1205
1206	cfq_put_queue(cfqq);
1207}
1208
1209static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1210					 struct cfq_io_context *cic)
1211{
1212	list_del_init(&cic->queue_list);
1213	smp_wmb();
1214	cic->key = NULL;
1215
1216	if (cic->cfqq[ASYNC]) {
1217		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
1218		cic->cfqq[ASYNC] = NULL;
1219	}
1220
1221	if (cic->cfqq[SYNC]) {
1222		cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
1223		cic->cfqq[SYNC] = NULL;
1224	}
1225}
1226
1227
1228/*
1229 * Called with interrupts disabled
1230 */
1231static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1232{
1233	struct cfq_data *cfqd = cic->key;
1234
1235	if (cfqd) {
1236		request_queue_t *q = cfqd->queue;
1237
1238		spin_lock_irq(q->queue_lock);
1239		__cfq_exit_single_io_context(cfqd, cic);
1240		spin_unlock_irq(q->queue_lock);
1241	}
1242}
1243
1244static void cfq_exit_io_context(struct io_context *ioc)
1245{
1246	struct cfq_io_context *__cic;
1247	struct rb_node *n;
1248
1249	/*
1250	 * put the reference this task is holding to the various queues
1251	 */
1252
1253	n = rb_first(&ioc->cic_root);
1254	while (n != NULL) {
1255		__cic = rb_entry(n, struct cfq_io_context, rb_node);
1256
1257		cfq_exit_single_io_context(__cic);
1258		n = rb_next(n);
1259	}
1260}
1261
1262static struct cfq_io_context *
1263cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1264{
1265	struct cfq_io_context *cic;
1266
1267	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node);
1268	if (cic) {
1269		memset(cic, 0, sizeof(*cic));
1270		cic->last_end_request = jiffies;
1271		INIT_LIST_HEAD(&cic->queue_list);
1272		cic->dtor = cfq_free_io_context;
1273		cic->exit = cfq_exit_io_context;
1274		elv_ioc_count_inc(ioc_count);
1275	}
1276
1277	return cic;
1278}
1279
1280static void cfq_init_prio_data(struct cfq_queue *cfqq)
1281{
1282	struct task_struct *tsk = current;
1283	int ioprio_class;
1284
1285	if (!cfq_cfqq_prio_changed(cfqq))
1286		return;
1287
1288	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1289	switch (ioprio_class) {
1290		default:
1291			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1292		case IOPRIO_CLASS_NONE:
1293			/*
1294			 * no prio set, place us in the middle of the BE classes
1295			 */
1296			cfqq->ioprio = task_nice_ioprio(tsk);
1297			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1298			break;
1299		case IOPRIO_CLASS_RT:
1300			cfqq->ioprio = task_ioprio(tsk);
1301			cfqq->ioprio_class = IOPRIO_CLASS_RT;
1302			break;
1303		case IOPRIO_CLASS_BE:
1304			cfqq->ioprio = task_ioprio(tsk);
1305			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1306			break;
1307		case IOPRIO_CLASS_IDLE:
1308			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1309			cfqq->ioprio = 7;
1310			cfq_clear_cfqq_idle_window(cfqq);
1311			break;
1312	}
1313
1314	/*
1315	 * keep track of original prio settings in case we have to temporarily
1316	 * elevate the priority of this queue
1317	 */
1318	cfqq->org_ioprio = cfqq->ioprio;
1319	cfqq->org_ioprio_class = cfqq->ioprio_class;
1320	cfq_clear_cfqq_prio_changed(cfqq);
1321}
1322
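/*
 * a task changed its io priority: re-get the async queue at the new priority
 * and mark the sync queue so its priority data is re-initialized
 */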
1323static inline void changed_ioprio(struct cfq_io_context *cic)
1324{
1325	struct cfq_data *cfqd = cic->key;
1326	struct cfq_queue *cfqq;
1327	unsigned long flags;
1328
1329	if (unlikely(!cfqd))
1330		return;
1331
1332	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1333
1334	cfqq = cic->cfqq[ASYNC];
1335	if (cfqq) {
1336		struct cfq_queue *new_cfqq;
1337		new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
1338					 GFP_ATOMIC);
1339		if (new_cfqq) {
1340			cic->cfqq[ASYNC] = new_cfqq;
1341			cfq_put_queue(cfqq);
1342		}
1343	}
1344
1345	cfqq = cic->cfqq[SYNC];
1346	if (cfqq)
1347		cfq_mark_cfqq_prio_changed(cfqq);
1348
1349	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1350}
1351
1352static void cfq_ioc_set_ioprio(struct io_context *ioc)
1353{
1354	struct cfq_io_context *cic;
1355	struct rb_node *n;
1356
1357	ioc->ioprio_changed = 0;
1358
1359	n = rb_first(&ioc->cic_root);
1360	while (n != NULL) {
1361		cic = rb_entry(n, struct cfq_io_context, rb_node);
1362
1363		changed_ioprio(cic);
1364		n = rb_next(n);
1365	}
1366}
1367
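/*
 * find the cfqq for the given key and io priority, allocating and hashing
 * a new one if it does not exist yet
 */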
1368static struct cfq_queue *
1369cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1370	      gfp_t gfp_mask)
1371{
1372	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1373	struct cfq_queue *cfqq, *new_cfqq = NULL;
1374	unsigned short ioprio;
1375
1376retry:
1377	ioprio = tsk->ioprio;
1378	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1379
1380	if (!cfqq) {
1381		if (new_cfqq) {
1382			cfqq = new_cfqq;
1383			new_cfqq = NULL;
1384		} else if (gfp_mask & __GFP_WAIT) {
1385			/*
1386			 * Inform the allocator of the fact that we will
1387			 * just repeat this allocation if it fails, to allow
1388			 * the allocator to do whatever it needs to attempt to
1389			 * free memory.
1390			 */
1391			spin_unlock_irq(cfqd->queue->queue_lock);
1392			new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node);
1393			spin_lock_irq(cfqd->queue->queue_lock);
1394			goto retry;
1395		} else {
1396			cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node);
1397			if (!cfqq)
1398				goto out;
1399		}
1400
1401		memset(cfqq, 0, sizeof(*cfqq));
1402
1403		INIT_HLIST_NODE(&cfqq->cfq_hash);
1404		INIT_LIST_HEAD(&cfqq->cfq_list);
1405		RB_CLEAR_NODE(&cfqq->rb_node);
1406		INIT_LIST_HEAD(&cfqq->fifo);
1407
1408		cfqq->key = key;
1409		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1410		atomic_set(&cfqq->ref, 0);
1411		cfqq->cfqd = cfqd;
1412
1413		if (key != CFQ_KEY_ASYNC)
1414			cfq_mark_cfqq_idle_window(cfqq);
1415
1416		cfq_mark_cfqq_prio_changed(cfqq);
1417		cfq_mark_cfqq_queue_new(cfqq);
1418		cfq_init_prio_data(cfqq);
1419	}
1420
1421	if (new_cfqq)
1422		kmem_cache_free(cfq_pool, new_cfqq);
1423
1424	atomic_inc(&cfqq->ref);
1425out:
1426	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1427	return cfqq;
1428}
1429
1430static void
1431cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1432{
1433	WARN_ON(!list_empty(&cic->queue_list));
1434	rb_erase(&cic->rb_node, &ioc->cic_root);
1435	kmem_cache_free(cfq_ioc_pool, cic);
1436	elv_ioc_count_dec(ioc_count);
1437}
1438
1439static struct cfq_io_context *
1440cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1441{
1442	struct rb_node *n;
1443	struct cfq_io_context *cic;
1444	void *k, *key = cfqd;
1445
1446restart:
1447	n = ioc->cic_root.rb_node;
1448	while (n) {
1449		cic = rb_entry(n, struct cfq_io_context, rb_node);
1450		/* ->key must be copied to avoid race with cfq_exit_queue() */
1451		k = cic->key;
1452		if (unlikely(!k)) {
1453			cfq_drop_dead_cic(ioc, cic);
1454			goto restart;
1455		}
1456
1457		if (key < k)
1458			n = n->rb_left;
1459		else if (key > k)
1460			n = n->rb_right;
1461		else
1462			return cic;
1463	}
1464
1465	return NULL;
1466}
1467
1468static inline void
1469cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1470	     struct cfq_io_context *cic)
1471{
1472	struct rb_node **p;
1473	struct rb_node *parent;
1474	struct cfq_io_context *__cic;
1475	unsigned long flags;
1476	void *k;
1477
1478	cic->ioc = ioc;
1479	cic->key = cfqd;
1480
1481restart:
1482	parent = NULL;
1483	p = &ioc->cic_root.rb_node;
1484	while (*p) {
1485		parent = *p;
1486		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
1487		/* ->key must be copied to avoid race with cfq_exit_queue() */
1488		k = __cic->key;
1489		if (unlikely(!k)) {
1490			cfq_drop_dead_cic(ioc, __cic);
1491			goto restart;
1492		}
1493
1494		if (cic->key < k)
1495			p = &(*p)->rb_left;
1496		else if (cic->key > k)
1497			p = &(*p)->rb_right;
1498		else
1499			BUG();
1500	}
1501
1502	rb_link_node(&cic->rb_node, parent, p);
1503	rb_insert_color(&cic->rb_node, &ioc->cic_root);
1504
1505	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1506	list_add(&cic->queue_list, &cfqd->cic_list);
1507	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1508}
1509
1510/*
1511 * Set up general io context and cfq io context. There can be several cfq
1512 * io contexts per general io context, if this process is doing io to more
1513 * than one device managed by cfq.
1514 */
1515static struct cfq_io_context *
1516cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1517{
1518	struct io_context *ioc = NULL;
1519	struct cfq_io_context *cic;
1520
1521	might_sleep_if(gfp_mask & __GFP_WAIT);
1522
1523	ioc = get_io_context(gfp_mask, cfqd->queue->node);
1524	if (!ioc)
1525		return NULL;
1526
1527	cic = cfq_cic_rb_lookup(cfqd, ioc);
1528	if (cic)
1529		goto out;
1530
1531	cic = cfq_alloc_io_context(cfqd, gfp_mask);
1532	if (cic == NULL)
1533		goto err;
1534
1535	cfq_cic_link(cfqd, ioc, cic);
1536out:
1537	smp_read_barrier_depends();
1538	if (unlikely(ioc->ioprio_changed))
1539		cfq_ioc_set_ioprio(ioc);
1540
1541	return cic;
1542err:
1543	put_io_context(ioc);
1544	return NULL;
1545}
1546
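/*
 * update the decaying average of this process' think time, the gap between
 * a request completing and the next one being issued
 */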
1547static void
1548cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1549{
1550	unsigned long elapsed = jiffies - cic->last_end_request;
1551	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1552
1553	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1554	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1555	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1556}
1557
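/*
 * update the decaying average of the seek distance between consecutive
 * requests from this process
 */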
1558static void
1559cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1560		       struct request *rq)
1561{
1562	sector_t sdist;
1563	u64 total;
1564
1565	if (cic->last_request_pos < rq->sector)
1566		sdist = rq->sector - cic->last_request_pos;
1567	else
1568		sdist = cic->last_request_pos - rq->sector;
1569
1570	if (!cic->seek_samples) {
1571		cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1572		cfqd->new_seek_mean = cfqd->new_seek_total / 256;
1573	}
1574
1575	/*
1576	 * Don't allow the seek distance to get too large from the
1577	 * odd fragment, pagein, etc
1578	 */
1579	if (cic->seek_samples <= 60) /* second&third seek */
1580		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1581	else
1582		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
1583
1584	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1585	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1586	total = cic->seek_total + (cic->seek_samples/2);
1587	do_div(total, cic->seek_samples);
1588	cic->seek_mean = (sector_t)total;
1589}
1590
1591/*
1592 * Disable idle window if the process thinks too long or seeks so much that
1593 * it doesn't matter
1594 */
1595static void
1596cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1597		       struct cfq_io_context *cic)
1598{
1599	int enable_idle = cfq_cfqq_idle_window(cfqq);
1600
1601	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
1602	    (cfqd->hw_tag && CIC_SEEKY(cic)))
1603		enable_idle = 0;
1604	else if (sample_valid(cic->ttime_samples)) {
1605		if (cic->ttime_mean > cfqd->cfq_slice_idle)
1606			enable_idle = 0;
1607		else
1608			enable_idle = 1;
1609	}
1610
1611	if (enable_idle)
1612		cfq_mark_cfqq_idle_window(cfqq);
1613	else
1614		cfq_clear_cfqq_idle_window(cfqq);
1615}
1616
1617/*
1618 * Check if new_cfqq should preempt the currently active queue. Return 0 if
1619 * not (or if we aren't sure); returning 1 will cause a preempt.
1620 */
1621static int
1622cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1623		   struct request *rq)
1624{
1625	struct cfq_queue *cfqq;
1626
1627	cfqq = cfqd->active_queue;
1628	if (!cfqq)
1629		return 0;
1630
1631	if (cfq_slice_used(cfqq))
1632		return 1;
1633
1634	if (cfq_class_idle(new_cfqq))
1635		return 0;
1636
1637	if (cfq_class_idle(cfqq))
1638		return 1;
1639
1640	/*
1641	 * if the new request is sync, but the currently running queue is
1642	 * not, let the sync request have priority.
1643	 */
1644	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
1645		return 1;
1646
1647	/*
1648	 * So both queues are sync. Let the new request get disk time if
1649	 * it's a metadata request and the current queue is doing regular IO.
1650	 */
1651	if (rq_is_meta(rq) && !cfqq->meta_pending)
1652		return 1;
1653
1654	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
1655		return 0;
1656
1657	/*
1658	 * if this request is as good as one we would expect from the
1659	 * current cfqq, let it preempt
1660	 */
1661	if (cfq_rq_close(cfqd, rq))
1662		return 1;
1663
1664	return 0;
1665}
1666
1667/*
1668 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1669 * let it have half of its nominal slice.
1670 */
1671static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1672{
1673	cfq_slice_expired(cfqd, 1, 1);
1674
1675	/*
1676	 * Put the new queue at the front of the current list,
1677	 * so we know that it will be selected next.
1678	 */
1679	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1680	list_del_init(&cfqq->cfq_list);
1681	list_add(&cfqq->cfq_list, &cfqd->cur_rr);
1682
1683	cfqq->slice_end = 0;
1684	cfq_mark_cfqq_slice_new(cfqq);
1685}
1686
1687/*
1688 * Called when a new fs request (rq) is added (to cfqq). Check if there's
1689 * something we should do about it
1690 */
1691static void
1692cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1693		struct request *rq)
1694{
1695	struct cfq_io_context *cic = RQ_CIC(rq);
1696
1697	if (rq_is_meta(rq))
1698		cfqq->meta_pending++;
1699
1700	cfq_update_io_thinktime(cfqd, cic);
1701	cfq_update_io_seektime(cfqd, cic, rq);
1702	cfq_update_idle_window(cfqd, cfqq, cic);
1703
1704	cic->last_request_pos = rq->sector + rq->nr_sectors;
1705	cfqq->last_request_pos = cic->last_request_pos;
1706
1707	if (cfqq == cfqd->active_queue) {
1708		/*
1709		 * if we are waiting for a request for this queue, let it rip
1710		 * immediately and flag that we must not expire this queue
1711		 * just now
1712		 */
1713		if (cfq_cfqq_wait_request(cfqq)) {
1714			cfq_mark_cfqq_must_dispatch(cfqq);
1715			del_timer(&cfqd->idle_slice_timer);
1716			blk_start_queueing(cfqd->queue);
1717		}
1718	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1719		/*
1720		 * not the active queue - expire current slice if it is
1721		 * idle and has expired its mean thinktime or this new queue
1722		 * has some old slice time left and is of higher priority
1723		 */
1724		cfq_preempt_queue(cfqd, cfqq);
1725		cfq_mark_cfqq_must_dispatch(cfqq);
1726		blk_start_queueing(cfqd->queue);
1727	}
1728}
1729
1730static void cfq_insert_request(request_queue_t *q, struct request *rq)
1731{
1732	struct cfq_data *cfqd = q->elevator->elevator_data;
1733	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1734
1735	cfq_init_prio_data(cfqq);
1736
1737	cfq_add_rq_rb(rq);
1738
1739	list_add_tail(&rq->queuelist, &cfqq->fifo);
1740
1741	cfq_rq_enqueued(cfqd, cfqq, rq);
1742}
1743
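/*
 * a request from this queue completed: update stats and, if it is the active
 * queue, decide whether to expire it or idle for a new request
 */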
1744static void cfq_completed_request(request_queue_t *q, struct request *rq)
1745{
1746	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1747	struct cfq_data *cfqd = cfqq->cfqd;
1748	const int sync = rq_is_sync(rq);
1749	unsigned long now;
1750
1751	now = jiffies;
1752
1753	WARN_ON(!cfqd->rq_in_driver);
1754	WARN_ON(!cfqq->dispatched);
1755	cfqd->rq_in_driver--;
1756	cfqq->dispatched--;
1757
1758	if (!cfq_class_idle(cfqq))
1759		cfqd->last_end_request = now;
1760
1761	if (sync)
1762		RQ_CIC(rq)->last_end_request = now;
1763
1764	/*
1765	 * If this is the active queue, check if it needs to be expired,
1766	 * or if we want to idle in case it has no pending requests.
1767	 */
1768	if (cfqd->active_queue == cfqq) {
1769		if (cfq_cfqq_slice_new(cfqq)) {
1770			cfq_set_prio_slice(cfqd, cfqq);
1771			cfq_clear_cfqq_slice_new(cfqq);
1772		}
1773		if (cfq_slice_used(cfqq))
1774			cfq_slice_expired(cfqd, 0, 1);
1775		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
1776			cfq_arm_slice_timer(cfqd);
1777	}
1778
1779	if (!cfqd->rq_in_driver)
1780		cfq_schedule_dispatch(cfqd);
1781}
1782
1783/*
1784 * we temporarily boost lower priority queues if they are holding fs exclusive
1785 * resources. they are boosted to normal prio (CLASS_BE/4)
1786 */
1787static void cfq_prio_boost(struct cfq_queue *cfqq)
1788{
1789	if (has_fs_excl()) {
1790		/*
1791		 * boost idle prio on transactions that would lock out other
1792		 * users of the filesystem
1793		 */
1794		if (cfq_class_idle(cfqq))
1795			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1796		if (cfqq->ioprio > IOPRIO_NORM)
1797			cfqq->ioprio = IOPRIO_NORM;
1798	} else {
1799		/*
1800		 * check if we need to unboost the queue
1801		 */
1802		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1803			cfqq->ioprio_class = cfqq->org_ioprio_class;
1804		if (cfqq->ioprio != cfqq->org_ioprio)
1805			cfqq->ioprio = cfqq->org_ioprio;
1806	}
1807}
1808
1809static inline int __cfq_may_queue(struct cfq_queue *cfqq)
1810{
1811	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1812	    !cfq_cfqq_must_alloc_slice(cfqq)) {
1813		cfq_mark_cfqq_must_alloc_slice(cfqq);
1814		return ELV_MQUEUE_MUST;
1815	}
1816
1817	return ELV_MQUEUE_MAY;
1818}
1819
1820static int cfq_may_queue(request_queue_t *q, int rw)
1821{
1822	struct cfq_data *cfqd = q->elevator->elevator_data;
1823	struct task_struct *tsk = current;
1824	struct cfq_queue *cfqq;
1825	unsigned int key;
1826
1827	key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC);
1828
1829	/*
1830	 * don't force setup of a queue from here, as a call to may_queue
1831	 * does not necessarily imply that a request actually will be queued.
1832	 * so just look up a possibly existing queue, or return 'may queue'
1833	 * if that fails
1834	 */
1835	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
1836	if (cfqq) {
1837		cfq_init_prio_data(cfqq);
1838		cfq_prio_boost(cfqq);
1839
1840		return __cfq_may_queue(cfqq);
1841	}
1842
1843	return ELV_MQUEUE_MAY;
1844}
1845
1846/*
1847 * queue lock held here
1848 */
1849static void cfq_put_request(struct request *rq)
1850{
1851	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1852
1853	if (cfqq) {
1854		const int rw = rq_data_dir(rq);
1855
1856		BUG_ON(!cfqq->allocated[rw]);
1857		cfqq->allocated[rw]--;
1858
1859		put_io_context(RQ_CIC(rq)->ioc);
1860
1861		rq->elevator_private = NULL;
1862		rq->elevator_private2 = NULL;
1863
1864		cfq_put_queue(cfqq);
1865	}
1866}
1867
1868/*
1869 * Allocate cfq data structures associated with this request.
1870 */
1871static int
1872cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
1873{
1874	struct cfq_data *cfqd = q->elevator->elevator_data;
1875	struct task_struct *tsk = current;
1876	struct cfq_io_context *cic;
1877	const int rw = rq_data_dir(rq);
1878	const int is_sync = rq_is_sync(rq);
1879	pid_t key = cfq_queue_pid(tsk, rw, is_sync);
1880	struct cfq_queue *cfqq;
1881	unsigned long flags;
1882
1883	might_sleep_if(gfp_mask & __GFP_WAIT);
1884
1885	cic = cfq_get_io_context(cfqd, gfp_mask);
1886
1887	spin_lock_irqsave(q->queue_lock, flags);
1888
1889	if (!cic)
1890		goto queue_fail;
1891
1892	if (!cic->cfqq[is_sync]) {
1893		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
1894		if (!cfqq)
1895			goto queue_fail;
1896
1897		cic->cfqq[is_sync] = cfqq;
1898	} else
1899		cfqq = cic->cfqq[is_sync];
1900
1901	cfqq->allocated[rw]++;
1902	cfq_clear_cfqq_must_alloc(cfqq);
1903	atomic_inc(&cfqq->ref);
1904
1905	spin_unlock_irqrestore(q->queue_lock, flags);
1906
1907	rq->elevator_private = cic;
1908	rq->elevator_private2 = cfqq;
1909	return 0;
1910
1911queue_fail:
1912	if (cic)
1913		put_io_context(cic->ioc);
1914
1915	cfq_schedule_dispatch(cfqd);
1916	spin_unlock_irqrestore(q->queue_lock, flags);
1917	return 1;
1918}
1919
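/*
 * work handler scheduled by cfq_schedule_dispatch() to restart the request
 * queue from process context
 */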
1920static void cfq_kick_queue(struct work_struct *work)
1921{
1922	struct cfq_data *cfqd =
1923		container_of(work, struct cfq_data, unplug_work);
1924	request_queue_t *q = cfqd->queue;
1925	unsigned long flags;
1926
1927	spin_lock_irqsave(q->queue_lock, flags);
1928	blk_start_queueing(q);
1929	spin_unlock_irqrestore(q->queue_lock, flags);
1930}
1931
1932/*
1933 * Timer running if the active_queue is currently idling inside its time slice
1934 */
1935static void cfq_idle_slice_timer(unsigned long data)
1936{
1937	struct cfq_data *cfqd = (struct cfq_data *) data;
1938	struct cfq_queue *cfqq;
1939	unsigned long flags;
1940	int timed_out = 1;
1941
1942	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1943
1944	if ((cfqq = cfqd->active_queue) != NULL) {
1945		timed_out = 0;
1946
1947		/*
1948		 * expired
1949		 */
1950		 * only expire and reinvoke the request handler if there are
1951			goto expire;
1952
1953		/*
1954		 * only expire and reinvoke request handler, if there are
1955		 * other queues with pending requests
1956		 */
1957		if (!cfqd->busy_queues)
1958			goto out_cont;
1959
1960		/*
1961		 * not expired and it has a request pending, let it dispatch
1962		 */
1963		if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
1964			cfq_mark_cfqq_must_dispatch(cfqq);
1965			goto out_kick;
1966		}
1967	}
1968expire:
1969	cfq_slice_expired(cfqd, 0, timed_out);
1970out_kick:
1971	cfq_schedule_dispatch(cfqd);
1972out_cont:
1973	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1974}
1975
1976/*
1977 * Timer running if an idle class queue is waiting for service
1978 */
1979static void cfq_idle_class_timer(unsigned long data)
1980{
1981	struct cfq_data *cfqd = (struct cfq_data *) data;
1982	unsigned long flags, end;
1983
1984	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1985
1986	/*
1987	 * race with a non-idle queue, reset timer
1988	 */
1989	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
1990	if (!time_after_eq(jiffies, end))
1991		mod_timer(&cfqd->idle_class_timer, end);
1992	else
1993		cfq_schedule_dispatch(cfqd);
1994
1995	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1996}
1997
1998static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
1999{
2000	del_timer_sync(&cfqd->idle_slice_timer);
2001	del_timer_sync(&cfqd->idle_class_timer);
2002	blk_sync_queue(cfqd->queue);
2003}
2004
2005static void cfq_exit_queue(elevator_t *e)
2006{
2007	struct cfq_data *cfqd = e->elevator_data;
2008	request_queue_t *q = cfqd->queue;
2009
2010	cfq_shutdown_timer_wq(cfqd);
2011
2012	spin_lock_irq(q->queue_lock);
2013
2014	if (cfqd->active_queue)
2015		__cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0);
2016
2017	while (!list_empty(&cfqd->cic_list)) {
2018		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2019							struct cfq_io_context,
2020							queue_list);
2021
2022		__cfq_exit_single_io_context(cfqd, cic);
2023	}
2024
2025	spin_unlock_irq(q->queue_lock);
2026
2027	cfq_shutdown_timer_wq(cfqd);
2028
2029	kfree(cfqd->cfq_hash);
2030	kfree(cfqd);
2031}
2032
2033static void *cfq_init_queue(request_queue_t *q)
2034{
2035	struct cfq_data *cfqd;
2036	int i;
2037
2038	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
2039	if (!cfqd)
2040		return NULL;
2041
2042	memset(cfqd, 0, sizeof(*cfqd));
2043
2044	cfqd->service_tree = CFQ_RB_ROOT;
2045	INIT_LIST_HEAD(&cfqd->cur_rr);
2046	INIT_LIST_HEAD(&cfqd->cic_list);
2047
2048	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
2049	if (!cfqd->cfq_hash)
2050		goto out_free;
2051
2052	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2053		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2054
2055	cfqd->queue = q;
2056
2057	init_timer(&cfqd->idle_slice_timer);
2058	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2059	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2060
2061	init_timer(&cfqd->idle_class_timer);
2062	cfqd->idle_class_timer.function = cfq_idle_class_timer;
2063	cfqd->idle_class_timer.data = (unsigned long) cfqd;
2064
2065	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
2066
2067	cfqd->cfq_quantum = cfq_quantum;
2068	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2069	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2070	cfqd->cfq_back_max = cfq_back_max;
2071	cfqd->cfq_back_penalty = cfq_back_penalty;
2072	cfqd->cfq_slice[0] = cfq_slice_async;
2073	cfqd->cfq_slice[1] = cfq_slice_sync;
2074	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2075	cfqd->cfq_slice_idle = cfq_slice_idle;
2076
2077	return cfqd;
2078out_free:
2079	kfree(cfqd);
2080	return NULL;
2081}
2082
2083static void cfq_slab_kill(void)
2084{
2085	if (cfq_pool)
2086		kmem_cache_destroy(cfq_pool);
2087	if (cfq_ioc_pool)
2088		kmem_cache_destroy(cfq_ioc_pool);
2089}
2090
2091static int __init cfq_slab_setup(void)
2092{
2093	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2094					NULL, NULL);
2095	if (!cfq_pool)
2096		goto fail;
2097
2098	cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2099			sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2100	if (!cfq_ioc_pool)
2101		goto fail;
2102
2103	return 0;
2104fail:
2105	cfq_slab_kill();
2106	return -ENOMEM;
2107}
2108
2109/*
2110 * sysfs parts below -->
2111 */
2112static ssize_t
2113cfq_var_show(unsigned int var, char *page)
2114{
2115	return sprintf(page, "%d\n", var);
2116}
2117
2118static ssize_t
2119cfq_var_store(unsigned int *var, const char *page, size_t count)
2120{
2121	char *p = (char *) page;
2122
2123	*var = simple_strtoul(p, &p, 10);
2124	return count;
2125}
2126
2127#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
2128static ssize_t __FUNC(elevator_t *e, char *page)			\
2129{									\
2130	struct cfq_data *cfqd = e->elevator_data;			\
2131	unsigned int __data = __VAR;					\
2132	if (__CONV)							\
2133		__data = jiffies_to_msecs(__data);			\
2134	return cfq_var_show(__data, (page));				\
2135}
2136SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2137SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2138SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2139SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2140SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2141SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2142SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2143SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2144SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2145#undef SHOW_FUNCTION
2146
2147#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
2148static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
2149{									\
2150	struct cfq_data *cfqd = e->elevator_data;			\
2151	unsigned int __data;						\
2152	int ret = cfq_var_store(&__data, (page), count);		\
2153	if (__data < (MIN))						\
2154		__data = (MIN);						\
2155	else if (__data > (MAX))					\
2156		__data = (MAX);						\
2157	if (__CONV)							\
2158		*(__PTR) = msecs_to_jiffies(__data);			\
2159	else								\
2160		*(__PTR) = __data;					\
2161	return ret;							\
2162}
2163STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2164STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2165STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2166STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2167STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2168STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2169STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2170STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2171STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2172#undef STORE_FUNCTION
2173
2174#define CFQ_ATTR(name) \
2175	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2176
2177static struct elv_fs_entry cfq_attrs[] = {
2178	CFQ_ATTR(quantum),
2179	CFQ_ATTR(fifo_expire_sync),
2180	CFQ_ATTR(fifo_expire_async),
2181	CFQ_ATTR(back_seek_max),
2182	CFQ_ATTR(back_seek_penalty),
2183	CFQ_ATTR(slice_sync),
2184	CFQ_ATTR(slice_async),
2185	CFQ_ATTR(slice_async_rq),
2186	CFQ_ATTR(slice_idle),
2187	__ATTR_NULL
2188};
2189
2190static struct elevator_type iosched_cfq = {
2191	.ops = {
2192		.elevator_merge_fn = 		cfq_merge,
2193		.elevator_merged_fn =		cfq_merged_request,
2194		.elevator_merge_req_fn =	cfq_merged_requests,
2195		.elevator_allow_merge_fn =	cfq_allow_merge,
2196		.elevator_dispatch_fn =		cfq_dispatch_requests,
2197		.elevator_add_req_fn =		cfq_insert_request,
2198		.elevator_activate_req_fn =	cfq_activate_request,
2199		.elevator_deactivate_req_fn =	cfq_deactivate_request,
2200		.elevator_queue_empty_fn =	cfq_queue_empty,
2201		.elevator_completed_req_fn =	cfq_completed_request,
2202		.elevator_former_req_fn =	elv_rb_former_request,
2203		.elevator_latter_req_fn =	elv_rb_latter_request,
2204		.elevator_set_req_fn =		cfq_set_request,
2205		.elevator_put_req_fn =		cfq_put_request,
2206		.elevator_may_queue_fn =	cfq_may_queue,
2207		.elevator_init_fn =		cfq_init_queue,
2208		.elevator_exit_fn =		cfq_exit_queue,
2209		.trim =				cfq_free_io_context,
2210	},
2211	.elevator_attrs =	cfq_attrs,
2212	.elevator_name =	"cfq",
2213	.elevator_owner =	THIS_MODULE,
2214};
2215
2216static int __init cfq_init(void)
2217{
2218	int ret;
2219
2220	/*
2221	 * could be 0 on HZ < 1000 setups
2222	 */
2223	if (!cfq_slice_async)
2224		cfq_slice_async = 1;
2225	if (!cfq_slice_idle)
2226		cfq_slice_idle = 1;
2227
2228	if (cfq_slab_setup())
2229		return -ENOMEM;
2230
2231	ret = elv_register(&iosched_cfq);
2232	if (ret)
2233		cfq_slab_kill();
2234
2235	return ret;
2236}
2237
2238static void __exit cfq_exit(void)
2239{
2240	DECLARE_COMPLETION_ONSTACK(all_gone);
2241	elv_unregister(&iosched_cfq);
2242	ioc_gone = &all_gone;
2243	/* ioc_gone's update must be visible before reading ioc_count */
2244	smp_wmb();
2245	if (elv_ioc_count_read(ioc_count))
2246		wait_for_completion(ioc_gone);
2247	synchronize_rcu();
2248	cfq_slab_kill();
2249}
2250
2251module_init(cfq_init);
2252module_exit(cfq_exit);
2253
2254MODULE_AUTHOR("Jens Axboe");
2255MODULE_LICENSE("GPL");
2256MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
2257