cfq-iosched.c revision 6d048f5310aa2dda2b5acd947eab3598c25e269f
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/hash.h>
13#include <linux/rbtree.h>
14#include <linux/ioprio.h>
15
16/*
17 * tunables
18 */
19static const int cfq_quantum = 4;		/* max queue in one round of service */
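/* fifo request timeouts: index [0] is async (HZ/4 = 250ms), [1] is sync (HZ/8 = 125ms) */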
20static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
21static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
22static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
23
24static const int cfq_slice_sync = HZ / 10;
25static int cfq_slice_async = HZ / 25;
26static const int cfq_slice_async_rq = 2;
27static int cfq_slice_idle = HZ / 125;
28
29#define CFQ_IDLE_GRACE		(HZ / 10)
30#define CFQ_SLICE_SCALE		(5)
31
32#define CFQ_KEY_ASYNC		(0)
33
34/*
35 * for the hash of cfqq inside the cfqd
36 */
37#define CFQ_QHASH_SHIFT		6
38#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
39#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
40
41#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
42
43#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
44#define RQ_CFQQ(rq)		((rq)->elevator_private2)
45
46static struct kmem_cache *cfq_pool;
47static struct kmem_cache *cfq_ioc_pool;
48
49static DEFINE_PER_CPU(unsigned long, ioc_count);
50static struct completion *ioc_gone;
51
52#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
53#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
54#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
55
56#define ASYNC			(0)
57#define SYNC			(1)
58
59#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
60
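/*
 * the thinktime/seek sample counts decay toward a maximum of 256; only
 * trust the computed means once enough samples have accumulated
 */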
61#define sample_valid(samples)	((samples) > 80)
62
63/*
64 * Per block device queue structure
65 */
66struct cfq_data {
67	request_queue_t *queue;
68
69	/*
70	 * rr list of queues with requests and the count of them
71	 */
72	struct list_head rr_list[CFQ_PRIO_LISTS];
73	struct list_head busy_rr;
74	struct list_head cur_rr;
75	struct list_head idle_rr;
76	unsigned long cur_rr_tick;
77	unsigned int busy_queues;
78
79	/*
80	 * cfqq lookup hash
81	 */
82	struct hlist_head *cfq_hash;
83
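	/*
	 * number of requests currently in flight in the driver, and whether
	 * the device appears to do its own command queueing (see
	 * cfq_activate_request())
	 */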
84	int rq_in_driver;
85	int hw_tag;
86
87	/*
88	 * idle window management
89	 */
90	struct timer_list idle_slice_timer;
91	struct work_struct unplug_work;
92
93	struct cfq_queue *active_queue;
94	struct cfq_io_context *active_cic;
95	int cur_prio, cur_end_prio;
96	unsigned long prio_time;
97	unsigned int dispatch_slice;
98
99	struct timer_list idle_class_timer;
100
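	/*
	 * disk position right after the last dispatched request, and the
	 * completion time of the last non-idle request
	 */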
101	sector_t last_position;
102	unsigned long last_end_request;
103
104	/*
105	 * tunables, see top of file
106	 */
107	unsigned int cfq_quantum;
108	unsigned int cfq_fifo_expire[2];
109	unsigned int cfq_back_penalty;
110	unsigned int cfq_back_max;
111	unsigned int cfq_slice[2];
112	unsigned int cfq_slice_async_rq;
113	unsigned int cfq_slice_idle;
114
115	struct list_head cic_list;
116
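	/*
	 * seek distance estimate gathered from contexts that don't yet have
	 * seek samples of their own (see cfq_update_io_seektime())
	 */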
117	sector_t new_seek_mean;
118	u64 new_seek_total;
119};
120
121/*
122 * Per process-grouping structure
123 */
124struct cfq_queue {
125	/* reference count */
126	atomic_t ref;
127	/* parent cfq_data */
128	struct cfq_data *cfqd;
129	/* cfqq lookup hash */
130	struct hlist_node cfq_hash;
131	/* hash key */
132	unsigned int key;
133	/* member of the rr/busy/cur/idle cfqd list */
134	struct list_head cfq_list;
135	/* in what tick we were last serviced */
136	unsigned long rr_tick;
137	/* sorted list of pending requests */
138	struct rb_root sort_list;
139	/* if fifo isn't expired, next request to serve */
140	struct request *next_rq;
141	/* requests queued in sort_list */
142	int queued[2];
143	/* currently allocated requests */
144	int allocated[2];
145	/* pending metadata requests */
146	int meta_pending;
147	/* fifo list of requests in sort_list */
148	struct list_head fifo;
149
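	/*
	 * time slice bookkeeping: end and start of the current slice, time of
	 * last service, and residual slice time carried over for fairness
	 */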
150	unsigned long slice_end;
151	unsigned long service_last;
152	unsigned long slice_start;
153	long slice_resid;
154
155	/* number of requests that are on the dispatch list or inside driver */
156	int dispatched;
157
158	/* io prio of this group */
159	unsigned short ioprio, org_ioprio;
160	unsigned short ioprio_class, org_ioprio_class;
161
162	/* various state flags, see below */
163	unsigned int flags;
164
165	sector_t last_request_pos;
166};
167
168enum cfqq_state_flags {
169	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
170	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
171	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
172	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
173	CFQ_CFQQ_FLAG_must_dispatch,	/* must dispatch, even if expired */
174	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
175	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
176	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
177	CFQ_CFQQ_FLAG_queue_new,	/* queue never been serviced */
178	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
179};
180
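/*
 * generate mark/clear/test helpers for each cfqq state flag above
 */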
181#define CFQ_CFQQ_FNS(name)						\
182static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
183{									\
184	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
185}									\
186static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
187{									\
188	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
189}									\
190static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
191{									\
192	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
193}
194
195CFQ_CFQQ_FNS(on_rr);
196CFQ_CFQQ_FNS(wait_request);
197CFQ_CFQQ_FNS(must_alloc);
198CFQ_CFQQ_FNS(must_alloc_slice);
199CFQ_CFQQ_FNS(must_dispatch);
200CFQ_CFQQ_FNS(fifo_expire);
201CFQ_CFQQ_FNS(idle_window);
202CFQ_CFQQ_FNS(prio_changed);
203CFQ_CFQQ_FNS(queue_new);
204CFQ_CFQQ_FNS(slice_new);
205#undef CFQ_CFQQ_FNS
206
207static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
208static void cfq_dispatch_insert(request_queue_t *, struct request *);
209static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
210
211/*
212 * scheduler run of queue, if there are requests pending and no one in the
213 * driver that will restart queueing
214 */
215static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
216{
217	if (cfqd->busy_queues)
218		kblockd_schedule_work(&cfqd->unplug_work);
219}
220
221static int cfq_queue_empty(request_queue_t *q)
222{
223	struct cfq_data *cfqd = q->elevator->elevator_data;
224
225	return !cfqd->busy_queues;
226}
227
228static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
229{
230	/*
231	 * Use the per-process queue for read requests and synchronous writes
232	 */
233	if (!(rw & REQ_RW) || is_sync)
234		return task->pid;
235
236	return CFQ_KEY_ASYNC;
237}
238
239/*
240 * Scale schedule slice based on io priority. Use the sync time slice only
241 * if a queue is marked sync and has sync io queued. A sync queue with async
242 * io only, should not get full sync slice length.
243 */
244static inline int
245cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
246{
247	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
248
249	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
250
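	/*
	 * ioprio 4 gets exactly base_slice; every step away from 4 adds or
	 * removes base_slice/CFQ_SLICE_SCALE (20%). With the default sync
	 * slice of HZ/10 (100ms) that is 180ms for ioprio 0 down to 40ms
	 * for ioprio 7.
	 */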
251	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
252}
253
254static inline void
255cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
256{
257	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
258	cfqq->slice_end += cfqq->slice_resid;
259
260	/*
261	 * Don't carry over residual for more than one slice, we only want
262	 * to slightly correct the fairness. Carrying over forever would
263	 * easily introduce oscillations.
264	 */
265	cfqq->slice_resid = 0;
266
267	cfqq->slice_start = jiffies;
268}
269
270/*
271 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
272 * isn't valid until the first request from the dispatch is activated
273 * and the slice time set.
274 */
275static inline int cfq_slice_used(struct cfq_queue *cfqq)
276{
277	if (cfq_cfqq_slice_new(cfqq))
278		return 0;
279	if (time_before(jiffies, cfqq->slice_end))
280		return 0;
281
282	return 1;
283}
284
285/*
286 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
287 * We choose the request that is closest to the head right now. Distance
288 * behind the head is penalized and only allowed to a certain extent.
289 */
290static struct request *
291cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
292{
293	sector_t last, s1, s2, d1 = 0, d2 = 0;
294	unsigned long back_max;
295#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
296#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
297	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
298
299	if (rq1 == NULL || rq1 == rq2)
300		return rq2;
301	if (rq2 == NULL)
302		return rq1;
303
304	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
305		return rq1;
306	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
307		return rq2;
308	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
309		return rq1;
310	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
311		return rq2;
312
313	s1 = rq1->sector;
314	s2 = rq2->sector;
315
316	last = cfqd->last_position;
317
318	/*
319	 * by definition, 1KiB is 2 sectors
320	 */
321	back_max = cfqd->cfq_back_max * 2;
322
323	/*
324	 * Strict one way elevator _except_ in the case where we allow
325	 * short backward seeks which are biased as twice the cost of a
326	 * similar forward seek.
327	 */
328	if (s1 >= last)
329		d1 = s1 - last;
330	else if (s1 + back_max >= last)
331		d1 = (last - s1) * cfqd->cfq_back_penalty;
332	else
333		wrap |= CFQ_RQ1_WRAP;
334
335	if (s2 >= last)
336		d2 = s2 - last;
337	else if (s2 + back_max >= last)
338		d2 = (last - s2) * cfqd->cfq_back_penalty;
339	else
340		wrap |= CFQ_RQ2_WRAP;
341
342	/* Found required data */
343
344	/*
345	 * By doing switch() on the bit mask "wrap" we avoid having to
346	 * check two variables for all permutations: --> faster!
347	 */
348	switch (wrap) {
349	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
350		if (d1 < d2)
351			return rq1;
352		else if (d2 < d1)
353			return rq2;
354		else {
355			if (s1 >= s2)
356				return rq1;
357			else
358				return rq2;
359		}
360
361	case CFQ_RQ2_WRAP:
362		return rq1;
363	case CFQ_RQ1_WRAP:
364		return rq2;
365	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
366	default:
367		/*
368		 * Since both rqs are wrapped,
369		 * start with the one that's further behind head
370		 * (--> only *one* back seek required),
371		 * since back seek takes more time than forward.
372		 */
373		if (s1 <= s2)
374			return rq1;
375		else
376			return rq2;
377	}
378}
379
380/*
381 * would be nice to take fifo expire time into account as well
382 */
383static struct request *
384cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
385		  struct request *last)
386{
387	struct rb_node *rbnext = rb_next(&last->rb_node);
388	struct rb_node *rbprev = rb_prev(&last->rb_node);
389	struct request *next = NULL, *prev = NULL;
390
391	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
392
393	if (rbprev)
394		prev = rb_entry_rq(rbprev);
395
396	if (rbnext)
397		next = rb_entry_rq(rbnext);
398	else {
399		rbnext = rb_first(&cfqq->sort_list);
400		if (rbnext && rbnext != &last->rb_node)
401			next = rb_entry_rq(rbnext);
402	}
403
404	return cfq_choose_req(cfqd, next, prev);
405}
406
407/*
408 * This function finds out where to insert a BE queue in the service hierarchy
409 */
410static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
411				int preempted)
412{
413	struct list_head *list, *n;
414	struct cfq_queue *__cfqq;
415	int add_tail = 0;
416
417	/*
418	 * if cfqq has requests in flight, don't allow it to be
419	 * found in cfq_set_active_queue before it has finished them.
420	 * this is done to increase fairness between a process that
421	 * has lots of io pending vs one that only generates one
422	 * sporadically or synchronously
423	 */
424	if (cfqq->dispatched)
425		list = &cfqd->busy_rr;
426	else if (cfqq->ioprio == (cfqd->cur_prio + 1) &&
427		 cfq_cfqq_sync(cfqq) &&
428		 (time_before(cfqd->prio_time, cfqq->service_last) ||
429		  cfq_cfqq_queue_new(cfqq) || preempted)) {
430		list = &cfqd->cur_rr;
431		add_tail = 1;
432	} else
433		list = &cfqd->rr_list[cfqq->ioprio];
434
435	if (!cfq_cfqq_sync(cfqq) || add_tail) {
436		/*
437		 * async queue always goes to the end. this won't be overly
438		 * unfair to writes, as the sorted sync queues are not
439		 * allowed to pass the async queue again.
440		 */
441		list_add_tail(&cfqq->cfq_list, list);
442	} else if (preempted || cfq_cfqq_queue_new(cfqq)) {
443		/*
444		 * If this queue was preempted or is new (never been serviced),
445		 * let it be added first for fairness but behind other new
446		 * queues.
447		 */
448		n = list;
449		while (n->next != list) {
450			__cfqq = list_entry_cfqq(n->next);
451			if (!cfq_cfqq_queue_new(__cfqq))
452				break;
453
454			n = n->next;
455		}
456		list_add(&cfqq->cfq_list, n);
457	} else {
458		/*
459		 * sort by last service, but don't cross a new or async
460		 * queue. we don't cross a new queue because it hasn't been
461		 * serviced before, and we don't cross an async queue because
462		 * it gets added to the end on expire.
463		 */
464		n = list;
465		while ((n = n->prev) != list) {
466			struct cfq_queue *__c = list_entry_cfqq(n);
467
468			if (!cfq_cfqq_sync(__c) || !__c->service_last)
469				break;
470			if (time_before(__c->service_last, cfqq->service_last))
471				break;
472		}
473		list_add(&cfqq->cfq_list, n);
474	}
475}
476
477static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
478{
479	struct cfq_data *cfqd = cfqq->cfqd;
480	struct list_head *n;
481
482	/*
483	 * Resorting requires the cfqq to be on the RR list already.
484	 */
485	if (!cfq_cfqq_on_rr(cfqq))
486		return;
487
488	list_del(&cfqq->cfq_list);
489
490	if (cfq_class_rt(cfqq)) {
491		/*
492		 * Add to the front of the current list, but behind other
493		 * RT queues.
494		 */
495		n = &cfqd->cur_rr;
496		while (n->next != &cfqd->cur_rr &&
497		       cfq_class_rt(list_entry_cfqq(n->next)))
498			n = n->next;
499
500		list_add(&cfqq->cfq_list, n);
501	} else if (cfq_class_idle(cfqq)) {
502		/*
503		 * IDLE goes to the tail of the idle list
504		 */
505		list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr);
506	} else {
507		/*
508		 * Otherwise the queue is a regular best-effort queue
509		 */
510		cfq_resort_be_queue(cfqd, cfqq, preempted);
511	}
512}
513
514/*
515 * add to busy list of queues for service, trying to be fair in ordering
516 * the pending list according to last request service
517 */
518static inline void
519cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
520{
521	BUG_ON(cfq_cfqq_on_rr(cfqq));
522	cfq_mark_cfqq_on_rr(cfqq);
523	cfqd->busy_queues++;
524
525	cfq_resort_rr_list(cfqq, 0);
526}
527
528static inline void
529cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
530{
531	BUG_ON(!cfq_cfqq_on_rr(cfqq));
532	cfq_clear_cfqq_on_rr(cfqq);
533	list_del_init(&cfqq->cfq_list);
534
535	BUG_ON(!cfqd->busy_queues);
536	cfqd->busy_queues--;
537}
538
539/*
540 * rb tree support functions
541 */
542static inline void cfq_del_rq_rb(struct request *rq)
543{
544	struct cfq_queue *cfqq = RQ_CFQQ(rq);
545	struct cfq_data *cfqd = cfqq->cfqd;
546	const int sync = rq_is_sync(rq);
547
548	BUG_ON(!cfqq->queued[sync]);
549	cfqq->queued[sync]--;
550
551	elv_rb_del(&cfqq->sort_list, rq);
552
553	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
554		cfq_del_cfqq_rr(cfqd, cfqq);
555}
556
557static void cfq_add_rq_rb(struct request *rq)
558{
559	struct cfq_queue *cfqq = RQ_CFQQ(rq);
560	struct cfq_data *cfqd = cfqq->cfqd;
561	struct request *__alias;
562
563	cfqq->queued[rq_is_sync(rq)]++;
564
565	/*
566	 * looks a little odd, but the first insert might return an alias.
567	 * if that happens, put the alias on the dispatch list
568	 */
569	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
570		cfq_dispatch_insert(cfqd->queue, __alias);
571
572	if (!cfq_cfqq_on_rr(cfqq))
573		cfq_add_cfqq_rr(cfqd, cfqq);
574
575	/*
576	 * check if this request is a better next-serve candidate
577	 */
578	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
579	BUG_ON(!cfqq->next_rq);
580}
581
582static inline void
583cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
584{
585	elv_rb_del(&cfqq->sort_list, rq);
586	cfqq->queued[rq_is_sync(rq)]--;
587	cfq_add_rq_rb(rq);
588}
589
590static struct request *
591cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
592{
593	struct task_struct *tsk = current;
594	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio));
595	struct cfq_queue *cfqq;
596
597	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
598	if (cfqq) {
599		sector_t sector = bio->bi_sector + bio_sectors(bio);
600
601		return elv_rb_find(&cfqq->sort_list, sector);
602	}
603
604	return NULL;
605}
606
607static void cfq_activate_request(request_queue_t *q, struct request *rq)
608{
609	struct cfq_data *cfqd = q->elevator->elevator_data;
610
611	cfqd->rq_in_driver++;
612
613	/*
614	 * If the depth is larger than 1, it really could be queueing. But let's
615	 * make the mark a little higher - idling could still be good for
616	 * low queueing, and a low queueing number could also just indicate
617	 * a SCSI mid layer-like behaviour where limit+1 is often seen.
618	 */
619	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
620		cfqd->hw_tag = 1;
621
622	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
623}
624
625static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
626{
627	struct cfq_data *cfqd = q->elevator->elevator_data;
628
629	WARN_ON(!cfqd->rq_in_driver);
630	cfqd->rq_in_driver--;
631}
632
633static void cfq_remove_request(struct request *rq)
634{
635	struct cfq_queue *cfqq = RQ_CFQQ(rq);
636
637	if (cfqq->next_rq == rq)
638		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
639
640	list_del_init(&rq->queuelist);
641	cfq_del_rq_rb(rq);
642
643	if (rq_is_meta(rq)) {
644		WARN_ON(!cfqq->meta_pending);
645		cfqq->meta_pending--;
646	}
647}
648
649static int
650cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
651{
652	struct cfq_data *cfqd = q->elevator->elevator_data;
653	struct request *__rq;
654
655	__rq = cfq_find_rq_fmerge(cfqd, bio);
656	if (__rq && elv_rq_merge_ok(__rq, bio)) {
657		*req = __rq;
658		return ELEVATOR_FRONT_MERGE;
659	}
660
661	return ELEVATOR_NO_MERGE;
662}
663
664static void cfq_merged_request(request_queue_t *q, struct request *req,
665			       int type)
666{
667	if (type == ELEVATOR_FRONT_MERGE) {
668		struct cfq_queue *cfqq = RQ_CFQQ(req);
669
670		cfq_reposition_rq_rb(cfqq, req);
671	}
672}
673
674static void
675cfq_merged_requests(request_queue_t *q, struct request *rq,
676		    struct request *next)
677{
678	/*
679	 * reposition in fifo if next is older than rq
680	 */
681	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
682	    time_before(next->start_time, rq->start_time))
683		list_move(&rq->queuelist, &next->queuelist);
684
685	cfq_remove_request(next);
686}
687
688static int cfq_allow_merge(request_queue_t *q, struct request *rq,
689			   struct bio *bio)
690{
691	struct cfq_data *cfqd = q->elevator->elevator_data;
692	const int rw = bio_data_dir(bio);
693	struct cfq_queue *cfqq;
694	pid_t key;
695
696	/*
697	 * Disallow merge of a sync bio into an async request.
698	 */
699	if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq))
700		return 0;
701
702	/*
703	 * Lookup the cfqq that this bio will be queued with. Allow
704	 * merge only if rq is queued there.
705	 */
706	key = cfq_queue_pid(current, rw, bio_sync(bio));
707	cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio);
708
709	if (cfqq == RQ_CFQQ(rq))
710		return 1;
711
712	return 0;
713}
714
715static inline void
716__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
717{
718	if (cfqq) {
719		/*
720		 * stop potential idle class queues waiting service
721		 */
722		del_timer(&cfqd->idle_class_timer);
723
724		cfqq->slice_end = 0;
725		cfq_clear_cfqq_must_alloc_slice(cfqq);
726		cfq_clear_cfqq_fifo_expire(cfqq);
727		cfq_mark_cfqq_slice_new(cfqq);
728		cfqq->rr_tick = cfqd->cur_rr_tick;
729	}
730
731	cfqd->active_queue = cfqq;
732}
733
734/*
735 * current cfqq expired its slice (or was too idle), select new one
736 */
737static void
738__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
739		    int preempted, int timed_out)
740{
741	if (cfq_cfqq_wait_request(cfqq))
742		del_timer(&cfqd->idle_slice_timer);
743
744	cfq_clear_cfqq_must_dispatch(cfqq);
745	cfq_clear_cfqq_wait_request(cfqq);
746	cfq_clear_cfqq_queue_new(cfqq);
747
748	/*
749	 * store what was left of this slice, if the queue idled out
750	 * or was preempted
751	 */
752	if (timed_out && !cfq_cfqq_slice_new(cfqq))
753		cfqq->slice_resid = cfqq->slice_end - jiffies;
754
755	cfq_resort_rr_list(cfqq, preempted);
756
757	if (cfqq == cfqd->active_queue)
758		cfqd->active_queue = NULL;
759
760	if (cfqd->active_cic) {
761		put_io_context(cfqd->active_cic->ioc);
762		cfqd->active_cic = NULL;
763	}
764
765	cfqd->dispatch_slice = 0;
766}
767
768static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
769				     int timed_out)
770{
771	struct cfq_queue *cfqq = cfqd->active_queue;
772
773	if (cfqq)
774		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
775}
776
777/*
778 * Pick the next set of BE priority levels to splice onto cur_rr. The
779 * window of levels widens on each round, so lower numbered (higher
780 * priority) levels get serviced more often:
781 *
782 *	0
783 *	0,1
784 *	0,1,2 ...
785 *	0,1,2,3,4,5,6,7
786 */
787static int cfq_get_next_prio_level(struct cfq_data *cfqd)
788{
789	int prio, wrap;
790
791	prio = -1;
792	wrap = 0;
793	do {
794		int p;
795
796		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
797			if (!list_empty(&cfqd->rr_list[p])) {
798				prio = p;
799				break;
800			}
801		}
802
803		if (prio != -1)
804			break;
805		cfqd->cur_prio = 0;
806		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
807			cfqd->cur_end_prio = 0;
808			if (wrap)
809				break;
810			wrap = 1;
811		}
812	} while (1);
813
814	if (unlikely(prio == -1))
815		return -1;
816
817	BUG_ON(prio >= CFQ_PRIO_LISTS);
818
819	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
820
821	cfqd->cur_prio = prio + 1;
822	if (cfqd->cur_prio > cfqd->cur_end_prio) {
823		cfqd->cur_end_prio = cfqd->cur_prio;
824		cfqd->cur_prio = 0;
825	}
826	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
827		cfqd->cur_prio = 0;
828		cfqd->cur_end_prio = 0;
829	}
830
831	cfqd->cur_rr_tick++;
832	cfqd->prio_time = jiffies;
833	return prio;
834}
835
836static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
837					  struct request *rq)
838{
839	if (rq->sector >= cfqd->last_position)
840		return rq->sector - cfqd->last_position;
841	else
842		return cfqd->last_position - rq->sector;
843}
844
845static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd)
846{
847	struct cfq_queue *cfqq = NULL, *__cfqq;
848	sector_t best = -1, dist;
849
850	list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) {
851		if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq))
852			continue;
853
854		dist = cfq_dist_from_last(cfqd, __cfqq->next_rq);
855		if (dist < best) {
856			best = dist;
857			cfqq = __cfqq;
858		}
859	}
860
861	/*
862	 * Only async queue(s) available, grab first entry
863	 */
864	if (!cfqq)
865		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
866
867	return cfqq;
868}
869
870static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
871{
872	struct cfq_queue *cfqq = NULL;
873
874	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
875		/*
876		 * if current list is non-empty, grab first entry. if it is
877		 * empty, get next prio level and grab first entry then if any
878		 * are spliced
879		 */
880		cfqq = cfq_get_best_queue(cfqd);
881	} else if (!list_empty(&cfqd->busy_rr)) {
882		/*
883		 * If no new queues are available, check if the busy list has
884		 * some before falling back to idle io.
885		 */
886		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
887	} else if (!list_empty(&cfqd->idle_rr)) {
888		/*
889		 * if we have idle queues and no rt or be queues had pending
890		 * if we have idle queues and no RT or BE queues have pending
891		 * has passed or arm the idle grace timer
892		 */
893		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
894
895		if (time_after_eq(jiffies, end))
896			cfqq = list_entry_cfqq(cfqd->idle_rr.next);
897		else
898			mod_timer(&cfqd->idle_class_timer, end);
899	}
900
901	return cfqq;
902}
903
904static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
905{
906	struct cfq_queue *cfqq;
907
908	do {
909		long prio;
910
911		cfqq = cfq_get_next_queue(cfqd);
912		if (!cfqq)
913			break;
914
915		prio = cfq_prio_to_slice(cfqd, cfqq);
916		if (cfqq->slice_resid > -prio)
917			break;
918
919		cfqq->slice_resid += prio;
920		list_del_init(&cfqq->cfq_list);
921		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
922		cfqq = NULL;
923	} while (1);
924
925	__cfq_set_active_queue(cfqd, cfqq);
926	return cfqq;
927}
928
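/*
 * a request is considered "close" if it lies within the active process'
 * mean seek distance of the last dispatched position
 */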
929static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
930{
931	struct cfq_io_context *cic = cfqd->active_cic;
932
933	if (!sample_valid(cic->seek_samples))
934		return 0;
935
936	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
937}
938
939static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd,
940						struct cfq_queue *cur_cfqq,
941						struct list_head *list)
942{
943	struct cfq_queue *cfqq;
944
945	list_for_each_entry(cfqq, list, cfq_list) {
946		if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq))
947			continue;
948
949		BUG_ON(!cfqq->next_rq);
950
951		if (cfq_rq_close(cfqd, cfqq->next_rq))
952			return cfqq;
953	}
954
955	return NULL;
956}
957
958static int cfq_close_cooperator(struct cfq_data *cfqd,
959				struct cfq_queue *cur_cfqq)
960{
961	struct cfq_queue *cfqq;
962
963	if (!cfqd->busy_queues)
964		return 0;
965
966	/*
967	 * check cur_rr and same-prio rr_list for candidates
968	 */
969	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr);
970	if (cfqq)
971		return 1;
972
973	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]);
974	if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick))
975		cfqq = NULL;
976
977	return cfqq != NULL;
978}
979
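/*
 * a mean seek distance above 8192 sectors (4MB) marks the process as seeky
 */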
980#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
981
982static void cfq_arm_slice_timer(struct cfq_data *cfqd)
983{
984	struct cfq_queue *cfqq = cfqd->active_queue;
985	struct cfq_io_context *cic;
986	unsigned long sl;
987
988	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
989	WARN_ON(cfq_cfqq_slice_new(cfqq));
990
991	/*
992	 * idle is disabled, either manually or by past process history
993	 */
994	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
995		return;
996
997	/*
998	 * task has exited, don't wait
999	 */
1000	cic = cfqd->active_cic;
1001	if (!cic || !cic->ioc->task)
1002		return;
1003
1004	/*
1005	 * See if this prio level has a good candidate
1006	 */
1007	if (cfq_close_cooperator(cfqd, cfqq))
1008		return;
1009
1010	cfq_mark_cfqq_must_dispatch(cfqq);
1011	cfq_mark_cfqq_wait_request(cfqq);
1012
1013	/*
1014	 * we don't want to idle for seeks, but we do want to allow
1015	 * fair distribution of slice time for a process doing back-to-back
1016	 * seeks. so allow a little bit of time for it to submit a new rq
1017	 */
1018	sl = cfqd->cfq_slice_idle;
1019	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
1020		sl = min(sl, msecs_to_jiffies(2));
1021
1022	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1023}
1024
1025static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
1026{
1027	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1028
1029	cfq_remove_request(rq);
1030	cfqq->dispatched++;
1031	elv_dispatch_sort(q, rq);
1032}
1033
1034/*
1035 * return expired entry, or NULL to just start from scratch in rbtree
1036 */
1037static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1038{
1039	struct cfq_data *cfqd = cfqq->cfqd;
1040	struct request *rq;
1041	int fifo;
1042
1043	if (cfq_cfqq_fifo_expire(cfqq))
1044		return NULL;
1045
1046	cfq_mark_cfqq_fifo_expire(cfqq);
1047
1048	if (list_empty(&cfqq->fifo))
1049		return NULL;
1050
1051	fifo = cfq_cfqq_sync(cfqq);
1052	rq = rq_entry_fifo(cfqq->fifo.next);
1053
1054	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
1055		return NULL;
1056
1057	return rq;
1058}
1059
1060static inline int
1061cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1062{
1063	const int base_rq = cfqd->cfq_slice_async_rq;
1064
1065	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1066
1067	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
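	/*
	 * with the default cfq_slice_async_rq of 2 this allows up to 32
	 * requests per slice for ioprio 0, down to 4 for ioprio 7
	 */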
1068}
1069
1070/*
1071 * get next queue for service
1072 */
1073static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1074{
1075	struct cfq_queue *cfqq;
1076
1077	cfqq = cfqd->active_queue;
1078	if (!cfqq)
1079		goto new_queue;
1080
1081	/*
1082	 * The active queue has run out of time, expire it and select new.
1083	 */
1084	if (cfq_slice_used(cfqq))
1085		goto expire;
1086
1087	/*
1088	 * The active queue has requests and isn't expired, allow it to
1089	 * dispatch.
1090	 */
1091	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
1092		goto keep_queue;
1093
1094	/*
1095	 * No requests pending. If the active queue still has requests in
1096	 * flight or is idling for a new request, allow either of these
1097	 * conditions to happen (or time out) before selecting a new queue.
1098	 */
1099	if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) {
1100		cfqq = NULL;
1101		goto keep_queue;
1102	}
1103
1104expire:
1105	cfq_slice_expired(cfqd, 0, 0);
1106new_queue:
1107	cfqq = cfq_set_active_queue(cfqd);
1108keep_queue:
1109	return cfqq;
1110}
1111
1112static int
1113__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1114			int max_dispatch)
1115{
1116	int dispatched = 0;
1117
1118	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1119
1120	do {
1121		struct request *rq;
1122
1123		/*
1124		 * follow the expired path, else dispatch the next best request
1125		 */
1126		if ((rq = cfq_check_fifo(cfqq)) == NULL)
1127			rq = cfqq->next_rq;
1128
1129		/*
1130		 * finally, insert request into driver dispatch list
1131		 */
1132		cfq_dispatch_insert(cfqd->queue, rq);
1133
1134		cfqd->dispatch_slice++;
1135		dispatched++;
1136
1137		if (!cfqd->active_cic) {
1138			atomic_inc(&RQ_CIC(rq)->ioc->refcount);
1139			cfqd->active_cic = RQ_CIC(rq);
1140		}
1141
1142		if (RB_EMPTY_ROOT(&cfqq->sort_list))
1143			break;
1144
1145	} while (dispatched < max_dispatch);
1146
1147	/*
1148	 * expire an async queue immediately if it has used up its slice. idle
1149	 * queues always expire after 1 dispatch round.
1150	 */
1151	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1152	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1153	    cfq_class_idle(cfqq))) {
1154		cfqq->slice_end = jiffies + 1;
1155		cfq_slice_expired(cfqd, 0, 0);
1156	}
1157
1158	return dispatched;
1159}
1160
1161static int
1162cfq_forced_dispatch_cfqqs(struct list_head *list)
1163{
1164	struct cfq_queue *cfqq, *next;
1165	int dispatched;
1166
1167	dispatched = 0;
1168	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1169		while (cfqq->next_rq) {
1170			cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1171			dispatched++;
1172		}
1173		BUG_ON(!list_empty(&cfqq->fifo));
1174	}
1175
1176	return dispatched;
1177}
1178
1179static int
1180cfq_forced_dispatch(struct cfq_data *cfqd)
1181{
1182	int i, dispatched = 0;
1183
1184	for (i = 0; i < CFQ_PRIO_LISTS; i++)
1185		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1186
1187	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1188	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1189	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1190
1191	cfq_slice_expired(cfqd, 0, 0);
1192
1193	BUG_ON(cfqd->busy_queues);
1194
1195	return dispatched;
1196}
1197
1198static int
1199cfq_dispatch_requests(request_queue_t *q, int force)
1200{
1201	struct cfq_data *cfqd = q->elevator->elevator_data;
1202	struct cfq_queue *cfqq;
1203	int dispatched;
1204
1205	if (!cfqd->busy_queues)
1206		return 0;
1207
1208	if (unlikely(force))
1209		return cfq_forced_dispatch(cfqd);
1210
1211	dispatched = 0;
1212	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
1213		int max_dispatch;
1214
1215		if (cfqd->busy_queues > 1) {
1216			/*
1217			 * We have dispatched before in this round. If the
1218			 * next queue has idling enabled (must be sync), don't
1219			 * allow it service until the previous requests have completed.
1220			 */
1221			if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) &&
1222			    dispatched)
1223				break;
1224			if (cfqq->dispatched >= cfqd->cfq_quantum)
1225				break;
1226		}
1227
1228		cfq_clear_cfqq_must_dispatch(cfqq);
1229		cfq_clear_cfqq_wait_request(cfqq);
1230		del_timer(&cfqd->idle_slice_timer);
1231
1232		max_dispatch = cfqd->cfq_quantum;
1233		if (cfq_class_idle(cfqq))
1234			max_dispatch = 1;
1235
1236		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1237	}
1238
1239	return dispatched;
1240}
1241
1242/*
1243 * task holds one reference to the queue, dropped when task exits. each rq
1244 * in-flight on this queue also holds a reference, dropped when rq is freed.
1245 *
1246 * queue lock must be held here.
1247 */
1248static void cfq_put_queue(struct cfq_queue *cfqq)
1249{
1250	struct cfq_data *cfqd = cfqq->cfqd;
1251
1252	BUG_ON(atomic_read(&cfqq->ref) <= 0);
1253
1254	if (!atomic_dec_and_test(&cfqq->ref))
1255		return;
1256
1257	BUG_ON(rb_first(&cfqq->sort_list));
1258	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1259	BUG_ON(cfq_cfqq_on_rr(cfqq));
1260
1261	if (unlikely(cfqd->active_queue == cfqq)) {
1262		__cfq_slice_expired(cfqd, cfqq, 0, 0);
1263		cfq_schedule_dispatch(cfqd);
1264	}
1265
1266	/*
1267	 * it's on the empty list and still hashed
1268	 */
1269	list_del(&cfqq->cfq_list);
1270	hlist_del(&cfqq->cfq_hash);
1271	kmem_cache_free(cfq_pool, cfqq);
1272}
1273
1274static struct cfq_queue *
1275__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1276		    const int hashval)
1277{
1278	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1279	struct hlist_node *entry;
1280	struct cfq_queue *__cfqq;
1281
1282	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1283		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1284
1285		if (__cfqq->key == key && (__p == prio || !prio))
1286			return __cfqq;
1287	}
1288
1289	return NULL;
1290}
1291
1292static struct cfq_queue *
1293cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1294{
1295	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1296}
1297
1298static void cfq_free_io_context(struct io_context *ioc)
1299{
1300	struct cfq_io_context *__cic;
1301	struct rb_node *n;
1302	int freed = 0;
1303
1304	while ((n = rb_first(&ioc->cic_root)) != NULL) {
1305		__cic = rb_entry(n, struct cfq_io_context, rb_node);
1306		rb_erase(&__cic->rb_node, &ioc->cic_root);
1307		kmem_cache_free(cfq_ioc_pool, __cic);
1308		freed++;
1309	}
1310
1311	elv_ioc_count_mod(ioc_count, -freed);
1312
1313	if (ioc_gone && !elv_ioc_count_read(ioc_count))
1314		complete(ioc_gone);
1315}
1316
1317static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1318{
1319	if (unlikely(cfqq == cfqd->active_queue)) {
1320		__cfq_slice_expired(cfqd, cfqq, 0, 0);
1321		cfq_schedule_dispatch(cfqd);
1322	}
1323
1324	cfq_put_queue(cfqq);
1325}
1326
1327static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1328					 struct cfq_io_context *cic)
1329{
1330	list_del_init(&cic->queue_list);
1331	smp_wmb();
1332	cic->key = NULL;
1333
1334	if (cic->cfqq[ASYNC]) {
1335		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
1336		cic->cfqq[ASYNC] = NULL;
1337	}
1338
1339	if (cic->cfqq[SYNC]) {
1340		cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
1341		cic->cfqq[SYNC] = NULL;
1342	}
1343}
1344
1345
1346/*
1347 * Called with interrupts disabled
1348 */
1349static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1350{
1351	struct cfq_data *cfqd = cic->key;
1352
1353	if (cfqd) {
1354		request_queue_t *q = cfqd->queue;
1355
1356		spin_lock_irq(q->queue_lock);
1357		__cfq_exit_single_io_context(cfqd, cic);
1358		spin_unlock_irq(q->queue_lock);
1359	}
1360}
1361
1362static void cfq_exit_io_context(struct io_context *ioc)
1363{
1364	struct cfq_io_context *__cic;
1365	struct rb_node *n;
1366
1367	/*
1368	 * put the reference this task is holding to the various queues
1369	 */
1370
1371	n = rb_first(&ioc->cic_root);
1372	while (n != NULL) {
1373		__cic = rb_entry(n, struct cfq_io_context, rb_node);
1374
1375		cfq_exit_single_io_context(__cic);
1376		n = rb_next(n);
1377	}
1378}
1379
1380static struct cfq_io_context *
1381cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1382{
1383	struct cfq_io_context *cic;
1384
1385	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node);
1386	if (cic) {
1387		memset(cic, 0, sizeof(*cic));
1388		cic->last_end_request = jiffies;
1389		INIT_LIST_HEAD(&cic->queue_list);
1390		cic->dtor = cfq_free_io_context;
1391		cic->exit = cfq_exit_io_context;
1392		elv_ioc_count_inc(ioc_count);
1393	}
1394
1395	return cic;
1396}
1397
1398static void cfq_init_prio_data(struct cfq_queue *cfqq)
1399{
1400	struct task_struct *tsk = current;
1401	int ioprio_class;
1402
1403	if (!cfq_cfqq_prio_changed(cfqq))
1404		return;
1405
1406	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1407	switch (ioprio_class) {
1408		default:
1409			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1410		case IOPRIO_CLASS_NONE:
1411			/*
1412			 * no prio set, place us in the middle of the BE classes
1413			 */
1414			cfqq->ioprio = task_nice_ioprio(tsk);
1415			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1416			break;
1417		case IOPRIO_CLASS_RT:
1418			cfqq->ioprio = task_ioprio(tsk);
1419			cfqq->ioprio_class = IOPRIO_CLASS_RT;
1420			break;
1421		case IOPRIO_CLASS_BE:
1422			cfqq->ioprio = task_ioprio(tsk);
1423			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1424			break;
1425		case IOPRIO_CLASS_IDLE:
1426			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1427			cfqq->ioprio = 7;
1428			cfq_clear_cfqq_idle_window(cfqq);
1429			break;
1430	}
1431
1432	/*
1433	 * keep track of original prio settings in case we have to temporarily
1434	 * elevate the priority of this queue
1435	 */
1436	cfqq->org_ioprio = cfqq->ioprio;
1437	cfqq->org_ioprio_class = cfqq->ioprio_class;
1438
1439	cfq_resort_rr_list(cfqq, 0);
1440	cfq_clear_cfqq_prio_changed(cfqq);
1441}
1442
1443static inline void changed_ioprio(struct cfq_io_context *cic)
1444{
1445	struct cfq_data *cfqd = cic->key;
1446	struct cfq_queue *cfqq;
1447	unsigned long flags;
1448
1449	if (unlikely(!cfqd))
1450		return;
1451
1452	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1453
1454	cfqq = cic->cfqq[ASYNC];
1455	if (cfqq) {
1456		struct cfq_queue *new_cfqq;
1457		new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
1458					 GFP_ATOMIC);
1459		if (new_cfqq) {
1460			cic->cfqq[ASYNC] = new_cfqq;
1461			cfq_put_queue(cfqq);
1462		}
1463	}
1464
1465	cfqq = cic->cfqq[SYNC];
1466	if (cfqq)
1467		cfq_mark_cfqq_prio_changed(cfqq);
1468
1469	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1470}
1471
1472static void cfq_ioc_set_ioprio(struct io_context *ioc)
1473{
1474	struct cfq_io_context *cic;
1475	struct rb_node *n;
1476
1477	ioc->ioprio_changed = 0;
1478
1479	n = rb_first(&ioc->cic_root);
1480	while (n != NULL) {
1481		cic = rb_entry(n, struct cfq_io_context, rb_node);
1482
1483		changed_ioprio(cic);
1484		n = rb_next(n);
1485	}
1486}
1487
1488static struct cfq_queue *
1489cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1490	      gfp_t gfp_mask)
1491{
1492	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1493	struct cfq_queue *cfqq, *new_cfqq = NULL;
1494	unsigned short ioprio;
1495
1496retry:
1497	ioprio = tsk->ioprio;
1498	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1499
1500	if (!cfqq) {
1501		if (new_cfqq) {
1502			cfqq = new_cfqq;
1503			new_cfqq = NULL;
1504		} else if (gfp_mask & __GFP_WAIT) {
1505			/*
1506			 * Inform the allocator of the fact that we will
1507			 * just repeat this allocation if it fails, to allow
1508			 * the allocator to do whatever it needs to attempt to
1509			 * free memory.
1510			 */
1511			spin_unlock_irq(cfqd->queue->queue_lock);
1512			new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node);
1513			spin_lock_irq(cfqd->queue->queue_lock);
1514			goto retry;
1515		} else {
1516			cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node);
1517			if (!cfqq)
1518				goto out;
1519		}
1520
1521		memset(cfqq, 0, sizeof(*cfqq));
1522
1523		INIT_HLIST_NODE(&cfqq->cfq_hash);
1524		INIT_LIST_HEAD(&cfqq->cfq_list);
1525		INIT_LIST_HEAD(&cfqq->fifo);
1526
1527		cfqq->key = key;
1528		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1529		atomic_set(&cfqq->ref, 0);
1530		cfqq->cfqd = cfqd;
1531
1532		if (key != CFQ_KEY_ASYNC)
1533			cfq_mark_cfqq_idle_window(cfqq);
1534
1535		cfq_mark_cfqq_prio_changed(cfqq);
1536		cfq_mark_cfqq_queue_new(cfqq);
1537		cfq_init_prio_data(cfqq);
1538	}
1539
1540	if (new_cfqq)
1541		kmem_cache_free(cfq_pool, new_cfqq);
1542
1543	atomic_inc(&cfqq->ref);
1544out:
1545	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1546	return cfqq;
1547}
1548
1549static void
1550cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1551{
1552	WARN_ON(!list_empty(&cic->queue_list));
1553	rb_erase(&cic->rb_node, &ioc->cic_root);
1554	kmem_cache_free(cfq_ioc_pool, cic);
1555	elv_ioc_count_dec(ioc_count);
1556}
1557
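/*
 * look up the cfq_io_context for this process/device pair; the per-ioc
 * rbtree is keyed by the cfq_data pointer
 */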
1558static struct cfq_io_context *
1559cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1560{
1561	struct rb_node *n;
1562	struct cfq_io_context *cic;
1563	void *k, *key = cfqd;
1564
1565restart:
1566	n = ioc->cic_root.rb_node;
1567	while (n) {
1568		cic = rb_entry(n, struct cfq_io_context, rb_node);
1569		/* ->key must be copied to avoid race with cfq_exit_queue() */
1570		k = cic->key;
1571		if (unlikely(!k)) {
1572			cfq_drop_dead_cic(ioc, cic);
1573			goto restart;
1574		}
1575
1576		if (key < k)
1577			n = n->rb_left;
1578		else if (key > k)
1579			n = n->rb_right;
1580		else
1581			return cic;
1582	}
1583
1584	return NULL;
1585}
1586
1587static inline void
1588cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1589	     struct cfq_io_context *cic)
1590{
1591	struct rb_node **p;
1592	struct rb_node *parent;
1593	struct cfq_io_context *__cic;
1594	unsigned long flags;
1595	void *k;
1596
1597	cic->ioc = ioc;
1598	cic->key = cfqd;
1599
1600restart:
1601	parent = NULL;
1602	p = &ioc->cic_root.rb_node;
1603	while (*p) {
1604		parent = *p;
1605		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
1606		/* ->key must be copied to avoid race with cfq_exit_queue() */
1607		k = __cic->key;
1608		if (unlikely(!k)) {
1609			cfq_drop_dead_cic(ioc, __cic);
1610			goto restart;
1611		}
1612
1613		if (cic->key < k)
1614			p = &(*p)->rb_left;
1615		else if (cic->key > k)
1616			p = &(*p)->rb_right;
1617		else
1618			BUG();
1619	}
1620
1621	rb_link_node(&cic->rb_node, parent, p);
1622	rb_insert_color(&cic->rb_node, &ioc->cic_root);
1623
1624	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1625	list_add(&cic->queue_list, &cfqd->cic_list);
1626	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1627}
1628
1629/*
1630 * Setup general io context and cfq io context. There can be several cfq
1631 * io contexts per general io context, if this process is doing io to more
1632 * than one device managed by cfq.
1633 */
1634static struct cfq_io_context *
1635cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1636{
1637	struct io_context *ioc = NULL;
1638	struct cfq_io_context *cic;
1639
1640	might_sleep_if(gfp_mask & __GFP_WAIT);
1641
1642	ioc = get_io_context(gfp_mask, cfqd->queue->node);
1643	if (!ioc)
1644		return NULL;
1645
1646	cic = cfq_cic_rb_lookup(cfqd, ioc);
1647	if (cic)
1648		goto out;
1649
1650	cic = cfq_alloc_io_context(cfqd, gfp_mask);
1651	if (cic == NULL)
1652		goto err;
1653
1654	cfq_cic_link(cfqd, ioc, cic);
1655out:
1656	smp_read_barrier_depends();
1657	if (unlikely(ioc->ioprio_changed))
1658		cfq_ioc_set_ioprio(ioc);
1659
1660	return cic;
1661err:
1662	put_io_context(ioc);
1663	return NULL;
1664}
1665
1666static void
1667cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1668{
1669	unsigned long elapsed = jiffies - cic->last_end_request;
1670	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1671
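	/*
	 * exponentially weighted moving averages with a 7/8 decay: each new
	 * sample carries 1/8 weight, the sample count saturates toward 256
	 */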
1672	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1673	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1674	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1675}
1676
1677static void
1678cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1679		       struct request *rq)
1680{
1681	sector_t sdist;
1682	u64 total;
1683
1684	if (cic->last_request_pos < rq->sector)
1685		sdist = rq->sector - cic->last_request_pos;
1686	else
1687		sdist = cic->last_request_pos - rq->sector;
1688
1689	if (!cic->seek_samples) {
1690		cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1691		cfqd->new_seek_mean = cfqd->new_seek_total / 256;
1692	}
1693
1694	/*
1695	 * Don't allow the seek distance to get too large from the
1696	 * odd fragment, pagein, etc
1697	 */
1698	if (cic->seek_samples <= 60) /* second&third seek */
1699		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1700	else
1701		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
1702
1703	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1704	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1705	total = cic->seek_total + (cic->seek_samples/2);
1706	do_div(total, cic->seek_samples);
1707	cic->seek_mean = (sector_t)total;
1708}
1709
1710/*
1711 * Disable idle window if the process thinks too long or seeks so much that
1712 * it doesn't matter
1713 */
1714static void
1715cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1716		       struct cfq_io_context *cic)
1717{
1718	int enable_idle = cfq_cfqq_idle_window(cfqq);
1719
1720	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
1721	    (cfqd->hw_tag && CIC_SEEKY(cic)))
1722		enable_idle = 0;
1723	else if (sample_valid(cic->ttime_samples)) {
1724		if (cic->ttime_mean > cfqd->cfq_slice_idle)
1725			enable_idle = 0;
1726		else
1727			enable_idle = 1;
1728	}
1729
1730	if (enable_idle)
1731		cfq_mark_cfqq_idle_window(cfqq);
1732	else
1733		cfq_clear_cfqq_idle_window(cfqq);
1734}
1735
1736/*
1737 * Check if new_cfqq should preempt the currently active queue. Return 0
1738 * for no (or if we aren't sure); a return of 1 will cause a preempt.
1739 */
1740static int
1741cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1742		   struct request *rq)
1743{
1744	struct cfq_queue *cfqq;
1745
1746	cfqq = cfqd->active_queue;
1747	if (!cfqq)
1748		return 0;
1749
1750	if (cfq_slice_used(cfqq))
1751		return 1;
1752
1753	if (cfq_class_idle(new_cfqq))
1754		return 0;
1755
1756	if (cfq_class_idle(cfqq))
1757		return 1;
1758
1759	/*
1760	 * if the new request is sync, but the currently running queue is
1761	 * not, let the sync request have priority.
1762	 */
1763	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
1764		return 1;
1765
1766	/*
1767	 * So both queues are sync. Let the new request get disk time if
1768	 * it's a metadata request and the current queue is doing regular IO.
1769	 */
1770	if (rq_is_meta(rq) && !cfqq->meta_pending)
1771		return 1;
1772
1773	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
1774		return 0;
1775
1776	/*
1777	 * if this request is as-good as one we would expect from the
1778	 * current cfqq, let it preempt
1779	 */
1780	if (cfq_rq_close(cfqd, rq))
1781		return 1;
1782
1783	return 0;
1784}
1785
1786/*
1787 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1788 * let it have half of its nominal slice.
1789 */
1790static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1791{
1792	cfq_slice_expired(cfqd, 1, 1);
1793
1794	/*
1795	 * Put the new queue at the front of the current list,
1796	 * so we know that it will be selected next.
1797	 */
1798	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1799	list_move(&cfqq->cfq_list, &cfqd->cur_rr);
1800
1801	cfqq->slice_end = 0;
1802	cfq_mark_cfqq_slice_new(cfqq);
1803}
1804
1805/*
1806 * Called when a new fs request (rq) is added (to cfqq). Check if there's
1807 * something we should do about it
1808 */
1809static void
1810cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1811		struct request *rq)
1812{
1813	struct cfq_io_context *cic = RQ_CIC(rq);
1814
1815	if (rq_is_meta(rq))
1816		cfqq->meta_pending++;
1817
1818	cfq_update_io_thinktime(cfqd, cic);
1819	cfq_update_io_seektime(cfqd, cic, rq);
1820	cfq_update_idle_window(cfqd, cfqq, cic);
1821
1822	cic->last_request_pos = rq->sector + rq->nr_sectors;
1823	cfqq->last_request_pos = cic->last_request_pos;
1824
1825	if (cfqq == cfqd->active_queue) {
1826		/*
1827		 * if we are waiting for a request for this queue, let it rip
1828		 * immediately and flag that we must not expire this queue
1829		 * just now
1830		 */
1831		if (cfq_cfqq_wait_request(cfqq)) {
1832			cfq_mark_cfqq_must_dispatch(cfqq);
1833			del_timer(&cfqd->idle_slice_timer);
1834			blk_start_queueing(cfqd->queue);
1835		}
1836	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1837		/*
1838		 * not the active queue - expire current slice if it is
1839		 * idle and has expired it's mean thinktime or this new queue
1840		 * idle and has expired its mean thinktime, or this new queue
1841		 */
1842		cfq_preempt_queue(cfqd, cfqq);
1843		cfq_mark_cfqq_must_dispatch(cfqq);
1844		blk_start_queueing(cfqd->queue);
1845	}
1846}
1847
1848static void cfq_insert_request(request_queue_t *q, struct request *rq)
1849{
1850	struct cfq_data *cfqd = q->elevator->elevator_data;
1851	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1852
1853	cfq_init_prio_data(cfqq);
1854
1855	cfq_add_rq_rb(rq);
1856
1857	list_add_tail(&rq->queuelist, &cfqq->fifo);
1858
1859	cfq_rq_enqueued(cfqd, cfqq, rq);
1860}
1861
1862static void cfq_completed_request(request_queue_t *q, struct request *rq)
1863{
1864	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1865	struct cfq_data *cfqd = cfqq->cfqd;
1866	const int sync = rq_is_sync(rq);
1867	unsigned long now;
1868
1869	now = jiffies;
1870
1871	WARN_ON(!cfqd->rq_in_driver);
1872	WARN_ON(!cfqq->dispatched);
1873	cfqd->rq_in_driver--;
1874	cfqq->dispatched--;
1875	cfqq->service_last = now;
1876
1877	if (!cfq_class_idle(cfqq))
1878		cfqd->last_end_request = now;
1879
1880	cfq_resort_rr_list(cfqq, 0);
1881
1882	if (sync)
1883		RQ_CIC(rq)->last_end_request = now;
1884
1885	/*
1886	 * If this is the active queue, check if it needs to be expired,
1887	 * or if we want to idle in case it has no pending requests.
1888	 */
1889	if (cfqd->active_queue == cfqq) {
1890		if (cfq_cfqq_slice_new(cfqq)) {
1891			cfq_set_prio_slice(cfqd, cfqq);
1892			cfq_clear_cfqq_slice_new(cfqq);
1893		}
1894		if (cfq_slice_used(cfqq))
1895			cfq_slice_expired(cfqd, 0, 1);
1896		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
1897			cfq_arm_slice_timer(cfqd);
1898	}
1899
1900	if (!cfqd->rq_in_driver)
1901		cfq_schedule_dispatch(cfqd);
1902}
1903
1904/*
1905 * we temporarily boost lower priority queues if they are holding fs exclusive
1906 * resources. they are boosted to normal prio (CLASS_BE/4)
1907 */
1908static void cfq_prio_boost(struct cfq_queue *cfqq)
1909{
1910	const int ioprio_class = cfqq->ioprio_class;
1911	const int ioprio = cfqq->ioprio;
1912
1913	if (has_fs_excl()) {
1914		/*
1915		 * boost idle prio on transactions that would lock out other
1916		 * users of the filesystem
1917		 */
1918		if (cfq_class_idle(cfqq))
1919			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1920		if (cfqq->ioprio > IOPRIO_NORM)
1921			cfqq->ioprio = IOPRIO_NORM;
1922	} else {
1923		/*
1924		 * check if we need to unboost the queue
1925		 */
1926		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1927			cfqq->ioprio_class = cfqq->org_ioprio_class;
1928		if (cfqq->ioprio != cfqq->org_ioprio)
1929			cfqq->ioprio = cfqq->org_ioprio;
1930	}
1931
1932	/*
1933	 * refile between round-robin lists if we moved the priority class
1934	 */
1935	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio))
1936		cfq_resort_rr_list(cfqq, 0);
1937}
1938
1939static inline int __cfq_may_queue(struct cfq_queue *cfqq)
1940{
1941	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1942	    !cfq_cfqq_must_alloc_slice(cfqq)) {
1943		cfq_mark_cfqq_must_alloc_slice(cfqq);
1944		return ELV_MQUEUE_MUST;
1945	}
1946
1947	return ELV_MQUEUE_MAY;
1948}
1949
1950static int cfq_may_queue(request_queue_t *q, int rw)
1951{
1952	struct cfq_data *cfqd = q->elevator->elevator_data;
1953	struct task_struct *tsk = current;
1954	struct cfq_queue *cfqq;
1955	unsigned int key;
1956
1957	key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC);
1958
1959	/*
1960	 * don't force setup of a queue from here, as a call to may_queue
1961	 * does not necessarily imply that a request actually will be queued.
1962	 * so just lookup a possibly existing queue, or return 'may queue'
1963	 * if that fails
1964	 */
1965	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
1966	if (cfqq) {
1967		cfq_init_prio_data(cfqq);
1968		cfq_prio_boost(cfqq);
1969
1970		return __cfq_may_queue(cfqq);
1971	}
1972
1973	return ELV_MQUEUE_MAY;
1974}
1975
1976/*
1977 * queue lock held here
1978 */
1979static void cfq_put_request(struct request *rq)
1980{
1981	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1982
1983	if (cfqq) {
1984		const int rw = rq_data_dir(rq);
1985
1986		BUG_ON(!cfqq->allocated[rw]);
1987		cfqq->allocated[rw]--;
1988
1989		put_io_context(RQ_CIC(rq)->ioc);
1990
1991		rq->elevator_private = NULL;
1992		rq->elevator_private2 = NULL;
1993
1994		cfq_put_queue(cfqq);
1995	}
1996}
1997
1998/*
1999 * Allocate cfq data structures associated with this request.
2000 */
2001static int
2002cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
2003{
2004	struct cfq_data *cfqd = q->elevator->elevator_data;
2005	struct task_struct *tsk = current;
2006	struct cfq_io_context *cic;
2007	const int rw = rq_data_dir(rq);
2008	const int is_sync = rq_is_sync(rq);
2009	pid_t key = cfq_queue_pid(tsk, rw, is_sync);
2010	struct cfq_queue *cfqq;
2011	unsigned long flags;
2012
2013	might_sleep_if(gfp_mask & __GFP_WAIT);
2014
2015	cic = cfq_get_io_context(cfqd, gfp_mask);
2016
2017	spin_lock_irqsave(q->queue_lock, flags);
2018
2019	if (!cic)
2020		goto queue_fail;
2021
2022	if (!cic->cfqq[is_sync]) {
2023		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
2024		if (!cfqq)
2025			goto queue_fail;
2026
2027		cic->cfqq[is_sync] = cfqq;
2028	} else
2029		cfqq = cic->cfqq[is_sync];
2030
2031	cfqq->allocated[rw]++;
2032	cfq_clear_cfqq_must_alloc(cfqq);
2033	atomic_inc(&cfqq->ref);
2034
2035	spin_unlock_irqrestore(q->queue_lock, flags);
2036
2037	rq->elevator_private = cic;
2038	rq->elevator_private2 = cfqq;
2039	return 0;
2040
2041queue_fail:
2042	if (cic)
2043		put_io_context(cic->ioc);
2044
2045	cfq_schedule_dispatch(cfqd);
2046	spin_unlock_irqrestore(q->queue_lock, flags);
2047	return 1;
2048}
2049
2050static void cfq_kick_queue(struct work_struct *work)
2051{
2052	struct cfq_data *cfqd =
2053		container_of(work, struct cfq_data, unplug_work);
2054	request_queue_t *q = cfqd->queue;
2055	unsigned long flags;
2056
2057	spin_lock_irqsave(q->queue_lock, flags);
2058	blk_start_queueing(q);
2059	spin_unlock_irqrestore(q->queue_lock, flags);
2060}
2061
2062/*
2063 * Timer running if the active_queue is currently idling inside its time slice
2064 */
2065static void cfq_idle_slice_timer(unsigned long data)
2066{
2067	struct cfq_data *cfqd = (struct cfq_data *) data;
2068	struct cfq_queue *cfqq;
2069	unsigned long flags;
2070	int timed_out = 1;
2071
2072	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2073
2074	if ((cfqq = cfqd->active_queue) != NULL) {
2075		timed_out = 0;
2076
2077		/*
2078		 * expired
2079		 */
2080		if (cfq_slice_used(cfqq))
2081			goto expire;
2082
2083		/*
2084		 * only expire and reinvoke the request handler if there are
2085		 * other queues with pending requests
2086		 */
2087		if (!cfqd->busy_queues)
2088			goto out_cont;
2089
2090		/*
2091		 * not expired and a request is pending, so let the queue dispatch
2092		 */
2093		if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
2094			cfq_mark_cfqq_must_dispatch(cfqq);
2095			goto out_kick;
2096		}
2097	}
2098expire:
2099	cfq_slice_expired(cfqd, 0, timed_out);
2100out_kick:
2101	cfq_schedule_dispatch(cfqd);
2102out_cont:
2103	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2104}
2105
2106/*
2107 * Timer running if an idle class queue is waiting for service
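 * Idle class queues are only served once the device has been quiet for
 * CFQ_IDLE_GRACE jiffies since the last request completion.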
2108 */
2109static void cfq_idle_class_timer(unsigned long data)
2110{
2111	struct cfq_data *cfqd = (struct cfq_data *) data;
2112	unsigned long flags, end;
2113
2114	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2115
2116	/*
2117	 * raced with a non-idle queue: push the timer to the end of the grace period
2118	 */
2119	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2120	if (!time_after_eq(jiffies, end))
2121		mod_timer(&cfqd->idle_class_timer, end);
2122	else
2123		cfq_schedule_dispatch(cfqd);
2124
2125	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2126}
2127
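/*
 * Stop both timers and wait for any pending unplug work to finish before
 * the queue or cfqd is torn down.
 */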
2128static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2129{
2130	del_timer_sync(&cfqd->idle_slice_timer);
2131	del_timer_sync(&cfqd->idle_class_timer);
2132	blk_sync_queue(cfqd->queue);
2133}
2134
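/*
 * Tear down the elevator instance: expire the active queue, detach every
 * cfq_io_context still linked to this cfqd, then free the lookup hash and
 * cfqd itself.  The timers are shut down once more afterwards, since the
 * teardown above may have re-armed them.
 */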
2135static void cfq_exit_queue(elevator_t *e)
2136{
2137	struct cfq_data *cfqd = e->elevator_data;
2138	request_queue_t *q = cfqd->queue;
2139
2140	cfq_shutdown_timer_wq(cfqd);
2141
2142	spin_lock_irq(q->queue_lock);
2143
2144	if (cfqd->active_queue)
2145		__cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0);
2146
2147	while (!list_empty(&cfqd->cic_list)) {
2148		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2149							struct cfq_io_context,
2150							queue_list);
2151
2152		__cfq_exit_single_io_context(cfqd, cic);
2153	}
2154
2155	spin_unlock_irq(q->queue_lock);
2156
2157	cfq_shutdown_timer_wq(cfqd);
2158
2159	kfree(cfqd->cfq_hash);
2160	kfree(cfqd);
2161}
2162
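/*
 * Allocate and initialise the per-queue cfq_data: the per-priority round
 * robin lists, the cfqq lookup hash, both timers, the unplug work and the
 * default tunables.
 */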
2163static void *cfq_init_queue(request_queue_t *q)
2164{
2165	struct cfq_data *cfqd;
2166	int i;
2167
2168	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
2169	if (!cfqd)
2170		return NULL;
2171
2172	memset(cfqd, 0, sizeof(*cfqd));
2173
2174	for (i = 0; i < CFQ_PRIO_LISTS; i++)
2175		INIT_LIST_HEAD(&cfqd->rr_list[i]);
2176
2177	INIT_LIST_HEAD(&cfqd->busy_rr);
2178	INIT_LIST_HEAD(&cfqd->cur_rr);
2179	INIT_LIST_HEAD(&cfqd->idle_rr);
2180	INIT_LIST_HEAD(&cfqd->cic_list);
2181
2182	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
2183	if (!cfqd->cfq_hash)
2184		goto out_free;
2185
2186	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2187		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2188
2189	cfqd->queue = q;
2190
2191	init_timer(&cfqd->idle_slice_timer);
2192	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2193	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2194
2195	init_timer(&cfqd->idle_class_timer);
2196	cfqd->idle_class_timer.function = cfq_idle_class_timer;
2197	cfqd->idle_class_timer.data = (unsigned long) cfqd;
2198
2199	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
2200
2201	cfqd->cfq_quantum = cfq_quantum;
2202	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2203	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2204	cfqd->cfq_back_max = cfq_back_max;
2205	cfqd->cfq_back_penalty = cfq_back_penalty;
2206	cfqd->cfq_slice[0] = cfq_slice_async;
2207	cfqd->cfq_slice[1] = cfq_slice_sync;
2208	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2209	cfqd->cfq_slice_idle = cfq_slice_idle;
2210
2211	return cfqd;
2212out_free:
2213	kfree(cfqd);
2214	return NULL;
2215}
2216
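/*
 * slab caches for cfq_queue and cfq_io_context objects
 */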
2217static void cfq_slab_kill(void)
2218{
2219	if (cfq_pool)
2220		kmem_cache_destroy(cfq_pool);
2221	if (cfq_ioc_pool)
2222		kmem_cache_destroy(cfq_ioc_pool);
2223}
2224
2225static int __init cfq_slab_setup(void)
2226{
2227	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2228					NULL, NULL);
2229	if (!cfq_pool)
2230		goto fail;
2231
2232	cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2233			sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2234	if (!cfq_ioc_pool)
2235		goto fail;
2236
2237	return 0;
2238fail:
2239	cfq_slab_kill();
2240	return -ENOMEM;
2241}
2242
2243/*
2244 * sysfs parts below -->
2245 */
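/*
 * cfq_var_show()/cfq_var_store() read and write a single unsigned int;
 * the SHOW_FUNCTION/STORE_FUNCTION macros below instantiate one pair per
 * tunable, clamping stored values to [MIN, MAX] and, when __CONV is set,
 * converting between milliseconds (user visible) and jiffies (internal).
 */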
2246static ssize_t
2247cfq_var_show(unsigned int var, char *page)
2248{
2249	return sprintf(page, "%d\n", var);
2250}
2251
2252static ssize_t
2253cfq_var_store(unsigned int *var, const char *page, size_t count)
2254{
2255	char *p = (char *) page;
2256
2257	*var = simple_strtoul(p, &p, 10);
2258	return count;
2259}
2260
2261#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
2262static ssize_t __FUNC(elevator_t *e, char *page)			\
2263{									\
2264	struct cfq_data *cfqd = e->elevator_data;			\
2265	unsigned int __data = __VAR;					\
2266	if (__CONV)							\
2267		__data = jiffies_to_msecs(__data);			\
2268	return cfq_var_show(__data, (page));				\
2269}
2270SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2271SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2272SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2273SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2274SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2275SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2276SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2277SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2278SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2279#undef SHOW_FUNCTION
2280
2281#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
2282static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
2283{									\
2284	struct cfq_data *cfqd = e->elevator_data;			\
2285	unsigned int __data;						\
2286	int ret = cfq_var_store(&__data, (page), count);		\
2287	if (__data < (MIN))						\
2288		__data = (MIN);						\
2289	else if (__data > (MAX))					\
2290		__data = (MAX);						\
2291	if (__CONV)							\
2292		*(__PTR) = msecs_to_jiffies(__data);			\
2293	else								\
2294		*(__PTR) = __data;					\
2295	return ret;							\
2296}
2297STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2298STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2299STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2300STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2301STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2302STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2303STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2304STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2305STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2306#undef STORE_FUNCTION
2307
2308#define CFQ_ATTR(name) \
2309	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2310
2311static struct elv_fs_entry cfq_attrs[] = {
2312	CFQ_ATTR(quantum),
2313	CFQ_ATTR(fifo_expire_sync),
2314	CFQ_ATTR(fifo_expire_async),
2315	CFQ_ATTR(back_seek_max),
2316	CFQ_ATTR(back_seek_penalty),
2317	CFQ_ATTR(slice_sync),
2318	CFQ_ATTR(slice_async),
2319	CFQ_ATTR(slice_async_rq),
2320	CFQ_ATTR(slice_idle),
2321	__ATTR_NULL
2322};
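/*
 * Once this scheduler is active on a queue, these attributes appear under
 * sysfs, typically /sys/block/<dev>/queue/iosched/.  For example, assuming
 * a disk "sda" that is using cfq:
 *
 *	# cat /sys/block/sda/queue/iosched/slice_idle
 *	# echo 0 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Time based tunables are exchanged in milliseconds and converted to
 * jiffies by the store helpers above.
 */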
2323
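/*
 * Hook table handed to the elevator core.  elv_register() in cfq_init()
 * makes "cfq" selectable via the "elevator=" boot parameter or the
 * /sys/block/<dev>/queue/scheduler attribute.
 */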
2324static struct elevator_type iosched_cfq = {
2325	.ops = {
2326		.elevator_merge_fn = 		cfq_merge,
2327		.elevator_merged_fn =		cfq_merged_request,
2328		.elevator_merge_req_fn =	cfq_merged_requests,
2329		.elevator_allow_merge_fn =	cfq_allow_merge,
2330		.elevator_dispatch_fn =		cfq_dispatch_requests,
2331		.elevator_add_req_fn =		cfq_insert_request,
2332		.elevator_activate_req_fn =	cfq_activate_request,
2333		.elevator_deactivate_req_fn =	cfq_deactivate_request,
2334		.elevator_queue_empty_fn =	cfq_queue_empty,
2335		.elevator_completed_req_fn =	cfq_completed_request,
2336		.elevator_former_req_fn =	elv_rb_former_request,
2337		.elevator_latter_req_fn =	elv_rb_latter_request,
2338		.elevator_set_req_fn =		cfq_set_request,
2339		.elevator_put_req_fn =		cfq_put_request,
2340		.elevator_may_queue_fn =	cfq_may_queue,
2341		.elevator_init_fn =		cfq_init_queue,
2342		.elevator_exit_fn =		cfq_exit_queue,
2343		.trim =				cfq_free_io_context,
2344	},
2345	.elevator_attrs =	cfq_attrs,
2346	.elevator_name =	"cfq",
2347	.elevator_owner =	THIS_MODULE,
2348};
2349
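/*
 * Module init: fix up time tunables that round down to 0 on low HZ, set up
 * the slab caches and register the elevator.
 */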
2350static int __init cfq_init(void)
2351{
2352	int ret;
2353
2354	/*
2355	 * could be 0 on HZ < 1000 setups
2356	 */
2357	if (!cfq_slice_async)
2358		cfq_slice_async = 1;
2359	if (!cfq_slice_idle)
2360		cfq_slice_idle = 1;
2361
2362	if (cfq_slab_setup())
2363		return -ENOMEM;
2364
2365	ret = elv_register(&iosched_cfq);
2366	if (ret)
2367		cfq_slab_kill();
2368
2369	return ret;
2370}
2371
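/*
 * Module exit: unregister the elevator, then wait until the last
 * cfq_io_context has been freed (ioc_count reaching zero completes
 * ioc_gone) before destroying the slab caches.
 */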
2372static void __exit cfq_exit(void)
2373{
2374	DECLARE_COMPLETION_ONSTACK(all_gone);
2375	elv_unregister(&iosched_cfq);
2376	ioc_gone = &all_gone;
2377	/* ioc_gone's update must be visible before reading ioc_count */
2378	smp_wmb();
2379	if (elv_ioc_count_read(ioc_count))
2380		wait_for_completion(ioc_gone);
2381	synchronize_rcu();
2382	cfq_slab_kill();
2383}
2384
2385module_init(cfq_init);
2386module_exit(cfq_exit);
2387
2388MODULE_AUTHOR("Jens Axboe");
2389MODULE_LICENSE("GPL");
2390MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
2391