cfq-iosched.c revision bc1c116974a5c3f498112a6f175d3e4a8cd5bdbc
1/*
2 *  CFQ, or complete fairness queueing, disk scheduler.
3 *
4 *  Based on ideas from a previously unfinished io
5 *  scheduler (round robin per-process disk scheduling) and from Andrea Arcangeli.
6 *
7 *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8 */
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/hash.h>
14#include <linux/rbtree.h>
15#include <linux/ioprio.h>
16
17/*
18 * tunables
19 */
20static const int cfq_quantum = 4;		/* max queue in one round of service */
21static const int cfq_queued = 8;		/* minimum rq allocate limit per-queue */
22static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };	/* fifo expiry: [0] async, [1] sync */
23static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
24static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
25
26static const int cfq_slice_sync = HZ / 10;
27static int cfq_slice_async = HZ / 25;
28static const int cfq_slice_async_rq = 2;
29static int cfq_slice_idle = HZ / 70;
30
31#define CFQ_IDLE_GRACE		(HZ / 10)
32#define CFQ_SLICE_SCALE		(5)
33
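/*
 * sync io gets a per-process queue keyed by pid; async io is instead keyed
 * by CFQ_KEY_ASYNC and shared (see cfq_queue_pid() and cfq_get_queue())
 */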
34#define CFQ_KEY_ASYNC		(0)
35
36static DEFINE_SPINLOCK(cfq_exit_lock);
37
38/*
39 * for the hash of cfqq inside the cfqd
40 */
41#define CFQ_QHASH_SHIFT		6
42#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
43#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
44
45/*
46 * for the hash of crq inside the cfqd, keyed by request end sector
47 */
48#define CFQ_MHASH_SHIFT		6
49#define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
50#define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
51#define CFQ_MHASH_FN(sec)	hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
52#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
53#define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)
54
55#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
56#define list_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
57
58#define RQ_DATA(rq)		(rq)->elevator_private
59
60/*
61 * rb-tree defines
62 */
63#define RB_NONE			(2)
64#define RB_EMPTY(node)		((node)->rb_node == NULL)
65#define RB_CLEAR_COLOR(node)	(node)->rb_color = RB_NONE
66#define RB_CLEAR(node)		do {	\
67	(node)->rb_parent = NULL;	\
68	RB_CLEAR_COLOR((node));		\
69	(node)->rb_right = NULL;	\
70	(node)->rb_left = NULL;		\
71} while (0)
72#define RB_CLEAR_ROOT(root)	((root)->rb_node = NULL)
73#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
74#define rq_rb_key(rq)		(rq)->sector
75
76static kmem_cache_t *crq_pool;
77static kmem_cache_t *cfq_pool;
78static kmem_cache_t *cfq_ioc_pool;
79
80static atomic_t ioc_count = ATOMIC_INIT(0);
81static struct completion *ioc_gone;
82
83#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
84#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
85#define cfq_class_be(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
86#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
87
88#define ASYNC			(0)
89#define SYNC			(1)
90
91#define cfq_cfqq_dispatched(cfqq)	\
92	((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
93
94#define cfq_cfqq_class_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
95
96#define cfq_cfqq_sync(cfqq)		\
97	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
98
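/*
 * the ttime and seek stats are decaying averages whose sample counts
 * converge towards 256; don't trust them until enough samples are in
 */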
99#define sample_valid(samples)	((samples) > 80)
100
101/*
102 * Per block device queue structure
103 */
104struct cfq_data {
105	request_queue_t *queue;
106
107	/*
108	 * rr list of queues with requests and the count of them
109	 */
110	struct list_head rr_list[CFQ_PRIO_LISTS];
111	struct list_head busy_rr;
112	struct list_head cur_rr;
113	struct list_head idle_rr;
114	unsigned int busy_queues;
115
116	/*
117	 * non-ordered list of empty cfqq's
118	 */
119	struct list_head empty_list;
120
121	/*
122	 * cfqq lookup hash
123	 */
124	struct hlist_head *cfq_hash;
125
126	/*
127	 * global crq hash for all queues
128	 */
129	struct hlist_head *crq_hash;
130
131	unsigned int max_queued;
132
133	mempool_t *crq_pool;
134
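	/*
	 * rq_in_driver counts requests handed to the driver; hw_tag gets set
	 * once the observed queue depth suggests the device does its own
	 * command queueing (see cfq_activate_request())
	 */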
135	int rq_in_driver;
136	int hw_tag;
137
138	/*
139	 * schedule slice state info
140	 */
141	/*
142	 * idle window management
143	 */
144	struct timer_list idle_slice_timer;
145	struct work_struct unplug_work;
146
147	struct cfq_queue *active_queue;
148	struct cfq_io_context *active_cic;
149	int cur_prio, cur_end_prio;
150	unsigned int dispatch_slice;
151
152	struct timer_list idle_class_timer;
153
154	sector_t last_sector;
155	unsigned long last_end_request;
156
157	unsigned int rq_starved;
158
159	/*
160	 * tunables, see top of file
161	 */
162	unsigned int cfq_quantum;
163	unsigned int cfq_queued;
164	unsigned int cfq_fifo_expire[2];
165	unsigned int cfq_back_penalty;
166	unsigned int cfq_back_max;
167	unsigned int cfq_slice[2];
168	unsigned int cfq_slice_async_rq;
169	unsigned int cfq_slice_idle;
170
171	struct list_head cic_list;
172};
173
174/*
175 * Per process-grouping structure
176 */
177struct cfq_queue {
178	/* reference count */
179	atomic_t ref;
180	/* parent cfq_data */
181	struct cfq_data *cfqd;
182	/* cfqq lookup hash */
183	struct hlist_node cfq_hash;
184	/* hash key */
185	unsigned int key;
186	/* on either rr or empty list of cfqd */
187	struct list_head cfq_list;
188	/* sorted list of pending requests */
189	struct rb_root sort_list;
190	/* if fifo isn't expired, next request to serve */
191	struct cfq_rq *next_crq;
192	/* requests queued in sort_list */
193	int queued[2];
194	/* currently allocated requests */
195	int allocated[2];
196	/* fifo list of requests in sort_list */
197	struct list_head fifo;
198
199	unsigned long slice_start;
200	unsigned long slice_end;
201	unsigned long slice_left;
202	unsigned long service_last;
203
204	/* number of requests that are on the dispatch list */
205	int on_dispatch[2];
206
207	/* io prio of this group */
208	unsigned short ioprio, org_ioprio;
209	unsigned short ioprio_class, org_ioprio_class;
210
211	/* various state flags, see below */
212	unsigned int flags;
213};
214
215struct cfq_rq {
216	struct rb_node rb_node;
217	sector_t rb_key;
218	struct request *request;
219	struct hlist_node hash;
220
221	struct cfq_queue *cfq_queue;
222	struct cfq_io_context *io_context;
223
224	unsigned int crq_flags;
225};
226
227enum cfqq_state_flags {
228	CFQ_CFQQ_FLAG_on_rr = 0,
229	CFQ_CFQQ_FLAG_wait_request,
230	CFQ_CFQQ_FLAG_must_alloc,
231	CFQ_CFQQ_FLAG_must_alloc_slice,
232	CFQ_CFQQ_FLAG_must_dispatch,
233	CFQ_CFQQ_FLAG_fifo_expire,
234	CFQ_CFQQ_FLAG_idle_window,
235	CFQ_CFQQ_FLAG_prio_changed,
236};
237
238#define CFQ_CFQQ_FNS(name)						\
239static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
240{									\
241	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
242}									\
243static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
244{									\
245	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
246}									\
247static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
248{									\
249	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
250}
251
252CFQ_CFQQ_FNS(on_rr);
253CFQ_CFQQ_FNS(wait_request);
254CFQ_CFQQ_FNS(must_alloc);
255CFQ_CFQQ_FNS(must_alloc_slice);
256CFQ_CFQQ_FNS(must_dispatch);
257CFQ_CFQQ_FNS(fifo_expire);
258CFQ_CFQQ_FNS(idle_window);
259CFQ_CFQQ_FNS(prio_changed);
260#undef CFQ_CFQQ_FNS
261
262enum cfq_rq_state_flags {
263	CFQ_CRQ_FLAG_is_sync = 0,
264};
265
266#define CFQ_CRQ_FNS(name)						\
267static inline void cfq_mark_crq_##name(struct cfq_rq *crq)		\
268{									\
269	crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);			\
270}									\
271static inline void cfq_clear_crq_##name(struct cfq_rq *crq)		\
272{									\
273	crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);			\
274}									\
275static inline int cfq_crq_##name(const struct cfq_rq *crq)		\
276{									\
277	return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;	\
278}
279
280CFQ_CRQ_FNS(is_sync);
281#undef CFQ_CRQ_FNS
282
283static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
284static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
285static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
286
287#define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
288
289/*
290 * lots of deadline iosched dupes, can be abstracted later...
291 */
292static inline void cfq_del_crq_hash(struct cfq_rq *crq)
293{
294	hlist_del_init(&crq->hash);
295}
296
297static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
298{
299	const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
300
301	hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
302}
303
304static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
305{
306	struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
307	struct hlist_node *entry, *next;
308
309	hlist_for_each_safe(entry, next, hash_list) {
310		struct cfq_rq *crq = list_entry_hash(entry);
311		struct request *__rq = crq->request;
312
313		if (!rq_mergeable(__rq)) {
314			cfq_del_crq_hash(crq);
315			continue;
316		}
317
318		if (rq_hash_key(__rq) == offset)
319			return __rq;
320	}
321
322	return NULL;
323}
324
325/*
326 * schedule a run of the queue if there are requests pending and nothing
327 * in the driver that will restart queueing for us
328 */
329static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
330{
331	if (cfqd->busy_queues)
332		kblockd_schedule_work(&cfqd->unplug_work);
333}
334
335static int cfq_queue_empty(request_queue_t *q)
336{
337	struct cfq_data *cfqd = q->elevator->elevator_data;
338
339	return !cfqd->busy_queues;
340}
341
342static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
343{
344	if (rw == READ || process_sync(task))
345		return task->pid;
346
347	return CFQ_KEY_ASYNC;
348}
349
350/*
351 * Lifted from AS - choose which of crq1 and crq2 is best served now.
352 * We choose the request that is closest to the head right now. Distance
353 * behind the head is penalized and only allowed to a certain extent.
354 */
355static struct cfq_rq *
356cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
357{
358	sector_t last, s1, s2, d1 = 0, d2 = 0;
359	unsigned long back_max;
360#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
361#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
362	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
363
364	if (crq1 == NULL || crq1 == crq2)
365		return crq2;
366	if (crq2 == NULL)
367		return crq1;
368
369	if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
370		return crq1;
371	else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
372		return crq2;
373
374	s1 = crq1->request->sector;
375	s2 = crq2->request->sector;
376
377	last = cfqd->last_sector;
378
379	/*
380	 * by definition, 1KiB is 2 sectors
381	 */
382	back_max = cfqd->cfq_back_max * 2;
383
384	/*
385	 * Strict one way elevator _except_ in the case where we allow
386	 * short backward seeks which are biased as twice the cost of a
387	 * similar forward seek.
388	 */
389	if (s1 >= last)
390		d1 = s1 - last;
391	else if (s1 + back_max >= last)
392		d1 = (last - s1) * cfqd->cfq_back_penalty;
393	else
394		wrap |= CFQ_RQ1_WRAP;
395
396	if (s2 >= last)
397		d2 = s2 - last;
398	else if (s2 + back_max >= last)
399		d2 = (last - s2) * cfqd->cfq_back_penalty;
400	else
401		wrap |= CFQ_RQ2_WRAP;
402
403	/* Found required data */
404
405	/*
406	 * By doing switch() on the bit mask "wrap" we avoid having to
407	 * check two variables for all permutations: --> faster!
408	 */
409	switch (wrap) {
410	case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
411		if (d1 < d2)
412			return crq1;
413		else if (d2 < d1)
414			return crq2;
415		else {
416			if (s1 >= s2)
417				return crq1;
418			else
419				return crq2;
420		}
421
422	case CFQ_RQ2_WRAP:
423		return crq1;
424	case CFQ_RQ1_WRAP:
425		return crq2;
426	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
427	default:
428		/*
429		 * Since both rqs are wrapped,
430		 * start with the one that's further behind head
431		 * (--> only *one* back seek required),
432		 * since back seek takes more time than forward.
433		 */
434		if (s1 <= s2)
435			return crq1;
436		else
437			return crq2;
438	}
439}
440
441/*
442 * would be nice to take fifo expire time into account as well
443 */
444static struct cfq_rq *
445cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
446		  struct cfq_rq *last)
447{
448	struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
449	struct rb_node *rbnext, *rbprev;
450
451	if (!(rbnext = rb_next(&last->rb_node))) {
452		rbnext = rb_first(&cfqq->sort_list);
453		if (rbnext == &last->rb_node)
454			rbnext = NULL;
455	}
456
457	rbprev = rb_prev(&last->rb_node);
458
459	if (rbprev)
460		crq_prev = rb_entry_crq(rbprev);
461	if (rbnext)
462		crq_next = rb_entry_crq(rbnext);
463
464	return cfq_choose_req(cfqd, crq_next, crq_prev);
465}
466
467static void cfq_update_next_crq(struct cfq_rq *crq)
468{
469	struct cfq_queue *cfqq = crq->cfq_queue;
470
471	if (cfqq->next_crq == crq)
472		cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
473}
474
475static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
476{
477	struct cfq_data *cfqd = cfqq->cfqd;
478	struct list_head *list, *entry;
479
480	BUG_ON(!cfq_cfqq_on_rr(cfqq));
481
482	list_del(&cfqq->cfq_list);
483
484	if (cfq_class_rt(cfqq))
485		list = &cfqd->cur_rr;
486	else if (cfq_class_idle(cfqq))
487		list = &cfqd->idle_rr;
488	else {
489		/*
490		 * if cfqq has requests in flight, don't allow it to be
491		 * found in cfq_set_active_queue before it has finished them.
492		 * this is done to increase fairness between a process that
493		 * has lots of io pending vs one that only issues a request
494		 * sporadically or synchronously
495		 */
496		if (cfq_cfqq_dispatched(cfqq))
497			list = &cfqd->busy_rr;
498		else
499			list = &cfqd->rr_list[cfqq->ioprio];
500	}
501
502	/*
503	 * if queue was preempted, just add to front to be fair. busy_rr
504	 * isn't sorted, so just insert it at the back there.
505	 */
506	if (preempted || list == &cfqd->busy_rr) {
507		if (preempted)
508			list = list->prev;
509
510		list_add_tail(&cfqq->cfq_list, list);
511		return;
512	}
513
514	/*
515	 * sort by when queue was last serviced
516	 */
517	entry = list;
518	while ((entry = entry->prev) != list) {
519		struct cfq_queue *__cfqq = list_entry_cfqq(entry);
520
521		if (!__cfqq->service_last)
522			break;
523		if (time_before(__cfqq->service_last, cfqq->service_last))
524			break;
525	}
526
527	list_add(&cfqq->cfq_list, entry);
528}
529
530/*
531 * add to busy list of queues for service, trying to be fair in ordering
532 * the pending list according to last request service
533 */
534static inline void
535cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
536{
537	BUG_ON(cfq_cfqq_on_rr(cfqq));
538	cfq_mark_cfqq_on_rr(cfqq);
539	cfqd->busy_queues++;
540
541	cfq_resort_rr_list(cfqq, 0);
542}
543
544static inline void
545cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
546{
547	BUG_ON(!cfq_cfqq_on_rr(cfqq));
548	cfq_clear_cfqq_on_rr(cfqq);
549	list_move(&cfqq->cfq_list, &cfqd->empty_list);
550
551	BUG_ON(!cfqd->busy_queues);
552	cfqd->busy_queues--;
553}
554
555/*
556 * rb tree support functions
557 */
558static inline void cfq_del_crq_rb(struct cfq_rq *crq)
559{
560	struct cfq_queue *cfqq = crq->cfq_queue;
561	struct cfq_data *cfqd = cfqq->cfqd;
562	const int sync = cfq_crq_is_sync(crq);
563
564	BUG_ON(!cfqq->queued[sync]);
565	cfqq->queued[sync]--;
566
567	cfq_update_next_crq(crq);
568
569	rb_erase(&crq->rb_node, &cfqq->sort_list);
570	RB_CLEAR_COLOR(&crq->rb_node);
571
572	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
573		cfq_del_cfqq_rr(cfqd, cfqq);
574}
575
576static struct cfq_rq *
577__cfq_add_crq_rb(struct cfq_rq *crq)
578{
579	struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
580	struct rb_node *parent = NULL;
581	struct cfq_rq *__crq;
582
583	while (*p) {
584		parent = *p;
585		__crq = rb_entry_crq(parent);
586
587		if (crq->rb_key < __crq->rb_key)
588			p = &(*p)->rb_left;
589		else if (crq->rb_key > __crq->rb_key)
590			p = &(*p)->rb_right;
591		else
592			return __crq;
593	}
594
595	rb_link_node(&crq->rb_node, parent, p);
596	return NULL;
597}
598
599static void cfq_add_crq_rb(struct cfq_rq *crq)
600{
601	struct cfq_queue *cfqq = crq->cfq_queue;
602	struct cfq_data *cfqd = cfqq->cfqd;
603	struct request *rq = crq->request;
604	struct cfq_rq *__alias;
605
606	crq->rb_key = rq_rb_key(rq);
607	cfqq->queued[cfq_crq_is_sync(crq)]++;
608
609	/*
610	 * looks a little odd, but the first insert might return an alias.
611	 * if that happens, put the alias on the dispatch list
612	 */
613	while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
614		cfq_dispatch_insert(cfqd->queue, __alias);
615
616	rb_insert_color(&crq->rb_node, &cfqq->sort_list);
617
618	if (!cfq_cfqq_on_rr(cfqq))
619		cfq_add_cfqq_rr(cfqd, cfqq);
620
621	/*
622	 * check if this request is a better next-serve candidate
623	 */
624	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
625}
626
627static inline void
628cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
629{
630	rb_erase(&crq->rb_node, &cfqq->sort_list);
631	cfqq->queued[cfq_crq_is_sync(crq)]--;
632
633	cfq_add_crq_rb(crq);
634}
635
636static struct request *
637cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
638{
639	struct task_struct *tsk = current;
640	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
641	struct cfq_queue *cfqq;
642	struct rb_node *n;
643	sector_t sector;
644
645	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
646	if (!cfqq)
647		goto out;
648
649	sector = bio->bi_sector + bio_sectors(bio);
650	n = cfqq->sort_list.rb_node;
651	while (n) {
652		struct cfq_rq *crq = rb_entry_crq(n);
653
654		if (sector < crq->rb_key)
655			n = n->rb_left;
656		else if (sector > crq->rb_key)
657			n = n->rb_right;
658		else
659			return crq->request;
660	}
661
662out:
663	return NULL;
664}
665
666static void cfq_activate_request(request_queue_t *q, struct request *rq)
667{
668	struct cfq_data *cfqd = q->elevator->elevator_data;
669
670	cfqd->rq_in_driver++;
671
672	/*
673	 * If the depth is larger than 1, it really could be queueing. But let's
674	 * make the mark a little higher - idling could still be good for
675	 * low queueing, and a low queueing number could also just indicate
676	 * a SCSI-mid-layer-like behaviour where limit+1 is often seen.
677	 */
678	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
679		cfqd->hw_tag = 1;
680}
681
682static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
683{
684	struct cfq_data *cfqd = q->elevator->elevator_data;
685
686	WARN_ON(!cfqd->rq_in_driver);
687	cfqd->rq_in_driver--;
688}
689
690static void cfq_remove_request(struct request *rq)
691{
692	struct cfq_rq *crq = RQ_DATA(rq);
693
694	list_del_init(&rq->queuelist);
695	cfq_del_crq_rb(crq);
696	cfq_del_crq_hash(crq);
697}
698
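/*
 * see if bio can be back merged with an existing request (found via the
 * crq hash, keyed on end sector) or front merged (via the cfqq sort tree)
 */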
699static int
700cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
701{
702	struct cfq_data *cfqd = q->elevator->elevator_data;
703	struct request *__rq;
704	int ret;
705
706	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
707	if (__rq && elv_rq_merge_ok(__rq, bio)) {
708		ret = ELEVATOR_BACK_MERGE;
709		goto out;
710	}
711
712	__rq = cfq_find_rq_fmerge(cfqd, bio);
713	if (__rq && elv_rq_merge_ok(__rq, bio)) {
714		ret = ELEVATOR_FRONT_MERGE;
715		goto out;
716	}
717
718	return ELEVATOR_NO_MERGE;
719out:
720	*req = __rq;
721	return ret;
722}
723
724static void cfq_merged_request(request_queue_t *q, struct request *req)
725{
726	struct cfq_data *cfqd = q->elevator->elevator_data;
727	struct cfq_rq *crq = RQ_DATA(req);
728
729	cfq_del_crq_hash(crq);
730	cfq_add_crq_hash(cfqd, crq);
731
732	if (rq_rb_key(req) != crq->rb_key) {
733		struct cfq_queue *cfqq = crq->cfq_queue;
734
735		cfq_update_next_crq(crq);
736		cfq_reposition_crq_rb(cfqq, crq);
737	}
738}
739
740static void
741cfq_merged_requests(request_queue_t *q, struct request *rq,
742		    struct request *next)
743{
744	cfq_merged_request(q, rq);
745
746	/*
747	 * reposition in fifo if next is older than rq
748	 */
749	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
750	    time_before(next->start_time, rq->start_time))
751		list_move(&rq->queuelist, &next->queuelist);
752
753	cfq_remove_request(next);
754}
755
756static inline void
757__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
758{
759	if (cfqq) {
760		/*
761		 * stop potential idle class queues waiting service
762		 */
763		del_timer(&cfqd->idle_class_timer);
764
765		cfqq->slice_start = jiffies;
766		cfqq->slice_end = 0;
767		cfqq->slice_left = 0;
768		cfq_clear_cfqq_must_alloc_slice(cfqq);
769		cfq_clear_cfqq_fifo_expire(cfqq);
770	}
771
772	cfqd->active_queue = cfqq;
773}
774
775/*
776 * current cfqq expired its slice (or was too idle), select new one
777 */
778static void
779__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
780		    int preempted)
781{
782	unsigned long now = jiffies;
783
784	if (cfq_cfqq_wait_request(cfqq))
785		del_timer(&cfqd->idle_slice_timer);
786
787	if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
788		cfqq->service_last = now;
789		cfq_schedule_dispatch(cfqd);
790	}
791
792	cfq_clear_cfqq_must_dispatch(cfqq);
793	cfq_clear_cfqq_wait_request(cfqq);
794
795	/*
796	 * store what was left of this slice, if the queue idled out
797	 * or was preempted
798	 */
799	if (time_after(cfqq->slice_end, now))
800		cfqq->slice_left = cfqq->slice_end - now;
801	else
802		cfqq->slice_left = 0;
803
804	if (cfq_cfqq_on_rr(cfqq))
805		cfq_resort_rr_list(cfqq, preempted);
806
807	if (cfqq == cfqd->active_queue)
808		cfqd->active_queue = NULL;
809
810	if (cfqd->active_cic) {
811		put_io_context(cfqd->active_cic->ioc);
812		cfqd->active_cic = NULL;
813	}
814
815	cfqd->dispatch_slice = 0;
816}
817
818static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
819{
820	struct cfq_queue *cfqq = cfqd->active_queue;
821
822	if (cfqq)
823		__cfq_slice_expired(cfqd, cfqq, preempted);
824}
825
826/*
827 * 0
828 * 0,1
829 * 0,1,2
830 * 0,1,2,3
831 * 0,1,2,3,4
832 * 0,1,2,3,4,5
833 * 0,1,2,3,4,5,6
834 * 0,1,2,3,4,5,6,7
835 */
836static int cfq_get_next_prio_level(struct cfq_data *cfqd)
837{
838	int prio, wrap;
839
840	prio = -1;
841	wrap = 0;
842	do {
843		int p;
844
845		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
846			if (!list_empty(&cfqd->rr_list[p])) {
847				prio = p;
848				break;
849			}
850		}
851
852		if (prio != -1)
853			break;
854		cfqd->cur_prio = 0;
855		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
856			cfqd->cur_end_prio = 0;
857			if (wrap)
858				break;
859			wrap = 1;
860		}
861	} while (1);
862
863	if (unlikely(prio == -1))
864		return -1;
865
866	BUG_ON(prio >= CFQ_PRIO_LISTS);
867
868	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
869
870	cfqd->cur_prio = prio + 1;
871	if (cfqd->cur_prio > cfqd->cur_end_prio) {
872		cfqd->cur_end_prio = cfqd->cur_prio;
873		cfqd->cur_prio = 0;
874	}
875	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
876		cfqd->cur_prio = 0;
877		cfqd->cur_end_prio = 0;
878	}
879
880	return prio;
881}
882
883static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
884{
885	struct cfq_queue *cfqq = NULL;
886
887	/*
888	 * if current list is non-empty, grab first entry. if it is empty,
889	 * get the next prio level and grab the first entry, if any were spliced
890	 */
891	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
892		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
893
894	/*
895	 * If no new queues are available, check if the busy list has some
896	 * before falling back to idle io.
897	 */
898	if (!cfqq && !list_empty(&cfqd->busy_rr))
899		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
900
901	/*
902	 * if we have idle queues and no rt or be queues had pending
903	 * requests, either allow immediate service if the grace period
904	 * has passed or arm the idle grace timer
905	 */
906	if (!cfqq && !list_empty(&cfqd->idle_rr)) {
907		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
908
909		if (time_after_eq(jiffies, end))
910			cfqq = list_entry_cfqq(cfqd->idle_rr.next);
911		else
912			mod_timer(&cfqd->idle_class_timer, end);
913	}
914
915	__cfq_set_active_queue(cfqd, cfqq);
916	return cfqq;
917}
918
919static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
921{
922	struct cfq_io_context *cic;
923	unsigned long sl;
924
925	WARN_ON(!RB_EMPTY(&cfqq->sort_list));
926	WARN_ON(cfqq != cfqd->active_queue);
927
928	/*
929	 * idle is disabled, either manually or by past process history
930	 */
931	if (!cfqd->cfq_slice_idle)
932		return 0;
933	if (!cfq_cfqq_idle_window(cfqq))
934		return 0;
935	/*
936	 * task has exited, don't wait
937	 */
938	cic = cfqd->active_cic;
939	if (!cic || !cic->ioc->task)
940		return 0;
941
942	cfq_mark_cfqq_must_dispatch(cfqq);
943	cfq_mark_cfqq_wait_request(cfqq);
944
945	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
946
947	/*
948	 * we don't want to idle for seeks, but we do want to allow
949	 * fair distribution of slice time for a process doing back-to-back
950	 * seeks. so allow a little bit of time for it to submit a new rq
951	 */
952	if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072)
953		sl = 2;
954
955	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
956	return 1;
957}
958
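/*
 * move a request from the cfqq sort tree / fifo onto the device dispatch
 * list, and account it as in flight for the queue
 */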
959static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
960{
961	struct cfq_data *cfqd = q->elevator->elevator_data;
962	struct cfq_queue *cfqq = crq->cfq_queue;
963
964	cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
965	cfq_remove_request(crq->request);
966	cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
967	elv_dispatch_sort(q, crq->request);
968}
969
970/*
971 * return expired entry, or NULL to just start from scratch in rbtree
972 */
973static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
974{
975	struct cfq_data *cfqd = cfqq->cfqd;
976	struct request *rq;
977	struct cfq_rq *crq;
978
979	if (cfq_cfqq_fifo_expire(cfqq))
980		return NULL;
981
982	if (!list_empty(&cfqq->fifo)) {
983		int fifo = cfq_cfqq_class_sync(cfqq);
984
985		crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
986		rq = crq->request;
987		if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
988			cfq_mark_cfqq_fifo_expire(cfqq);
989			return crq;
990		}
991	}
992
993	return NULL;
994}
995
996/*
997 * Scale schedule slice based on io priority. Use the sync time slice only
998 * if a queue is marked sync and has sync io queued. A sync queue with async
999 * io only should not get the full sync slice length.
1000 */
1001static inline int
1002cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1003{
1004	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
1005
1006	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1007
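	/*
	 * each step of higher io priority (lower ->ioprio value) lengthens
	 * the slice by base_slice / CFQ_SLICE_SCALE; lower priority shortens
	 * it by the same amount
	 */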
1008	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
1009}
1010
1011static inline void
1012cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1013{
1014	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
1015}
1016
1017static inline int
1018cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1019{
1020	const int base_rq = cfqd->cfq_slice_async_rq;
1021
1022	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1023
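	/*
	 * scale the async request limit the same way: higher priority queues
	 * may dispatch more requests before their slice is ended
	 */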
1024	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1025}
1026
1027/*
1028 * get next queue for service
1029 */
1030static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1031{
1032	unsigned long now = jiffies;
1033	struct cfq_queue *cfqq;
1034
1035	cfqq = cfqd->active_queue;
1036	if (!cfqq)
1037		goto new_queue;
1038
1039	/*
1040	 * slice has expired
1041	 */
1042	if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1043		goto expire;
1044
1045	/*
1046	 * if queue has requests, dispatch one. if not, check if
1047	 * enough slice is left to wait for one
1048	 */
1049	if (!RB_EMPTY(&cfqq->sort_list))
1050		goto keep_queue;
1051	else if (cfq_cfqq_class_sync(cfqq) &&
1052		 time_before(now, cfqq->slice_end)) {
1053		if (cfq_arm_slice_timer(cfqd, cfqq))
1054			return NULL;
1055	}
1056
1057expire:
1058	cfq_slice_expired(cfqd, 0);
1059new_queue:
1060	cfqq = cfq_set_active_queue(cfqd);
1061keep_queue:
1062	return cfqq;
1063}
1064
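/*
 * dispatch up to max_dispatch requests from cfqq, preferring a fifo-expired
 * request over the sector-sorted next_crq when one is due
 */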
1065static int
1066__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1067			int max_dispatch)
1068{
1069	int dispatched = 0;
1070
1071	BUG_ON(RB_EMPTY(&cfqq->sort_list));
1072
1073	do {
1074		struct cfq_rq *crq;
1075
1076		/*
1077		 * follow expired path, else get first next available
1078		 */
1079		if ((crq = cfq_check_fifo(cfqq)) == NULL)
1080			crq = cfqq->next_crq;
1081
1082		/*
1083		 * finally, insert request into driver dispatch list
1084		 */
1085		cfq_dispatch_insert(cfqd->queue, crq);
1086
1087		cfqd->dispatch_slice++;
1088		dispatched++;
1089
1090		if (!cfqd->active_cic) {
1091			atomic_inc(&crq->io_context->ioc->refcount);
1092			cfqd->active_cic = crq->io_context;
1093		}
1094
1095		if (RB_EMPTY(&cfqq->sort_list))
1096			break;
1097
1098	} while (dispatched < max_dispatch);
1099
1100	/*
1101	 * if slice end isn't set yet, set it. if at least one request was
1102	 * sync, use the sync time slice value
1103	 */
1104	if (!cfqq->slice_end)
1105		cfq_set_prio_slice(cfqd, cfqq);
1106
1107	/*
1108	 * expire an async queue immediately if it has used up its slice. idle
1109	 * queues always expire after 1 dispatch round.
1110	 */
1111	if ((!cfq_cfqq_sync(cfqq) &&
1112	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1113	    cfq_class_idle(cfqq))
1114		cfq_slice_expired(cfqd, 0);
1115
1116	return dispatched;
1117}
1118
1119static int
1120cfq_forced_dispatch_cfqqs(struct list_head *list)
1121{
1122	int dispatched = 0;
1123	struct cfq_queue *cfqq, *next;
1124	struct cfq_rq *crq;
1125
1126	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1127		while ((crq = cfqq->next_crq)) {
1128			cfq_dispatch_insert(cfqq->cfqd->queue, crq);
1129			dispatched++;
1130		}
1131		BUG_ON(!list_empty(&cfqq->fifo));
1132	}
1133	return dispatched;
1134}
1135
1136static int
1137cfq_forced_dispatch(struct cfq_data *cfqd)
1138{
1139	int i, dispatched = 0;
1140
1141	for (i = 0; i < CFQ_PRIO_LISTS; i++)
1142		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1143
1144	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1145	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1146	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1147
1148	cfq_slice_expired(cfqd, 0);
1149
1150	BUG_ON(cfqd->busy_queues);
1151
1152	return dispatched;
1153}
1154
1155static int
1156cfq_dispatch_requests(request_queue_t *q, int force)
1157{
1158	struct cfq_data *cfqd = q->elevator->elevator_data;
1159	struct cfq_queue *cfqq;
1160
1161	if (!cfqd->busy_queues)
1162		return 0;
1163
1164	if (unlikely(force))
1165		return cfq_forced_dispatch(cfqd);
1166
1167	cfqq = cfq_select_queue(cfqd);
1168	if (cfqq) {
1169		int max_dispatch;
1170
1171		cfq_clear_cfqq_must_dispatch(cfqq);
1172		cfq_clear_cfqq_wait_request(cfqq);
1173		del_timer(&cfqd->idle_slice_timer);
1174
1175		max_dispatch = cfqd->cfq_quantum;
1176		if (cfq_class_idle(cfqq))
1177			max_dispatch = 1;
1178
1179		return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1180	}
1181
1182	return 0;
1183}
1184
1185/*
1186 * task holds one reference to the queue, dropped when task exits. each crq
1187 * in-flight on this queue also holds a reference, dropped when crq is freed.
1188 *
1189 * queue lock must be held here.
1190 */
1191static void cfq_put_queue(struct cfq_queue *cfqq)
1192{
1193	struct cfq_data *cfqd = cfqq->cfqd;
1194
1195	BUG_ON(atomic_read(&cfqq->ref) <= 0);
1196
1197	if (!atomic_dec_and_test(&cfqq->ref))
1198		return;
1199
1200	BUG_ON(rb_first(&cfqq->sort_list));
1201	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1202	BUG_ON(cfq_cfqq_on_rr(cfqq));
1203
1204	if (unlikely(cfqd->active_queue == cfqq))
1205		__cfq_slice_expired(cfqd, cfqq, 0);
1206
1207	/*
1208	 * it's on the empty list and still hashed
1209	 */
1210	list_del(&cfqq->cfq_list);
1211	hlist_del(&cfqq->cfq_hash);
1212	kmem_cache_free(cfq_pool, cfqq);
1213}
1214
1215static inline struct cfq_queue *
1216__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1217		    const int hashval)
1218{
1219	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1220	struct hlist_node *entry;
1221	struct cfq_queue *__cfqq;
1222
1223	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1224		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1225
1226		if (__cfqq->key == key && (__p == prio || !prio))
1227			return __cfqq;
1228	}
1229
1230	return NULL;
1231}
1232
1233static struct cfq_queue *
1234cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1235{
1236	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1237}
1238
1239static void cfq_free_io_context(struct io_context *ioc)
1240{
1241	struct cfq_io_context *__cic;
1242	struct rb_node *n;
1243	int freed = 0;
1244
1245	while ((n = rb_first(&ioc->cic_root)) != NULL) {
1246		__cic = rb_entry(n, struct cfq_io_context, rb_node);
1247		rb_erase(&__cic->rb_node, &ioc->cic_root);
1248		kmem_cache_free(cfq_ioc_pool, __cic);
1249		freed++;
1250	}
1251
1252	if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
1253		complete(ioc_gone);
1254}
1255
1256static void cfq_trim(struct io_context *ioc)
1257{
1258	ioc->set_ioprio = NULL;
1259	cfq_free_io_context(ioc);
1260}
1261
1262/*
1263 * Called with interrupts disabled
1264 */
1265static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1266{
1267	struct cfq_data *cfqd = cic->key;
1268	request_queue_t *q;
1269
1270	if (!cfqd)
1271		return;
1272
1273	q = cfqd->queue;
1274
1275	WARN_ON(!irqs_disabled());
1276
1277	spin_lock(q->queue_lock);
1278
1279	if (cic->cfqq[ASYNC]) {
1280		if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1281			__cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1282		cfq_put_queue(cic->cfqq[ASYNC]);
1283		cic->cfqq[ASYNC] = NULL;
1284	}
1285
1286	if (cic->cfqq[SYNC]) {
1287		if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1288			__cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1289		cfq_put_queue(cic->cfqq[SYNC]);
1290		cic->cfqq[SYNC] = NULL;
1291	}
1292
1293	cic->key = NULL;
1294	list_del_init(&cic->queue_list);
1295	spin_unlock(q->queue_lock);
1296}
1297
1298static void cfq_exit_io_context(struct io_context *ioc)
1299{
1300	struct cfq_io_context *__cic;
1301	unsigned long flags;
1302	struct rb_node *n;
1303
1304	/*
1305	 * put the reference this task is holding to the various queues
1306	 */
1307	spin_lock_irqsave(&cfq_exit_lock, flags);
1308
1309	n = rb_first(&ioc->cic_root);
1310	while (n != NULL) {
1311		__cic = rb_entry(n, struct cfq_io_context, rb_node);
1312
1313		cfq_exit_single_io_context(__cic);
1314		n = rb_next(n);
1315	}
1316
1317	spin_unlock_irqrestore(&cfq_exit_lock, flags);
1318}
1319
1320static struct cfq_io_context *
1321cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1322{
1323	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1324
1325	if (cic) {
1326		RB_CLEAR(&cic->rb_node);
1327		cic->key = NULL;
1328		cic->cfqq[ASYNC] = NULL;
1329		cic->cfqq[SYNC] = NULL;
1330		cic->last_end_request = jiffies;
1331		cic->ttime_total = 0;
1332		cic->ttime_samples = 0;
1333		cic->ttime_mean = 0;
1334		cic->dtor = cfq_free_io_context;
1335		cic->exit = cfq_exit_io_context;
1336		INIT_LIST_HEAD(&cic->queue_list);
1337		atomic_inc(&ioc_count);
1338	}
1339
1340	return cic;
1341}
1342
1343static void cfq_init_prio_data(struct cfq_queue *cfqq)
1344{
1345	struct task_struct *tsk = current;
1346	int ioprio_class;
1347
1348	if (!cfq_cfqq_prio_changed(cfqq))
1349		return;
1350
1351	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1352	switch (ioprio_class) {
1353		default:
1354			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
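			/* fall through to the no-prio case */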
1355		case IOPRIO_CLASS_NONE:
1356			/*
1357			 * no prio set, place us in the middle of the BE classes
1358			 */
1359			cfqq->ioprio = task_nice_ioprio(tsk);
1360			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1361			break;
1362		case IOPRIO_CLASS_RT:
1363			cfqq->ioprio = task_ioprio(tsk);
1364			cfqq->ioprio_class = IOPRIO_CLASS_RT;
1365			break;
1366		case IOPRIO_CLASS_BE:
1367			cfqq->ioprio = task_ioprio(tsk);
1368			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1369			break;
1370		case IOPRIO_CLASS_IDLE:
1371			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1372			cfqq->ioprio = 7;
1373			cfq_clear_cfqq_idle_window(cfqq);
1374			break;
1375	}
1376
1377	/*
1378	 * keep track of original prio settings in case we have to temporarily
1379	 * elevate the priority of this queue
1380	 */
1381	cfqq->org_ioprio = cfqq->ioprio;
1382	cfqq->org_ioprio_class = cfqq->ioprio_class;
1383
1384	if (cfq_cfqq_on_rr(cfqq))
1385		cfq_resort_rr_list(cfqq, 0);
1386
1387	cfq_clear_cfqq_prio_changed(cfqq);
1388}
1389
1390static inline void changed_ioprio(struct cfq_io_context *cic)
1391{
1392	struct cfq_data *cfqd = cic->key;
1393	struct cfq_queue *cfqq;
1394	if (cfqd) {
1395		spin_lock(cfqd->queue->queue_lock);
1396		cfqq = cic->cfqq[ASYNC];
1397		if (cfqq) {
1398			struct cfq_queue *new_cfqq;
1399			new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC,
1400						cic->ioc->task, GFP_ATOMIC);
1401			if (new_cfqq) {
1402				cic->cfqq[ASYNC] = new_cfqq;
1403				cfq_put_queue(cfqq);
1404			}
1405		}
1406		cfqq = cic->cfqq[SYNC];
1407		if (cfqq) {
1408			cfq_mark_cfqq_prio_changed(cfqq);
1409			cfq_init_prio_data(cfqq);
1410		}
1411		spin_unlock(cfqd->queue->queue_lock);
1412	}
1413}
1414
1415/*
1416 * callback from sys_ioprio_set, irqs are disabled
1417 */
1418static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1419{
1420	struct cfq_io_context *cic;
1421	struct rb_node *n;
1422
1423	spin_lock(&cfq_exit_lock);
1424
1425	n = rb_first(&ioc->cic_root);
1426	while (n != NULL) {
1427		cic = rb_entry(n, struct cfq_io_context, rb_node);
1428
1429		changed_ioprio(cic);
1430		n = rb_next(n);
1431	}
1432
1433	spin_unlock(&cfq_exit_lock);
1434
1435	return 0;
1436}
1437
1438static struct cfq_queue *
1439cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1440	      gfp_t gfp_mask)
1441{
1442	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1443	struct cfq_queue *cfqq, *new_cfqq = NULL;
1444	unsigned short ioprio;
1445
1446retry:
1447	ioprio = tsk->ioprio;
1448	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1449
1450	if (!cfqq) {
1451		if (new_cfqq) {
1452			cfqq = new_cfqq;
1453			new_cfqq = NULL;
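			/*
			 * drop the queue lock for a blocking allocation, then
			 * redo the lookup in case someone else set up the
			 * cfqq while we slept
			 */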
1454		} else if (gfp_mask & __GFP_WAIT) {
1455			spin_unlock_irq(cfqd->queue->queue_lock);
1456			new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1457			spin_lock_irq(cfqd->queue->queue_lock);
1458			goto retry;
1459		} else {
1460			cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1461			if (!cfqq)
1462				goto out;
1463		}
1464
1465		memset(cfqq, 0, sizeof(*cfqq));
1466
1467		INIT_HLIST_NODE(&cfqq->cfq_hash);
1468		INIT_LIST_HEAD(&cfqq->cfq_list);
1469		RB_CLEAR_ROOT(&cfqq->sort_list);
1470		INIT_LIST_HEAD(&cfqq->fifo);
1471
1472		cfqq->key = key;
1473		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1474		atomic_set(&cfqq->ref, 0);
1475		cfqq->cfqd = cfqd;
1476		cfqq->service_last = 0;
1477		/*
1478		 * set ->slice_left to allow preemption for a new process
1479		 */
1480		cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1481		if (!cfqd->hw_tag)
1482			cfq_mark_cfqq_idle_window(cfqq);
1483		cfq_mark_cfqq_prio_changed(cfqq);
1484		cfq_init_prio_data(cfqq);
1485	}
1486
1487	if (new_cfqq)
1488		kmem_cache_free(cfq_pool, new_cfqq);
1489
1490	atomic_inc(&cfqq->ref);
1491out:
1492	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1493	return cfqq;
1494}
1495
1496static void
1497cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1498{
1499	spin_lock(&cfq_exit_lock);
1500	rb_erase(&cic->rb_node, &ioc->cic_root);
1501	list_del_init(&cic->queue_list);
1502	spin_unlock(&cfq_exit_lock);
1503	kmem_cache_free(cfq_ioc_pool, cic);
1504	atomic_dec(&ioc_count);
1505}
1506
1507static struct cfq_io_context *
1508cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1509{
1510	struct rb_node *n;
1511	struct cfq_io_context *cic;
1512	void *k, *key = cfqd;
1513
1514restart:
1515	n = ioc->cic_root.rb_node;
1516	while (n) {
1517		cic = rb_entry(n, struct cfq_io_context, rb_node);
1518		/* ->key must be copied to avoid race with cfq_exit_queue() */
1519		k = cic->key;
1520		if (unlikely(!k)) {
1521			cfq_drop_dead_cic(ioc, cic);
1522			goto restart;
1523		}
1524
1525		if (key < k)
1526			n = n->rb_left;
1527		else if (key > k)
1528			n = n->rb_right;
1529		else
1530			return cic;
1531	}
1532
1533	return NULL;
1534}
1535
1536static inline void
1537cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1538	     struct cfq_io_context *cic)
1539{
1540	struct rb_node **p;
1541	struct rb_node *parent;
1542	struct cfq_io_context *__cic;
1543	void *k;
1544
1545	cic->ioc = ioc;
1546	cic->key = cfqd;
1547
1548	ioc->set_ioprio = cfq_ioc_set_ioprio;
1549restart:
1550	parent = NULL;
1551	p = &ioc->cic_root.rb_node;
1552	while (*p) {
1553		parent = *p;
1554		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
1555		/* ->key must be copied to avoid race with cfq_exit_queue() */
1556		k = __cic->key;
1557		if (unlikely(!k)) {
1558			cfq_drop_dead_cic(ioc, cic);
1559			goto restart;
1560		}
1561
1562		if (cic->key < k)
1563			p = &(*p)->rb_left;
1564		else if (cic->key > k)
1565			p = &(*p)->rb_right;
1566		else
1567			BUG();
1568	}
1569
1570	spin_lock(&cfq_exit_lock);
1571	rb_link_node(&cic->rb_node, parent, p);
1572	rb_insert_color(&cic->rb_node, &ioc->cic_root);
1573	list_add(&cic->queue_list, &cfqd->cic_list);
1574	spin_unlock(&cfq_exit_lock);
1575}
1576
1577/*
1578 * Setup general io context and cfq io context. There can be several cfq
1579 * io contexts per general io context, if this process is doing io to more
1580 * than one device managed by cfq.
1581 */
1582static struct cfq_io_context *
1583cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1584{
1585	struct io_context *ioc = NULL;
1586	struct cfq_io_context *cic;
1587
1588	might_sleep_if(gfp_mask & __GFP_WAIT);
1589
1590	ioc = get_io_context(gfp_mask);
1591	if (!ioc)
1592		return NULL;
1593
1594	cic = cfq_cic_rb_lookup(cfqd, ioc);
1595	if (cic)
1596		goto out;
1597
1598	cic = cfq_alloc_io_context(cfqd, gfp_mask);
1599	if (cic == NULL)
1600		goto err;
1601
1602	cfq_cic_link(cfqd, ioc, cic);
1603out:
1604	return cic;
1605err:
1606	put_io_context(ioc);
1607	return NULL;
1608}
1609
1610static void
1611cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1612{
1613	unsigned long elapsed, ttime;
1614
1615	/*
1616	 * if this context already has stuff queued, thinktime is from
1617	 * last queue not last end
1618	 */
1619#if 0
1620	if (time_after(cic->last_end_request, cic->last_queue))
1621		elapsed = jiffies - cic->last_end_request;
1622	else
1623		elapsed = jiffies - cic->last_queue;
1624#else
1625		elapsed = jiffies - cic->last_end_request;
1626#endif
1627
1628	ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1629
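	/*
	 * exponentially decaying averages: each new sample carries 1/8 of the
	 * weight, so ttime_samples converges towards 256 (see sample_valid())
	 */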
1630	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1631	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1632	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1633}
1634
1635static void
1636cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1637		       struct cfq_rq *crq)
1638{
1639	sector_t sdist;
1640	u64 total;
1641
1642	if (cic->last_request_pos < crq->request->sector)
1643		sdist = crq->request->sector - cic->last_request_pos;
1644	else
1645		sdist = cic->last_request_pos - crq->request->sector;
1646
1647	/*
1648	 * Don't allow the seek distance to get too large from the
1649	 * odd fragment, pagein, etc
1650	 */
1651	if (cic->seek_samples <= 60) /* second&third seek */
1652		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1653	else
1654		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
1655
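	/* same 1/8-weight decaying average as the thinktime stats above */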
1656	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1657	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1658	total = cic->seek_total + (cic->seek_samples/2);
1659	do_div(total, cic->seek_samples);
1660	cic->seek_mean = (sector_t)total;
1661}
1662
1663/*
1664 * Disable idle window if the process thinks too long or seeks so much that
1665 * it doesn't matter
1666 */
1667static void
1668cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1669		       struct cfq_io_context *cic)
1670{
1671	int enable_idle = cfq_cfqq_idle_window(cfqq);
1672
1673	if (!cic->ioc->task || !cfqd->cfq_slice_idle || cfqd->hw_tag)
1674		enable_idle = 0;
1675	else if (sample_valid(cic->ttime_samples)) {
1676		if (cic->ttime_mean > cfqd->cfq_slice_idle)
1677			enable_idle = 0;
1678		else
1679			enable_idle = 1;
1680	}
1681
1682	if (enable_idle)
1683		cfq_mark_cfqq_idle_window(cfqq);
1684	else
1685		cfq_clear_cfqq_idle_window(cfqq);
1686}
1687
1688
1689/*
1690 * Check if new_cfqq should preempt the currently active queue. Return 0 if
1691 * not, or if we aren't sure; a return of 1 will cause a preempt.
1692 */
1693static int
1694cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1695		   struct cfq_rq *crq)
1696{
1697	struct cfq_queue *cfqq = cfqd->active_queue;
1698
1699	if (cfq_class_idle(new_cfqq))
1700		return 0;
1701
1702	if (!cfqq)
1703		return 1;
1704
1705	if (cfq_class_idle(cfqq))
1706		return 1;
1707	if (!cfq_cfqq_wait_request(new_cfqq))
1708		return 0;
1709	/*
1710	 * if it doesn't have slice left, forget it
1711	 */
1712	if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1713		return 0;
1714	if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1715		return 1;
1716
1717	return 0;
1718}
1719
1720/*
1721 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1722 * let it have half of its nominal slice.
1723 */
1724static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1725{
1726	struct cfq_queue *__cfqq, *next;
1727
1728	list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1729		cfq_resort_rr_list(__cfqq, 1);
1730
1731	if (!cfqq->slice_left)
1732		cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1733
1734	cfqq->slice_end = cfqq->slice_left + jiffies;
1735	__cfq_slice_expired(cfqd, cfqq, 1);
1736	__cfq_set_active_queue(cfqd, cfqq);
1737}
1738
1739/*
1740 * should really be a ll_rw_blk.c helper
1741 */
1742static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1743{
1744	request_queue_t *q = cfqd->queue;
1745
1746	if (!blk_queue_plugged(q))
1747		q->request_fn(q);
1748	else
1749		__generic_unplug_device(q);
1750}
1751
1752/*
1753 * Called when a new fs request (crq) is added (to cfqq). Check if there's
1754 * something we should do about it
1755 */
1756static void
1757cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1758		 struct cfq_rq *crq)
1759{
1760	struct cfq_io_context *cic;
1761
1762	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1763
1764	cic = crq->io_context;
1765
1766	/*
1767	 * we never wait for an async request and we don't allow preemption
1768	 * of an async request. so just return early
1769	 */
1770	if (!cfq_crq_is_sync(crq)) {
1771		/*
1772		 * sync process issued an async request, if it's waiting
1773		 * then expire it and kick rq handling.
1774		 */
1775		if (cic == cfqd->active_cic &&
1776		    del_timer(&cfqd->idle_slice_timer)) {
1777			cfq_slice_expired(cfqd, 0);
1778			cfq_start_queueing(cfqd, cfqq);
1779		}
1780		return;
1781	}
1782
1783	cfq_update_io_thinktime(cfqd, cic);
1784	cfq_update_io_seektime(cfqd, cic, crq);
1785	cfq_update_idle_window(cfqd, cfqq, cic);
1786
1787	cic->last_queue = jiffies;
1788	cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
1789
1790	if (cfqq == cfqd->active_queue) {
1791		/*
1792		 * if we are waiting for a request for this queue, let it rip
1793		 * immediately and flag that we must not expire this queue
1794		 * just now
1795		 */
1796		if (cfq_cfqq_wait_request(cfqq)) {
1797			cfq_mark_cfqq_must_dispatch(cfqq);
1798			del_timer(&cfqd->idle_slice_timer);
1799			cfq_start_queueing(cfqd, cfqq);
1800		}
1801	} else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1802		/*
1803		 * not the active queue - expire current slice if it is
1804		 * idle and has expired its mean thinktime, or this new queue
1805		 * has some old slice time left and is of higher priority
1806		 */
1807		cfq_preempt_queue(cfqd, cfqq);
1808		cfq_mark_cfqq_must_dispatch(cfqq);
1809		cfq_start_queueing(cfqd, cfqq);
1810	}
1811}
1812
1813static void cfq_insert_request(request_queue_t *q, struct request *rq)
1814{
1815	struct cfq_data *cfqd = q->elevator->elevator_data;
1816	struct cfq_rq *crq = RQ_DATA(rq);
1817	struct cfq_queue *cfqq = crq->cfq_queue;
1818
1819	cfq_init_prio_data(cfqq);
1820
1821	cfq_add_crq_rb(crq);
1822
1823	list_add_tail(&rq->queuelist, &cfqq->fifo);
1824
1825	if (rq_mergeable(rq))
1826		cfq_add_crq_hash(cfqd, crq);
1827
1828	cfq_crq_enqueued(cfqd, cfqq, crq);
1829}
1830
1831static void cfq_completed_request(request_queue_t *q, struct request *rq)
1832{
1833	struct cfq_rq *crq = RQ_DATA(rq);
1834	struct cfq_queue *cfqq = crq->cfq_queue;
1835	struct cfq_data *cfqd = cfqq->cfqd;
1836	const int sync = cfq_crq_is_sync(crq);
1837	unsigned long now;
1838
1839	now = jiffies;
1840
1841	WARN_ON(!cfqd->rq_in_driver);
1842	WARN_ON(!cfqq->on_dispatch[sync]);
1843	cfqd->rq_in_driver--;
1844	cfqq->on_dispatch[sync]--;
1845
1846	if (!cfq_class_idle(cfqq))
1847		cfqd->last_end_request = now;
1848
1849	if (!cfq_cfqq_dispatched(cfqq)) {
1850		if (cfq_cfqq_on_rr(cfqq)) {
1851			cfqq->service_last = now;
1852			cfq_resort_rr_list(cfqq, 0);
1853		}
1854		cfq_schedule_dispatch(cfqd);
1855	}
1856
1857	if (cfq_crq_is_sync(crq))
1858		crq->io_context->last_end_request = now;
1859}
1860
1861static struct request *
1862cfq_former_request(request_queue_t *q, struct request *rq)
1863{
1864	struct cfq_rq *crq = RQ_DATA(rq);
1865	struct rb_node *rbprev = rb_prev(&crq->rb_node);
1866
1867	if (rbprev)
1868		return rb_entry_crq(rbprev)->request;
1869
1870	return NULL;
1871}
1872
1873static struct request *
1874cfq_latter_request(request_queue_t *q, struct request *rq)
1875{
1876	struct cfq_rq *crq = RQ_DATA(rq);
1877	struct rb_node *rbnext = rb_next(&crq->rb_node);
1878
1879	if (rbnext)
1880		return rb_entry_crq(rbnext)->request;
1881
1882	return NULL;
1883}
1884
1885/*
1886 * we temporarily boost lower priority queues if they are holding fs exclusive
1887 * resources. they are boosted to normal prio (CLASS_BE/4)
1888 */
1889static void cfq_prio_boost(struct cfq_queue *cfqq)
1890{
1891	const int ioprio_class = cfqq->ioprio_class;
1892	const int ioprio = cfqq->ioprio;
1893
1894	if (has_fs_excl()) {
1895		/*
1896		 * boost idle prio on transactions that would lock out other
1897		 * users of the filesystem
1898		 */
1899		if (cfq_class_idle(cfqq))
1900			cfqq->ioprio_class = IOPRIO_CLASS_BE;
1901		if (cfqq->ioprio > IOPRIO_NORM)
1902			cfqq->ioprio = IOPRIO_NORM;
1903	} else {
1904		/*
1905		 * check if we need to unboost the queue
1906		 */
1907		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1908			cfqq->ioprio_class = cfqq->org_ioprio_class;
1909		if (cfqq->ioprio != cfqq->org_ioprio)
1910			cfqq->ioprio = cfqq->org_ioprio;
1911	}
1912
1913	/*
1914	 * refile between round-robin lists if we moved the priority class
1915	 */
1916	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1917	    cfq_cfqq_on_rr(cfqq))
1918		cfq_resort_rr_list(cfqq, 0);
1919}
1920
1921static inline int
1922__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1923		struct task_struct *task, int rw)
1924{
1925#if 1
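	/*
	 * a queue that is waiting for a request, or was explicitly marked
	 * must_alloc, gets one guaranteed allocation per slice so it can
	 * make progress; everything else just "may" queue
	 */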
1926	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1927	    !cfq_cfqq_must_alloc_slice(cfqq)) {
1928		cfq_mark_cfqq_must_alloc_slice(cfqq);
1929		return ELV_MQUEUE_MUST;
1930	}
1931
1932	return ELV_MQUEUE_MAY;
1933#else
1934	if (!cfqq || task->flags & PF_MEMALLOC)
1935		return ELV_MQUEUE_MAY;
1936	if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1937		if (cfq_cfqq_wait_request(cfqq))
1938			return ELV_MQUEUE_MUST;
1939
1940		/*
1941		 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1942		 * can quickly flood the queue with writes from a single task
1943		 */
1944		if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1945			cfq_mark_cfqq_must_alloc_slice(cfqq);
1946			return ELV_MQUEUE_MUST;
1947		}
1948
1949		return ELV_MQUEUE_MAY;
1950	}
1951	if (cfq_class_idle(cfqq))
1952		return ELV_MQUEUE_NO;
1953	if (cfqq->allocated[rw] >= cfqd->max_queued) {
1954		struct io_context *ioc = get_io_context(GFP_ATOMIC);
1955		int ret = ELV_MQUEUE_NO;
1956
1957		if (ioc && ioc->nr_batch_requests)
1958			ret = ELV_MQUEUE_MAY;
1959
1960		put_io_context(ioc);
1961		return ret;
1962	}
1963
1964	return ELV_MQUEUE_MAY;
1965#endif
1966}
1967
1968static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1969{
1970	struct cfq_data *cfqd = q->elevator->elevator_data;
1971	struct task_struct *tsk = current;
1972	struct cfq_queue *cfqq;
1973
1974	/*
1975	 * don't force setup of a queue from here, as a call to may_queue
1976	 * does not necessarily imply that a request actually will be queued.
1977	 * so just lookup a possibly existing queue, or return 'may queue'
1978	 * if that fails
1979	 */
1980	cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1981	if (cfqq) {
1982		cfq_init_prio_data(cfqq);
1983		cfq_prio_boost(cfqq);
1984
1985		return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1986	}
1987
1988	return ELV_MQUEUE_MAY;
1989}
1990
1991static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1992{
1993	struct cfq_data *cfqd = q->elevator->elevator_data;
1994	struct request_list *rl = &q->rq;
1995
1996	if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1997		smp_mb();
1998		if (waitqueue_active(&rl->wait[READ]))
1999			wake_up(&rl->wait[READ]);
2000	}
2001
2002	if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
2003		smp_mb();
2004		if (waitqueue_active(&rl->wait[WRITE]))
2005			wake_up(&rl->wait[WRITE]);
2006	}
2007}
2008
2009/*
2010 * queue lock held here
2011 */
2012static void cfq_put_request(request_queue_t *q, struct request *rq)
2013{
2014	struct cfq_data *cfqd = q->elevator->elevator_data;
2015	struct cfq_rq *crq = RQ_DATA(rq);
2016
2017	if (crq) {
2018		struct cfq_queue *cfqq = crq->cfq_queue;
2019		const int rw = rq_data_dir(rq);
2020
2021		BUG_ON(!cfqq->allocated[rw]);
2022		cfqq->allocated[rw]--;
2023
2024		put_io_context(crq->io_context->ioc);
2025
2026		mempool_free(crq, cfqd->crq_pool);
2027		rq->elevator_private = NULL;
2028
2029		cfq_check_waiters(q, cfqq);
2030		cfq_put_queue(cfqq);
2031	}
2032}
2033
2034/*
2035 * Allocate cfq data structures associated with this request.
2036 */
2037static int
2038cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
2039		gfp_t gfp_mask)
2040{
2041	struct cfq_data *cfqd = q->elevator->elevator_data;
2042	struct task_struct *tsk = current;
2043	struct cfq_io_context *cic;
2044	const int rw = rq_data_dir(rq);
2045	pid_t key = cfq_queue_pid(tsk, rw);
2046	struct cfq_queue *cfqq;
2047	struct cfq_rq *crq;
2048	unsigned long flags;
2049	int is_sync = key != CFQ_KEY_ASYNC;
2050
2051	might_sleep_if(gfp_mask & __GFP_WAIT);
2052
2053	cic = cfq_get_io_context(cfqd, gfp_mask);
2054
2055	spin_lock_irqsave(q->queue_lock, flags);
2056
2057	if (!cic)
2058		goto queue_fail;
2059
2060	if (!cic->cfqq[is_sync]) {
2061		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
2062		if (!cfqq)
2063			goto queue_fail;
2064
2065		cic->cfqq[is_sync] = cfqq;
2066	} else
2067		cfqq = cic->cfqq[is_sync];
2068
2069	cfqq->allocated[rw]++;
2070	cfq_clear_cfqq_must_alloc(cfqq);
2071	cfqd->rq_starved = 0;
2072	atomic_inc(&cfqq->ref);
2073	spin_unlock_irqrestore(q->queue_lock, flags);
2074
2075	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
2076	if (crq) {
2077		RB_CLEAR(&crq->rb_node);
2078		crq->rb_key = 0;
2079		crq->request = rq;
2080		INIT_HLIST_NODE(&crq->hash);
2081		crq->cfq_queue = cfqq;
2082		crq->io_context = cic;
2083
2084		if (is_sync)
2085			cfq_mark_crq_is_sync(crq);
2086		else
2087			cfq_clear_crq_is_sync(crq);
2088
2089		rq->elevator_private = crq;
2090		return 0;
2091	}
2092
2093	spin_lock_irqsave(q->queue_lock, flags);
2094	cfqq->allocated[rw]--;
2095	if (!(cfqq->allocated[0] + cfqq->allocated[1]))
2096		cfq_mark_cfqq_must_alloc(cfqq);
2097	cfq_put_queue(cfqq);
2098queue_fail:
2099	if (cic)
2100		put_io_context(cic->ioc);
2101	/*
2102	 * mark us rq-allocation starved. we need to kick-start the process
2103	 * ourselves if there are no pending requests that can do it for us;
2104	 * that would be an extremely rare OOM situation
2105	 */
2106	cfqd->rq_starved = 1;
2107	cfq_schedule_dispatch(cfqd);
2108	spin_unlock_irqrestore(q->queue_lock, flags);
2109	return 1;
2110}
2111
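/*
 * Work handler behind cfqd->unplug_work: wake up request_list sleepers if
 * we are rq-starved, then unplug the queue and re-run the request_fn.
 */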
2112static void cfq_kick_queue(void *data)
2113{
2114	request_queue_t *q = data;
2115	struct cfq_data *cfqd = q->elevator->elevator_data;
2116	unsigned long flags;
2117
2118	spin_lock_irqsave(q->queue_lock, flags);
2119
2120	if (cfqd->rq_starved) {
2121		struct request_list *rl = &q->rq;
2122
2123		/*
2124		 * we aren't guaranteed to get a request after this, but we
2125		 * have to be opportunistic
2126		 */
2127		smp_mb();
2128		if (waitqueue_active(&rl->wait[READ]))
2129			wake_up(&rl->wait[READ]);
2130		if (waitqueue_active(&rl->wait[WRITE]))
2131			wake_up(&rl->wait[WRITE]);
2132	}
2133
2134	blk_remove_plug(q);
2135	q->request_fn(q);
2136	spin_unlock_irqrestore(q->queue_lock, flags);
2137}
2138
2139/*
2140 * Timer running if the active_queue is currently idling inside its time slice
2141 */
2142static void cfq_idle_slice_timer(unsigned long data)
2143{
2144	struct cfq_data *cfqd = (struct cfq_data *) data;
2145	struct cfq_queue *cfqq;
2146	unsigned long flags;
2147
2148	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2149
2150	if ((cfqq = cfqd->active_queue) != NULL) {
2151		unsigned long now = jiffies;
2152
2153		/*
2154		 * expired
2155		 */
2156		if (time_after(now, cfqq->slice_end))
2157			goto expire;
2158
2159		/*
2160		 * only expire and reinvoke the request handler if there are
2161		 * other queues with pending requests
2162		 */
2163		if (!cfqd->busy_queues) {
2164			cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2165			add_timer(&cfqd->idle_slice_timer);
2166			goto out_cont;
2167		}
2168
2169		/*
2170		 * not expired and it has a request pending, let it dispatch
2171		 */
2172		if (!RB_EMPTY(&cfqq->sort_list)) {
2173			cfq_mark_cfqq_must_dispatch(cfqq);
2174			goto out_kick;
2175		}
2176	}
2177expire:
2178	cfq_slice_expired(cfqd, 0);
2179out_kick:
2180	cfq_schedule_dispatch(cfqd);
2181out_cont:
2182	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2183}
2184
2185/*
2186 * Timer running if an idle class queue is waiting for service
2187 */
2188static void cfq_idle_class_timer(unsigned long data)
2189{
2190	struct cfq_data *cfqd = (struct cfq_data *) data;
2191	unsigned long flags, end;
2192
2193	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2194
2195	/*
2196	 * race with a non-idle queue, reset timer
2197	 */
2198	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2199	if (!time_after_eq(jiffies, end))
2200		mod_timer(&cfqd->idle_class_timer, end);
2201	else
2202		cfq_schedule_dispatch(cfqd);
2203
2204	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2205}
2206
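/*
 * Kill both per-device timers and sync the block queue so no timer or
 * pending unplug work is still running when the elevator goes away.
 */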
2207static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2208{
2209	del_timer_sync(&cfqd->idle_slice_timer);
2210	del_timer_sync(&cfqd->idle_class_timer);
2211	blk_sync_queue(cfqd->queue);
2212}
2213
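/*
 * Elevator teardown: expire the active queue, drop the queues still held
 * by cfq_io_contexts and unlink those cics from this device, then free the
 * hashes, the crq mempool and the cfq_data itself.
 */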
2214static void cfq_exit_queue(elevator_t *e)
2215{
2216	struct cfq_data *cfqd = e->elevator_data;
2217	request_queue_t *q = cfqd->queue;
2218
2219	cfq_shutdown_timer_wq(cfqd);
2220
2221	spin_lock(&cfq_exit_lock);
2222	spin_lock_irq(q->queue_lock);
2223
2224	if (cfqd->active_queue)
2225		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2226
2227	while (!list_empty(&cfqd->cic_list)) {
2228		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2229							struct cfq_io_context,
2230							queue_list);
2231		if (cic->cfqq[ASYNC]) {
2232			cfq_put_queue(cic->cfqq[ASYNC]);
2233			cic->cfqq[ASYNC] = NULL;
2234		}
2235		if (cic->cfqq[SYNC]) {
2236			cfq_put_queue(cic->cfqq[SYNC]);
2237			cic->cfqq[SYNC] = NULL;
2238		}
2239		cic->key = NULL;
2240		list_del_init(&cic->queue_list);
2241	}
2242
2243	spin_unlock_irq(q->queue_lock);
2244	spin_unlock(&cfq_exit_lock);
2245
2246	cfq_shutdown_timer_wq(cfqd);
2247
2248	mempool_destroy(cfqd->crq_pool);
2249	kfree(cfqd->crq_hash);
2250	kfree(cfqd->cfq_hash);
2251	kfree(cfqd);
2252}
2253
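/*
 * Elevator init: allocate the per-device cfq_data, the crq/cfqq hashes and
 * the crq mempool, set up the idle timers and the unplug work, and seed the
 * tunables from their global defaults.
 */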
2254static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
2255{
2256	struct cfq_data *cfqd;
2257	int i;
2258
2259	cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2260	if (!cfqd)
2261		return NULL;
2262
2263	memset(cfqd, 0, sizeof(*cfqd));
2264
2265	for (i = 0; i < CFQ_PRIO_LISTS; i++)
2266		INIT_LIST_HEAD(&cfqd->rr_list[i]);
2267
2268	INIT_LIST_HEAD(&cfqd->busy_rr);
2269	INIT_LIST_HEAD(&cfqd->cur_rr);
2270	INIT_LIST_HEAD(&cfqd->idle_rr);
2271	INIT_LIST_HEAD(&cfqd->empty_list);
2272	INIT_LIST_HEAD(&cfqd->cic_list);
2273
2274	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2275	if (!cfqd->crq_hash)
2276		goto out_crqhash;
2277
2278	cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2279	if (!cfqd->cfq_hash)
2280		goto out_cfqhash;
2281
2282	cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
2283	if (!cfqd->crq_pool)
2284		goto out_crqpool;
2285
2286	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2287		INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2288	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2289		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2290
2291	cfqd->queue = q;
2292
2293	cfqd->max_queued = q->nr_requests / 4;
2294	q->nr_batching = cfq_queued;
2295
2296	init_timer(&cfqd->idle_slice_timer);
2297	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2298	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2299
2300	init_timer(&cfqd->idle_class_timer);
2301	cfqd->idle_class_timer.function = cfq_idle_class_timer;
2302	cfqd->idle_class_timer.data = (unsigned long) cfqd;
2303
2304	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2305
2306	cfqd->cfq_queued = cfq_queued;
2307	cfqd->cfq_quantum = cfq_quantum;
2308	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2309	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2310	cfqd->cfq_back_max = cfq_back_max;
2311	cfqd->cfq_back_penalty = cfq_back_penalty;
2312	cfqd->cfq_slice[0] = cfq_slice_async;
2313	cfqd->cfq_slice[1] = cfq_slice_sync;
2314	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2315	cfqd->cfq_slice_idle = cfq_slice_idle;
2316
2317	return cfqd;
2318out_crqpool:
2319	kfree(cfqd->cfq_hash);
2320out_cfqhash:
2321	kfree(cfqd->crq_hash);
2322out_crqhash:
2323	kfree(cfqd);
2324	return NULL;
2325}
2326
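/*
 * Global slab caches for crqs, cfq queues and cfq io contexts; created at
 * module init and torn down again on exit or on a failed init.
 */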
2327static void cfq_slab_kill(void)
2328{
2329	if (crq_pool)
2330		kmem_cache_destroy(crq_pool);
2331	if (cfq_pool)
2332		kmem_cache_destroy(cfq_pool);
2333	if (cfq_ioc_pool)
2334		kmem_cache_destroy(cfq_ioc_pool);
2335}
2336
2337static int __init cfq_slab_setup(void)
2338{
2339	crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2340					NULL, NULL);
2341	if (!crq_pool)
2342		goto fail;
2343
2344	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2345					NULL, NULL);
2346	if (!cfq_pool)
2347		goto fail;
2348
2349	cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2350			sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2351	if (!cfq_ioc_pool)
2352		goto fail;
2353
2354	return 0;
2355fail:
2356	cfq_slab_kill();
2357	return -ENOMEM;
2358}
2359
2360/*
2361 * sysfs parts below -->
2362 */
2363
2364static ssize_t
2365cfq_var_show(unsigned int var, char *page)
2366{
2367	return sprintf(page, "%d\n", var);
2368}
2369
2370static ssize_t
2371cfq_var_store(unsigned int *var, const char *page, size_t count)
2372{
2373	char *p = (char *) page;
2374
2375	*var = simple_strtoul(p, &p, 10);
2376	return count;
2377}
2378
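/*
 * Generate one sysfs show/store method per tunable. When __CONV is set the
 * tunable is kept in jiffies internally but exported to user space in
 * milliseconds, so the macros convert on the way in and out.
 */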
2379#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
2380static ssize_t __FUNC(elevator_t *e, char *page)			\
2381{									\
2382	struct cfq_data *cfqd = e->elevator_data;			\
2383	unsigned int __data = __VAR;					\
2384	if (__CONV)							\
2385		__data = jiffies_to_msecs(__data);			\
2386	return cfq_var_show(__data, (page));				\
2387}
2388SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2389SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2390SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2391SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2392SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2393SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2394SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2395SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2396SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2397SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2398#undef SHOW_FUNCTION
2399
2400#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
2401static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
2402{									\
2403	struct cfq_data *cfqd = e->elevator_data;			\
2404	unsigned int __data;						\
2405	int ret = cfq_var_store(&__data, (page), count);		\
2406	if (__data < (MIN))						\
2407		__data = (MIN);						\
2408	else if (__data > (MAX))					\
2409		__data = (MAX);						\
2410	if (__CONV)							\
2411		*(__PTR) = msecs_to_jiffies(__data);			\
2412	else								\
2413		*(__PTR) = __data;					\
2414	return ret;							\
2415}
2416STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2417STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2418STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2419STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2420STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2421STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2422STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2423STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2424STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2425STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2426#undef STORE_FUNCTION
2427
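/*
 * As a rough, hand-expanded illustration (sketched here, not generated
 * output), the cfq_quantum_store() instance above boils down to the
 * following; with __CONV == 0 there is no ms/jiffies conversion and the
 * UINT_MAX upper clamp can never trigger for an unsigned int:
 *
 *	static ssize_t cfq_quantum_store(elevator_t *e, const char *page,
 *					 size_t count)
 *	{
 *		struct cfq_data *cfqd = e->elevator_data;
 *		unsigned int __data;
 *		int ret = cfq_var_store(&__data, page, count);
 *
 *		if (__data < 1)
 *			__data = 1;
 *		cfqd->cfq_quantum = __data;
 *		return ret;
 *	}
 */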
2428#define CFQ_ATTR(name) \
2429	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2430
2431static struct elv_fs_entry cfq_attrs[] = {
2432	CFQ_ATTR(quantum),
2433	CFQ_ATTR(queued),
2434	CFQ_ATTR(fifo_expire_sync),
2435	CFQ_ATTR(fifo_expire_async),
2436	CFQ_ATTR(back_seek_max),
2437	CFQ_ATTR(back_seek_penalty),
2438	CFQ_ATTR(slice_sync),
2439	CFQ_ATTR(slice_async),
2440	CFQ_ATTR(slice_async_rq),
2441	CFQ_ATTR(slice_idle),
2442	__ATTR_NULL
2443};
2444
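/*
 * The attributes above are hung off the elevator registration below and,
 * with cfq active, show up under /sys/block/<dev>/queue/iosched/, e.g.
 * (device name only for illustration):
 *
 *	# cat /sys/block/hda/queue/iosched/slice_idle
 *	# echo 4 > /sys/block/hda/queue/iosched/slice_idle
 *
 * Time-based tunables are read and written in milliseconds.
 */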
2445static struct elevator_type iosched_cfq = {
2446	.ops = {
2447		.elevator_merge_fn = 		cfq_merge,
2448		.elevator_merged_fn =		cfq_merged_request,
2449		.elevator_merge_req_fn =	cfq_merged_requests,
2450		.elevator_dispatch_fn =		cfq_dispatch_requests,
2451		.elevator_add_req_fn =		cfq_insert_request,
2452		.elevator_activate_req_fn =	cfq_activate_request,
2453		.elevator_deactivate_req_fn =	cfq_deactivate_request,
2454		.elevator_queue_empty_fn =	cfq_queue_empty,
2455		.elevator_completed_req_fn =	cfq_completed_request,
2456		.elevator_former_req_fn =	cfq_former_request,
2457		.elevator_latter_req_fn =	cfq_latter_request,
2458		.elevator_set_req_fn =		cfq_set_request,
2459		.elevator_put_req_fn =		cfq_put_request,
2460		.elevator_may_queue_fn =	cfq_may_queue,
2461		.elevator_init_fn =		cfq_init_queue,
2462		.elevator_exit_fn =		cfq_exit_queue,
2463		.trim =				cfq_trim,
2464	},
2465	.elevator_attrs =	cfq_attrs,
2466	.elevator_name =	"cfq",
2467	.elevator_owner =	THIS_MODULE,
2468};
2469
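/*
 * cfq can be selected per-queue at runtime, e.g. (path for illustration
 * only):
 *
 *	# echo cfq > /sys/block/hda/queue/scheduler
 *
 * or made the default for all queues with the "elevator=cfq" boot
 * parameter.
 */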
2470static int __init cfq_init(void)
2471{
2472	int ret;
2473
2474	/*
2475	 * could be 0 on HZ < 1000 setups
2476	 * the jiffies defaults above can round down to 0 on HZ < 1000 setups
2477	if (!cfq_slice_async)
2478		cfq_slice_async = 1;
2479	if (!cfq_slice_idle)
2480		cfq_slice_idle = 1;
2481
2482	if (cfq_slab_setup())
2483		return -ENOMEM;
2484
2485	ret = elv_register(&iosched_cfq);
2486	if (ret)
2487		cfq_slab_kill();
2488
2489	return ret;
2490}
2491
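/*
 * Module unload must not destroy the slab caches while cfq_io_contexts are
 * still in flight: the last context to be freed completes ioc_gone, which
 * we wait for below if ioc_count has not yet dropped to zero.
 */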
2492static void __exit cfq_exit(void)
2493{
2494	DECLARE_COMPLETION(all_gone);
2495	elv_unregister(&iosched_cfq);
2496	ioc_gone = &all_gone;
2497	/* ioc_gone's update must be visible before reading ioc_count */
2498	smp_wmb();
2499	if (atomic_read(&ioc_count))
2500		wait_for_completion(ioc_gone);
2501	synchronize_rcu();
2502	cfq_slab_kill();
2503}
2504
2505module_init(cfq_init);
2506module_exit(cfq_exit);
2507
2508MODULE_AUTHOR("Jens Axboe");
2509MODULE_LICENSE("GPL");
2510MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
2511