dm-io.c revision 891ce207011d3d9219f79fd5114c8594bbacc653
/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm-io.h"

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>

static struct bio_set *_bios;

struct dm_io_client {
	mempool_t *pool;
	struct bio_set *bios;
};

/* FIXME: can we shrink this ? */
struct io {
	unsigned long error;
	atomic_t count;
	struct task_struct *sleeper;
	struct dm_io_client *client;
	io_notify_fn callback;
	void *context;
};

/*
 * io contexts are only dynamically allocated for asynchronous
 * io.  Since async io is likely to be the majority of io we'll
 * have the same number of io contexts as bios! (FIXME: must reduce this).
 */
static unsigned _num_ios;
static mempool_t *_io_pool;

/*
 * Temporary functions to allow old and new interfaces to co-exist.
 */
static struct bio_set *bios(struct dm_io_client *client)
{
	return client ? client->bios : _bios;
}

static mempool_t *io_pool(struct dm_io_client *client)
{
	return client ? client->pool : _io_pool;
}

static unsigned int pages_to_ios(unsigned int pages)
{
	return 4 * pages;	/* too many ? */
}

static int resize_pool(unsigned int new_ios)
{
	int r = 0;

	if (_io_pool) {
		if (new_ios == 0) {
			/* free off the pool */
			mempool_destroy(_io_pool);
			_io_pool = NULL;
			bioset_free(_bios);

		} else {
			/* resize the pool */
			r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
		}

	} else {
		/* create new pool */
		_io_pool = mempool_create_kmalloc_pool(new_ios,
						       sizeof(struct io));
		if (!_io_pool)
			return -ENOMEM;

		_bios = bioset_create(16, 16);
		if (!_bios) {
			mempool_destroy(_io_pool);
			_io_pool = NULL;
			return -ENOMEM;
		}
	}

	if (!r)
		_num_ios = new_ios;

	return r;
}

int dm_io_get(unsigned int num_pages)
{
	return resize_pool(_num_ios + pages_to_ios(num_pages));
}

void dm_io_put(unsigned int num_pages)
{
	resize_pool(_num_ios - pages_to_ios(num_pages));
}
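
/*
 * Illustrative sketch (not from the original file): a user of this old,
 * client-less interface reserves io contexts up front for the number of
 * pages it expects to have in flight, and releases them again when done:
 *
 *	r = dm_io_get(num_pages);
 *	if (r)
 *		return r;
 *	... issue io with dm_io_sync()/dm_io_async() ...
 *	dm_io_put(num_pages);
 */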

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this in the last
 * bvec which we know is unused (blech).
 * XXX This is ugly and can OOPS with some configs... find another way.
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}

static inline unsigned bio_get_region(struct bio *bio)
{
	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}
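
/*
 * Descriptive note: storing the region in bi_io_vec[bi_max_vecs] works
 * because do_region() below allocates one bvec more than it needs and
 * then decrements bi_max_vecs to hide the spare bvec from bio_add_page();
 * endio() increments bi_max_vecs again before bio_put(), restoring the
 * allocated bvec count.
 */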

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
	if (error)
		set_bit(region, &io->error);

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			int r = io->error;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, io_pool(io->client));
			fn(r, context);
		}
	}
}

static int endio(struct bio *bio, unsigned int done, int error)
{
	struct io *io;
	unsigned region;

	/* keep going until we've finished */
	if (bio->bi_size)
		return 1;

	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	io = bio->bi_private;
	region = bio_get_region(bio);

	bio->bi_max_vecs++;
	bio_put(bio);

	dec_count(io, region, error);

	return 0;
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	void (*next_page)(struct dpages *dp);

	unsigned context_u;
	void *context_ptr;
};
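
/*
 * Descriptive note: a dpages provider hands back one (page, len, offset)
 * chunk at a time.  do_region() below calls get_page(), tries to add that
 * chunk to the current bio with bio_add_page(), and moves on to the next
 * chunk with next_page().  Three providers follow: a page_list, a bio_vec
 * array and a virtually contiguous buffer.
 */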

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	unsigned o = dp->context_u;
	struct page_list *pl = (struct page_list *) dp->context_ptr;

	*p = pl->page;
	*len = PAGE_SIZE - o;
	*offset = o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = (struct page_list *) dp->context_ptr;
	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	*p = bvec->bv_page;
	*len = bvec->bv_len;
	*offset = bvec->bv_offset;
}

static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	dp->context_ptr = bvec + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->get_page = bvec_get_page;
	dp->next_page = bvec_next_page;
	dp->context_ptr = bvec;
}

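/*
 * Functions for getting the pages from a virtually contiguous buffer
 * (e.g. one obtained from vmalloc()): each page is looked up with
 * vmalloc_to_page() and only the first page may start at a non-zero
 * offset.
 */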
static void vm_get_page(struct dpages *dp,
		 struct page **p, unsigned long *len, unsigned *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

static void dm_bio_destructor(struct bio *bio)
{
	struct io *io = bio->bi_private;

	bio_free(bio, bios(io->client));
}

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned int region, struct io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	while (remaining) {
		/*
		 * Allocate a suitably sized bio: we add an extra
		 * bvec for bio_get/set_region() and decrement bi_max_vecs
		 * to hide it from bio_add_page().
		 */
		num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, bios(io->client));
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_max_vecs--;
		bio_set_region(bio, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	}
}

static void dispatch_io(int rw, unsigned int num_regions,
			struct io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	if (sync)
		rw |= (1 << BIO_RW_SYNC);

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count)
			do_region(rw, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
{
	struct io io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		return -EIO;
	}

	io.error = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	io.sleeper = current;
	io.client = client;

	dispatch_io(rw, num_regions, where, dp, &io, 1);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io.count) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (atomic_read(&io.count))
		return -EINTR;

	if (error_bits)
		*error_bits = io.error;

	return io.error ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct io_region *where, int rw, struct dpages *dp,
		    io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(io_pool(client), GFP_NOIO);
	io->error = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;
	io->client = client;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}

int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
	       struct page_list *pl, unsigned int offset,
	       unsigned long *error_bits)
{
	struct dpages dp;
	list_dp_init(&dp, pl, offset);
	return sync_io(NULL, num_regions, where, rw, &dp, error_bits);
}

int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
		    struct bio_vec *bvec, unsigned long *error_bits)
{
	struct dpages dp;
	bvec_dp_init(&dp, bvec);
	return sync_io(NULL, num_regions, where, rw, &dp, error_bits);
}

int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
		  void *data, unsigned long *error_bits)
{
	struct dpages dp;
	vm_dp_init(&dp, data);
	return sync_io(NULL, num_regions, where, rw, &dp, error_bits);
}

int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
		struct page_list *pl, unsigned int offset,
		io_notify_fn fn, void *context)
{
	struct dpages dp;
	list_dp_init(&dp, pl, offset);
	return async_io(NULL, num_regions, where, rw, &dp, fn, context);
}

int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
		     struct bio_vec *bvec, io_notify_fn fn, void *context)
{
	struct dpages dp;
	bvec_dp_init(&dp, bvec);
	return async_io(NULL, num_regions, where, rw, &dp, fn, context);
}

int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
		   void *data, io_notify_fn fn, void *context)
{
	struct dpages dp;
	vm_dp_init(&dp, data);
	return async_io(NULL, num_regions, where, rw, &dp, fn, context);
}

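/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller reading one region synchronously into a vmalloc'ed buffer 'buf'
 * of at least PAGE_SIZE bytes from a block device 'bdev' might do:
 *
 *	struct io_region where = {
 *		.bdev   = bdev,
 *		.sector = 0,
 *		.count  = to_sector(PAGE_SIZE),
 *	};
 *	unsigned long error_bits;
 *	int r = dm_io_sync_vm(1, &where, READ, buf, &error_bits);
 *
 * The async variants take an io_notify_fn and a context instead of
 * blocking; the callback receives the per-region error bits and the
 * caller's context once every bio for the io has completed.
 */
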
EXPORT_SYMBOL(dm_io_get);
EXPORT_SYMBOL(dm_io_put);
EXPORT_SYMBOL(dm_io_sync);
EXPORT_SYMBOL(dm_io_async);
EXPORT_SYMBOL(dm_io_sync_bvec);
EXPORT_SYMBOL(dm_io_async_bvec);
EXPORT_SYMBOL(dm_io_sync_vm);
EXPORT_SYMBOL(dm_io_async_vm);