dm-io.c revision c8b03afe3d38a635861e4bfa5c563d844e754a91
/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm-io.h"

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>

static struct bio_set *_bios;

struct dm_io_client {
	mempool_t *pool;
	struct bio_set *bios;
};

/* FIXME: can we shrink this ? */
struct io {
	unsigned long error;
	atomic_t count;
	struct task_struct *sleeper;
	struct dm_io_client *client;
	io_notify_fn callback;
	void *context;
};

/*
 * io contexts are only dynamically allocated for asynchronous
 * io.  Since async io is likely to be the majority of io we'll
 * have the same number of io contexts as bios! (FIXME: must reduce this).
 */
static unsigned _num_ios;
static mempool_t *_io_pool;

/*
 * Temporary functions to allow old and new interfaces to co-exist.
 */
static struct bio_set *bios(struct dm_io_client *client)
{
	return client ? client->bios : _bios;
}

static mempool_t *io_pool(struct dm_io_client *client)
{
	return client ? client->pool : _io_pool;
}

static unsigned int pages_to_ios(unsigned int pages)
{
	return 4 * pages;	/* too many ? */
}

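/*
 * Create, grow, shrink or tear down the global io pool and bioset used
 * by the old interface.  A new_ios of zero frees the pool completely.
 */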
static int resize_pool(unsigned int new_ios)
{
	int r = 0;

	if (_io_pool) {
		if (new_ios == 0) {
			/* free off the pool */
			mempool_destroy(_io_pool);
			_io_pool = NULL;
			bioset_free(_bios);

		} else {
			/* resize the pool */
			r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
		}

	} else {
		/* create new pool */
		_io_pool = mempool_create_kmalloc_pool(new_ios,
						       sizeof(struct io));
		if (!_io_pool)
			return -ENOMEM;

		_bios = bioset_create(16, 16);
		if (!_bios) {
			mempool_destroy(_io_pool);
			_io_pool = NULL;
			return -ENOMEM;
		}
	}

	if (!r)
		_num_ios = new_ios;

	return r;
}

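/*
 * Old interface: callers reserve and release capacity in the shared
 * pool in units of pages, pages_to_ios() io structs per page.
 */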
int dm_io_get(unsigned int num_pages)
{
	return resize_pool(_num_ios + pages_to_ios(num_pages));
}

void dm_io_put(unsigned int num_pages)
{
	resize_pool(_num_ios - pages_to_ios(num_pages));
}

/*
 * Create a client with mempool and bioset.
 */
struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
	unsigned ios = pages_to_ios(num_pages);
	struct dm_io_client *client;

	client = kmalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return ERR_PTR(-ENOMEM);

	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
	if (!client->pool)
		goto bad;

	client->bios = bioset_create(16, 16);
	if (!client->bios)
		goto bad;

	return client;

   bad:
	if (client->pool)
		mempool_destroy(client->pool);
	kfree(client);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);

int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
	return mempool_resize(client->pool, pages_to_ios(num_pages),
			      GFP_KERNEL);
}
EXPORT_SYMBOL(dm_io_client_resize);

void dm_io_client_destroy(struct dm_io_client *client)
{
	mempool_destroy(client->pool);
	bioset_free(client->bios);
	kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this in the last
 * bvec which we know is unused (blech).
 * XXX This is ugly and can OOPS with some configs... find another way.
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}

static inline unsigned bio_get_region(struct bio *bio)
{
	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
	if (error)
		set_bit(region, &io->error);

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			int r = io->error;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, io_pool(io->client));
			fn(r, context);
		}
	}
}

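/*
 * Per-bio completion: once the whole bio is done, zero-fill failed
 * reads, recover the region from the hidden bvec, restore bi_max_vecs
 * before bio_put() and drop this bio's reference on the io.
 */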
static int endio(struct bio *bio, unsigned int done, int error)
{
	struct io *io;
	unsigned region;

	/* keep going until we've finished */
	if (bio->bi_size)
		return 1;

	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	io = bio->bi_private;
	region = bio_get_region(bio);

	bio->bi_max_vecs++;
	bio_put(bio);

	dec_count(io, region, error);

	return 0;
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	void (*next_page)(struct dpages *dp);

	unsigned context_u;
	void *context_ptr;
};

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	unsigned o = dp->context_u;
	struct page_list *pl = (struct page_list *) dp->context_ptr;

	*p = pl->page;
	*len = PAGE_SIZE - o;
	*offset = o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = (struct page_list *) dp->context_ptr;
	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	*p = bvec->bv_page;
	*len = bvec->bv_len;
	*offset = bvec->bv_offset;
}

static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	dp->context_ptr = bvec + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->get_page = bvec_get_page;
	dp->next_page = bvec_next_page;
	dp->context_ptr = bvec;
}

/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
		 struct page **p, unsigned long *len, unsigned *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

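/*
 * Return a bio allocated in do_region() to the bio set it came from.
 */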
static void dm_bio_destructor(struct bio *bio)
{
	struct io *io = bio->bi_private;

	bio_free(bio, bios(io->client));
}

/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
			unsigned *offset)
{
	*p = virt_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = km_get_page;
	dp->next_page = km_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
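/*
 * Issue as many bios as needed to cover one region, packing pages from
 * the dpages iterator into each bio until bio_add_page() refuses more.
 */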
static void do_region(int rw, unsigned int region, struct io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	while (remaining) {
		/*
		 * Allocate a suitably sized bio: we add an extra
		 * bvec for bio_get/set_region() and decrement bi_max_vecs
		 * to hide it from bio_add_page().
		 */
		num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, bios(io->client));
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_max_vecs--;
		bio_set_region(bio, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	}
}

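/*
 * Fan an io out across up to num_regions regions, rewinding the dpages
 * iterator for each one, then drop the initial reference taken by the
 * caller so that completion can be detected.
 */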
static void dispatch_io(int rw, unsigned int num_regions,
			struct io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	if (sync)
		rw |= (1 << BIO_RW_SYNC);

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count)
			do_region(rw, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

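/*
 * Dispatch the io and sleep until every bio has completed.  The io
 * struct lives on this thread's stack, so waiting here matters; note
 * that breaking out on a pending signal returns -EINTR while any
 * outstanding bios still reference that stack frame.
 */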
static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
{
	struct io io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		return -EIO;
	}

	io.error = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	io.sleeper = current;
	io.client = client;

	dispatch_io(rw, num_regions, where, dp, &io, 1);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io.count) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (atomic_read(&io.count))
		return -EINTR;

	if (error_bits)
		*error_bits = io.error;

	return io.error ? -EIO : 0;
}

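/*
 * Dispatch the io and return immediately; the caller's io_notify_fn is
 * invoked from dec_count() once the last bio completes.
 */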
static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct io_region *where, int rw, struct dpages *dp,
		    io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(io_pool(client), GFP_NOIO);
	io->error = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;
	io->client = client;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}

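/*
 * Old interface: one entry point per memory type, all going through the
 * shared _io_pool and _bios (client == NULL).
 */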
int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
	       struct page_list *pl, unsigned int offset,
	       unsigned long *error_bits)
{
	struct dpages dp;
	list_dp_init(&dp, pl, offset);
	return sync_io(NULL, num_regions, where, rw, &dp, error_bits);
}

int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
		    struct bio_vec *bvec, unsigned long *error_bits)
{
	struct dpages dp;
	bvec_dp_init(&dp, bvec);
	return sync_io(NULL, num_regions, where, rw, &dp, error_bits);
}

int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
		  void *data, unsigned long *error_bits)
{
	struct dpages dp;
	vm_dp_init(&dp, data);
	return sync_io(NULL, num_regions, where, rw, &dp, error_bits);
}

int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
		struct page_list *pl, unsigned int offset,
		io_notify_fn fn, void *context)
{
	struct dpages dp;
	list_dp_init(&dp, pl, offset);
	return async_io(NULL, num_regions, where, rw, &dp, fn, context);
}

int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
		     struct bio_vec *bvec, io_notify_fn fn, void *context)
{
	struct dpages dp;
	bvec_dp_init(&dp, bvec);
	return async_io(NULL, num_regions, where, rw, &dp, fn, context);
}

int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
		   void *data, io_notify_fn fn, void *context)
{
	struct dpages dp;
	vm_dp_init(&dp, data);
	return async_io(NULL, num_regions, where, rw, &dp, fn, context);
}

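/*
 * New interface: translate the dm_io_request memory descriptor into a
 * dpages iterator.
 */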
static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
	/* Set up dpages based on memory type */
	switch (io_req->mem.type) {
	case DM_IO_PAGE_LIST:
		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
		break;

	case DM_IO_BVEC:
		bvec_dp_init(dp, io_req->mem.ptr.bvec);
		break;

	case DM_IO_VMA:
		vm_dp_init(dp, io_req->mem.ptr.vma);
		break;

	case DM_IO_KMEM:
		km_dp_init(dp, io_req->mem.ptr.addr);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * New collapsed (a)synchronous interface
 */
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
	  struct io_region *where, unsigned long *sync_error_bits)
{
	int r;
	struct dpages dp;

	r = dp_init(io_req, &dp);
	if (r)
		return r;

	if (!io_req->notify.fn)
		return sync_io(io_req->client, num_regions, where,
			       io_req->bi_rw, &dp, sync_error_bits);

	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
			&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
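
/*
 * Illustrative sketch only, not part of the original file: one way a
 * caller might drive the collapsed dm_io() interface above.  The
 * example_* names are hypothetical; the structures, fields and
 * constants are the ones this file itself dereferences.  Wrapped in
 * "#if 0" so it serves as documentation rather than compiled code.
 */
#if 0
/* Invoked from dec_count() with the accumulated per-region error bits. */
static void example_notify(unsigned long error_bits, void *context)
{
	printk(KERN_INFO "dm-io example: async io done, error bits 0x%lx\n",
	       error_bits);
}

/* Synchronous read of 'len' sectors into a kernel-memory buffer. */
static int example_sync_read(struct dm_io_client *client,
			     struct block_device *bdev, void *buffer,
			     sector_t start, sector_t len)
{
	unsigned long error_bits = 0;
	struct io_region where = {
		.bdev	= bdev,
		.sector	= start,
		.count	= len,
	};
	struct dm_io_request io_req = {
		.bi_rw		= READ,
		.mem.type	= DM_IO_KMEM,	/* km_dp_init() path */
		.mem.ptr.addr	= buffer,
		.notify.fn	= NULL,		/* NULL => sync_io() */
		.client		= client,	/* NULL => shared pool/bioset */
	};

	/* dm_io() sleeps in sync_io() until every bio has completed. */
	return dm_io(&io_req, 1, &where, &error_bits);
}

/*
 * Asynchronous write: dm_io() returns at once and example_notify() runs
 * when the last bio completes.  The client and the buffer must outlive
 * the io, so do not call dm_io_client_destroy() before the callback.
 */
static int example_async_write(struct dm_io_client *client,
			       struct block_device *bdev, void *buffer,
			       sector_t start, sector_t len)
{
	struct io_region where = {
		.bdev	= bdev,
		.sector	= start,
		.count	= len,
	};
	struct dm_io_request io_req = {
		.bi_rw		= WRITE,
		.mem.type	= DM_IO_KMEM,
		.mem.ptr.addr	= buffer,
		.notify.fn	= example_notify,
		.notify.context	= NULL,
		.client		= client,
	};

	return dm_io(&io_req, 1, &where, NULL);
}
#endif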

EXPORT_SYMBOL(dm_io_get);
EXPORT_SYMBOL(dm_io_put);
EXPORT_SYMBOL(dm_io_sync);
EXPORT_SYMBOL(dm_io_async);
EXPORT_SYMBOL(dm_io_sync_bvec);
EXPORT_SYMBOL(dm_io_async_bvec);
EXPORT_SYMBOL(dm_io_sync_vm);
EXPORT_SYMBOL(dm_io_async_vm);