1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
9#include "dm-uevent.h"
10
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/mutex.h>
14#include <linux/moduleparam.h>
15#include <linux/blkpg.h>
16#include <linux/bio.h>
17#include <linux/mempool.h>
18#include <linux/slab.h>
19#include <linux/idr.h>
20#include <linux/hdreg.h>
21#include <linux/delay.h>
22
23#include <trace/events/block.h>
24
25#define DM_MSG_PREFIX "core"
26
27#ifdef CONFIG_PRINTK
28/*
29 * ratelimit state to be used in DMXXX_LIMIT().
30 */
31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
32		       DEFAULT_RATELIMIT_INTERVAL,
33		       DEFAULT_RATELIMIT_BURST);
34EXPORT_SYMBOL(dm_ratelimit_state);
35#endif
36
37/*
38 * Cookies are numeric values sent with CHANGE and REMOVE
39 * uevents while resuming, removing or renaming the device.
40 */
41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
42#define DM_COOKIE_LENGTH 24
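/*
 * DM_COOKIE_LENGTH leaves room for the "DM_COOKIE=" prefix, an unsigned
 * 32-bit decimal value and the terminating NUL (a sketch of the expected
 * format; the string itself is built elsewhere in this file).
 */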
43
44static const char *_name = DM_NAME;
45
46static unsigned int major = 0;
47static unsigned int _major = 0;
48
49static DEFINE_IDR(_minor_idr);
50
51static DEFINE_SPINLOCK(_minor_lock);
52/*
53 * For bio-based dm.
54 * One of these is allocated per bio.
55 */
56struct dm_io {
57	struct mapped_device *md;
58	int error;
59	atomic_t io_count;
60	struct bio *bio;
61	unsigned long start_time;
62	spinlock_t endio_lock;
63};
64
65/*
66 * For bio-based dm.
67 * One of these is allocated per target within a bio.  Hopefully
68 * this will be simplified out one day.
69 */
70struct dm_target_io {
71	struct dm_io *io;
72	struct dm_target *ti;
73	union map_info info;
74};
75
76/*
77 * For request-based dm.
78 * One of these is allocated per request.
79 */
80struct dm_rq_target_io {
81	struct mapped_device *md;
82	struct dm_target *ti;
83	struct request *orig, clone;
84	int error;
85	union map_info info;
86};
87
88/*
89 * For request-based dm.
90 * One of these is allocated per bio.
91 */
92struct dm_rq_clone_bio_info {
93	struct bio *orig;
94	struct dm_rq_target_io *tio;
95};
96
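/*
 * Look up the per-target map_info for a bio-based clone: __map_bio()
 * below stores the dm_target_io in bio->bi_private, so this is only
 * meaningful for bios that dm itself has cloned.
 */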
97union map_info *dm_get_mapinfo(struct bio *bio)
98{
99	if (bio && bio->bi_private)
100		return &((struct dm_target_io *)bio->bi_private)->info;
101	return NULL;
102}
103
104union map_info *dm_get_rq_mapinfo(struct request *rq)
105{
106	if (rq && rq->end_io_data)
107		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
108	return NULL;
109}
110EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
111
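/*
 * Placeholder stored in the minor IDR while a device is being created;
 * alloc_dev() later swaps it for the real mapped_device via idr_replace().
 */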
112#define MINOR_ALLOCED ((void *)-1)
113
114/*
115 * Bits for the md->flags field.
116 */
117#define DMF_BLOCK_IO_FOR_SUSPEND 0
118#define DMF_SUSPENDED 1
119#define DMF_FROZEN 2
120#define DMF_FREEING 3
121#define DMF_DELETING 4
122#define DMF_NOFLUSH_SUSPENDING 5
123#define DMF_MERGE_IS_OPTIONAL 6
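/*
 * These are bit numbers, not masks; they are manipulated with
 * set_bit()/clear_bit()/test_bit() on md->flags.
 */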
124
125/*
126 * Work processed by per-device workqueue.
127 */
128struct mapped_device {
129	struct rw_semaphore io_lock;
130	struct mutex suspend_lock;
131	rwlock_t map_lock;
132	atomic_t holders;
133	atomic_t open_count;
134
135	unsigned long flags;
136
137	struct request_queue *queue;
138	unsigned type;
139	/* Protect queue and type against concurrent access. */
140	struct mutex type_lock;
141
142	struct target_type *immutable_target_type;
143
144	struct gendisk *disk;
145	char name[16];
146
147	void *interface_ptr;
148
149	/*
150	 * A list of ios that arrived while we were suspended.
151	 */
152	atomic_t pending[2];
153	wait_queue_head_t wait;
154	struct work_struct work;
155	struct bio_list deferred;
156	spinlock_t deferred_lock;
157
158	/*
159	 * Processing queue (flush)
160	 */
161	struct workqueue_struct *wq;
162
163	/*
164	 * The current mapping.
165	 */
166	struct dm_table *map;
167
168	/*
169	 * io objects are allocated from here.
170	 */
171	mempool_t *io_pool;
172	mempool_t *tio_pool;
173
174	struct bio_set *bs;
175
176	/*
177	 * Event handling.
178	 */
179	atomic_t event_nr;
180	wait_queue_head_t eventq;
181	atomic_t uevent_seq;
182	struct list_head uevent_list;
183	spinlock_t uevent_lock; /* Protect access to uevent_list */
184
	/*
	 * freeze/thaw support requires holding onto a super block
	 */
188	struct super_block *frozen_sb;
189	struct block_device *bdev;
190
191	/* forced geometry settings */
192	struct hd_geometry geometry;
193
194	/* sysfs handle */
195	struct kobject kobj;
196
197	/* zero-length flush that will be cloned and submitted to targets */
198	struct bio flush_bio;
199};
200
/*
 * Mempools pre-allocated at table load time.
 */
204struct dm_md_mempools {
205	mempool_t *io_pool;
206	mempool_t *tio_pool;
207	struct bio_set *bs;
208};
209
210#define MIN_IOS 256
211static struct kmem_cache *_io_cache;
212static struct kmem_cache *_tio_cache;
213static struct kmem_cache *_rq_tio_cache;
214static struct kmem_cache *_rq_bio_info_cache;
215
216static int __init local_init(void)
217{
218	int r = -ENOMEM;
219
220	/* allocate a slab for the dm_ios */
221	_io_cache = KMEM_CACHE(dm_io, 0);
222	if (!_io_cache)
223		return r;
224
225	/* allocate a slab for the target ios */
226	_tio_cache = KMEM_CACHE(dm_target_io, 0);
227	if (!_tio_cache)
228		goto out_free_io_cache;
229
230	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
231	if (!_rq_tio_cache)
232		goto out_free_tio_cache;
233
234	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
235	if (!_rq_bio_info_cache)
236		goto out_free_rq_tio_cache;
237
238	r = dm_uevent_init();
239	if (r)
240		goto out_free_rq_bio_info_cache;
241
242	_major = major;
243	r = register_blkdev(_major, _name);
244	if (r < 0)
245		goto out_uevent_exit;
246
247	if (!_major)
248		_major = r;
249
250	return 0;
251
252out_uevent_exit:
253	dm_uevent_exit();
254out_free_rq_bio_info_cache:
255	kmem_cache_destroy(_rq_bio_info_cache);
256out_free_rq_tio_cache:
257	kmem_cache_destroy(_rq_tio_cache);
258out_free_tio_cache:
259	kmem_cache_destroy(_tio_cache);
260out_free_io_cache:
261	kmem_cache_destroy(_io_cache);
262
263	return r;
264}
265
266static void local_exit(void)
267{
268	kmem_cache_destroy(_rq_bio_info_cache);
269	kmem_cache_destroy(_rq_tio_cache);
270	kmem_cache_destroy(_tio_cache);
271	kmem_cache_destroy(_io_cache);
272	unregister_blkdev(_major, _name);
273	dm_uevent_exit();
274
275	_major = 0;
276
277	DMINFO("cleaned up");
278}
279
280static int (*_inits[])(void) __initdata = {
281	local_init,
282	dm_target_init,
283	dm_linear_init,
284	dm_stripe_init,
285	dm_io_init,
286	dm_kcopyd_init,
287	dm_interface_init,
288};
289
290static void (*_exits[])(void) = {
291	local_exit,
292	dm_target_exit,
293	dm_linear_exit,
294	dm_stripe_exit,
295	dm_io_exit,
296	dm_kcopyd_exit,
297	dm_interface_exit,
298};
299
300static int __init dm_init(void)
301{
302	const int count = ARRAY_SIZE(_inits);
303
304	int r, i;
305
306	for (i = 0; i < count; i++) {
307		r = _inits[i]();
308		if (r)
309			goto bad;
310	}
311
312	return 0;
313
bad:
315	while (i--)
316		_exits[i]();
317
318	return r;
319}
320
321static void __exit dm_exit(void)
322{
323	int i = ARRAY_SIZE(_exits);
324
325	while (i--)
326		_exits[i]();
327
328	/*
329	 * Should be empty by this point.
330	 */
331	idr_remove_all(&_minor_idr);
332	idr_destroy(&_minor_idr);
333}
334
335/*
336 * Block device functions
337 */
338int dm_deleting_md(struct mapped_device *md)
339{
340	return test_bit(DMF_DELETING, &md->flags);
341}
342
343static int dm_blk_open(struct block_device *bdev, fmode_t mode)
344{
345	struct mapped_device *md;
346
347	spin_lock(&_minor_lock);
348
349	md = bdev->bd_disk->private_data;
350	if (!md)
351		goto out;
352
353	if (test_bit(DMF_FREEING, &md->flags) ||
354	    dm_deleting_md(md)) {
355		md = NULL;
356		goto out;
357	}
358
359	dm_get(md);
360	atomic_inc(&md->open_count);
361
362out:
363	spin_unlock(&_minor_lock);
364
365	return md ? 0 : -ENXIO;
366}
367
368static int dm_blk_close(struct gendisk *disk, fmode_t mode)
369{
370	struct mapped_device *md = disk->private_data;
371
372	spin_lock(&_minor_lock);
373
374	atomic_dec(&md->open_count);
375	dm_put(md);
376
377	spin_unlock(&_minor_lock);
378
379	return 0;
380}
381
382int dm_open_count(struct mapped_device *md)
383{
384	return atomic_read(&md->open_count);
385}
386
387/*
388 * Guarantees nothing is using the device before it's deleted.
389 */
390int dm_lock_for_deletion(struct mapped_device *md)
391{
392	int r = 0;
393
394	spin_lock(&_minor_lock);
395
396	if (dm_open_count(md))
397		r = -EBUSY;
398	else
399		set_bit(DMF_DELETING, &md->flags);
400
401	spin_unlock(&_minor_lock);
402
403	return r;
404}
405
406static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
407{
408	struct mapped_device *md = bdev->bd_disk->private_data;
409
410	return dm_get_geometry(md, geo);
411}
412
413static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
414			unsigned int cmd, unsigned long arg)
415{
416	struct mapped_device *md = bdev->bd_disk->private_data;
417	struct dm_table *map = dm_get_live_table(md);
418	struct dm_target *tgt;
419	int r = -ENOTTY;
420
421	if (!map || !dm_table_get_size(map))
422		goto out;
423
424	/* We only support devices that have a single target */
425	if (dm_table_get_num_targets(map) != 1)
426		goto out;
427
428	tgt = dm_table_get_target(map, 0);
429
430	if (dm_suspended_md(md)) {
431		r = -EAGAIN;
432		goto out;
433	}
434
435	if (tgt->type->ioctl)
436		r = tgt->type->ioctl(tgt, cmd, arg);
437
438out:
439	dm_table_put(map);
440
441	return r;
442}
443
444static struct dm_io *alloc_io(struct mapped_device *md)
445{
446	return mempool_alloc(md->io_pool, GFP_NOIO);
447}
448
449static void free_io(struct mapped_device *md, struct dm_io *io)
450{
451	mempool_free(io, md->io_pool);
452}
453
454static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
455{
456	mempool_free(tio, md->tio_pool);
457}
458
459static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
460					    gfp_t gfp_mask)
461{
462	return mempool_alloc(md->tio_pool, gfp_mask);
463}
464
465static void free_rq_tio(struct dm_rq_target_io *tio)
466{
467	mempool_free(tio, tio->md->tio_pool);
468}
469
470static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
471{
472	return mempool_alloc(md->io_pool, GFP_ATOMIC);
473}
474
475static void free_bio_info(struct dm_rq_clone_bio_info *info)
476{
477	mempool_free(info, info->tio->md->io_pool);
478}
479
480static int md_in_flight(struct mapped_device *md)
481{
482	return atomic_read(&md->pending[READ]) +
483	       atomic_read(&md->pending[WRITE]);
484}
485
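/*
 * I/O accounting helpers: track per-direction in-flight counts in
 * md->pending and mirror them into the generic disk statistics.
 */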
486static void start_io_acct(struct dm_io *io)
487{
488	struct mapped_device *md = io->md;
489	int cpu;
490	int rw = bio_data_dir(io->bio);
491
492	io->start_time = jiffies;
493
494	cpu = part_stat_lock();
495	part_round_stats(cpu, &dm_disk(md)->part0);
496	part_stat_unlock();
497	atomic_set(&dm_disk(md)->part0.in_flight[rw],
498		atomic_inc_return(&md->pending[rw]));
499}
500
501static void end_io_acct(struct dm_io *io)
502{
503	struct mapped_device *md = io->md;
504	struct bio *bio = io->bio;
505	unsigned long duration = jiffies - io->start_time;
506	int pending, cpu;
507	int rw = bio_data_dir(bio);
508
509	cpu = part_stat_lock();
510	part_round_stats(cpu, &dm_disk(md)->part0);
511	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
512	part_stat_unlock();
513
514	/*
515	 * After this is decremented the bio must not be touched if it is
516	 * a flush.
517	 */
518	pending = atomic_dec_return(&md->pending[rw]);
519	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
520	pending += atomic_read(&md->pending[rw^0x1]);
521
522	/* nudge anyone waiting on suspend queue */
523	if (!pending)
524		wake_up(&md->wait);
525}
526
527/*
528 * Add the bio to the list of deferred io.
529 */
530static void queue_io(struct mapped_device *md, struct bio *bio)
531{
532	unsigned long flags;
533
534	spin_lock_irqsave(&md->deferred_lock, flags);
535	bio_list_add(&md->deferred, bio);
536	spin_unlock_irqrestore(&md->deferred_lock, flags);
537	queue_work(md->wq, &md->work);
538}
539
/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
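/*
 * A typical caller might look like this (illustrative sketch only):
 *
 *	struct dm_table *map = dm_get_live_table(md);
 *
 *	if (map) {
 *		... use the table ...
 *		dm_table_put(map);
 *	}
 */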
545struct dm_table *dm_get_live_table(struct mapped_device *md)
546{
547	struct dm_table *t;
548	unsigned long flags;
549
550	read_lock_irqsave(&md->map_lock, flags);
551	t = md->map;
552	if (t)
553		dm_table_get(t);
554	read_unlock_irqrestore(&md->map_lock, flags);
555
556	return t;
557}
558
559/*
560 * Get the geometry associated with a dm device
561 */
562int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
563{
564	*geo = md->geometry;
565
566	return 0;
567}
568
569/*
570 * Set the geometry of a device.
571 */
572int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
573{
574	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
575
576	if (geo->start > sz) {
577		DMWARN("Start sector is beyond the geometry limits.");
578		return -EINVAL;
579	}
580
581	md->geometry = *geo;
582
583	return 0;
584}
585
586/*-----------------------------------------------------------------
587 * CRUD START:
588 *   A more elegant soln is in the works that uses the queue
589 *   merge fn, unfortunately there are a couple of changes to
590 *   the block layer that I want to make for this.  So in the
591 *   interests of getting something for people to use I give
592 *   you this clearly demarcated crap.
593 *---------------------------------------------------------------*/
594
595static int __noflush_suspending(struct mapped_device *md)
596{
597	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
598}
599
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
604static void dec_pending(struct dm_io *io, int error)
605{
606	unsigned long flags;
607	int io_error;
608	struct bio *bio;
609	struct mapped_device *md = io->md;
610
611	/* Push-back supersedes any I/O errors */
612	if (unlikely(error)) {
613		spin_lock_irqsave(&io->endio_lock, flags);
614		if (!(io->error > 0 && __noflush_suspending(md)))
615			io->error = error;
616		spin_unlock_irqrestore(&io->endio_lock, flags);
617	}
618
619	if (atomic_dec_and_test(&io->io_count)) {
620		if (io->error == DM_ENDIO_REQUEUE) {
621			/*
622			 * Target requested pushing back the I/O.
623			 */
624			spin_lock_irqsave(&md->deferred_lock, flags);
625			if (__noflush_suspending(md))
626				bio_list_add_head(&md->deferred, io->bio);
627			else
628				/* noflush suspend was interrupted. */
629				io->error = -EIO;
630			spin_unlock_irqrestore(&md->deferred_lock, flags);
631		}
632
633		io_error = io->error;
634		bio = io->bio;
635		end_io_acct(io);
636		free_io(md, io);
637
638		if (io_error == DM_ENDIO_REQUEUE)
639			return;
640
641		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
642			/*
643			 * Preflush done for flush with data, reissue
644			 * without REQ_FLUSH.
645			 */
646			bio->bi_rw &= ~REQ_FLUSH;
647			queue_io(md, bio);
648		} else {
649			/* done with normal IO or empty flush */
650			trace_block_bio_complete(md->queue, bio, io_error);
651			bio_endio(bio, io_error);
652		}
653	}
654}
655
656static void clone_endio(struct bio *bio, int error)
657{
658	int r = 0;
659	struct dm_target_io *tio = bio->bi_private;
660	struct dm_io *io = tio->io;
661	struct mapped_device *md = tio->io->md;
662	dm_endio_fn endio = tio->ti->type->end_io;
663
664	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
665		error = -EIO;
666
667	if (endio) {
668		r = endio(tio->ti, bio, error, &tio->info);
669		if (r < 0 || r == DM_ENDIO_REQUEUE)
670			/*
671			 * error and requeue request are handled
672			 * in dec_pending().
673			 */
674			error = r;
675		else if (r == DM_ENDIO_INCOMPLETE)
676			/* The target will handle the io */
677			return;
678		else if (r) {
679			DMWARN("unimplemented target endio return value: %d", r);
680			BUG();
681		}
682	}
683
684	/*
	 * Store md for cleanup instead of tio, which is about to get freed.
686	 */
687	bio->bi_private = md->bs;
688
689	free_tio(md, tio);
690	bio_put(bio);
691	dec_pending(io, error);
692}
693
694/*
695 * Partial completion handling for request-based dm
696 */
697static void end_clone_bio(struct bio *clone, int error)
698{
699	struct dm_rq_clone_bio_info *info = clone->bi_private;
700	struct dm_rq_target_io *tio = info->tio;
701	struct bio *bio = info->orig;
702	unsigned int nr_bytes = info->orig->bi_size;
703
704	bio_put(clone);
705
706	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
712		return;
713	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver
		 * when the request is completed.
		 */
719		tio->error = error;
720		return;
721	}
722
	/*
	 * I/O for the bio completed successfully.
	 * Notify the upper layer of the completed data.
	 */
727
	/*
	 * bios are processed from the head of the list,
	 * so the completing bio should always be rq->bio.
	 * If it's not, something is wrong.
	 */
733	if (tio->orig->bio != bio)
734		DMERR("bio completion is going in the middle of the request");
735
736	/*
737	 * Update the original request.
738	 * Do not use blk_end_request() here, because it may complete
739	 * the original request before the clone, and break the ordering.
740	 */
741	blk_update_request(tio->orig, 0, nr_bytes);
742}
743
/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Callers that still need the md must take their own reference with
 * dm_get() before calling this function and drop it with dm_put() later.
 */
749static void rq_completed(struct mapped_device *md, int rw, int run_queue)
750{
751	atomic_dec(&md->pending[rw]);
752
753	/* nudge anyone waiting on suspend queue */
754	if (!md_in_flight(md))
755		wake_up(&md->wait);
756
757	if (run_queue)
758		blk_run_queue(md->queue);
759
760	/*
761	 * dm_put() must be at the end of this function. See the comment above
762	 */
763	dm_put(md);
764}
765
766static void free_rq_clone(struct request *clone)
767{
768	struct dm_rq_target_io *tio = clone->end_io_data;
769
770	blk_rq_unprep_clone(clone);
771	free_rq_tio(tio);
772}
773
774/*
775 * Complete the clone and the original request.
776 * Must be called without queue lock.
777 */
778static void dm_end_request(struct request *clone, int error)
779{
780	int rw = rq_data_dir(clone);
781	struct dm_rq_target_io *tio = clone->end_io_data;
782	struct mapped_device *md = tio->md;
783	struct request *rq = tio->orig;
784
785	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
786		rq->errors = clone->errors;
787		rq->resid_len = clone->resid_len;
788
789		if (rq->sense)
790			/*
791			 * We are using the sense buffer of the original
792			 * request.
793			 * So setting the length of the sense data is enough.
794			 */
795			rq->sense_len = clone->sense_len;
796	}
797
798	free_rq_clone(clone);
799	blk_end_request_all(rq, error);
800	rq_completed(md, rw, true);
801}
802
803static void dm_unprep_request(struct request *rq)
804{
805	struct request *clone = rq->special;
806
807	rq->special = NULL;
808	rq->cmd_flags &= ~REQ_DONTPREP;
809
810	free_rq_clone(clone);
811}
812
813/*
814 * Requeue the original request of a clone.
815 */
816void dm_requeue_unmapped_request(struct request *clone)
817{
818	int rw = rq_data_dir(clone);
819	struct dm_rq_target_io *tio = clone->end_io_data;
820	struct mapped_device *md = tio->md;
821	struct request *rq = tio->orig;
822	struct request_queue *q = rq->q;
823	unsigned long flags;
824
825	dm_unprep_request(rq);
826
827	spin_lock_irqsave(q->queue_lock, flags);
828	blk_requeue_request(q, rq);
829	spin_unlock_irqrestore(q->queue_lock, flags);
830
831	rq_completed(md, rw, 0);
832}
833EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
834
835static void __stop_queue(struct request_queue *q)
836{
837	blk_stop_queue(q);
838}
839
840static void stop_queue(struct request_queue *q)
841{
842	unsigned long flags;
843
844	spin_lock_irqsave(q->queue_lock, flags);
845	__stop_queue(q);
846	spin_unlock_irqrestore(q->queue_lock, flags);
847}
848
849static void __start_queue(struct request_queue *q)
850{
851	if (blk_queue_stopped(q))
852		blk_start_queue(q);
853}
854
855static void start_queue(struct request_queue *q)
856{
857	unsigned long flags;
858
859	spin_lock_irqsave(q->queue_lock, flags);
860	__start_queue(q);
861	spin_unlock_irqrestore(q->queue_lock, flags);
862}
863
864static void dm_done(struct request *clone, int error, bool mapped)
865{
866	int r = error;
867	struct dm_rq_target_io *tio = clone->end_io_data;
868	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
869
870	if (mapped && rq_end_io)
871		r = rq_end_io(tio->ti, clone, error, &tio->info);
872
873	if (r <= 0)
874		/* The target wants to complete the I/O */
875		dm_end_request(clone, r);
876	else if (r == DM_ENDIO_INCOMPLETE)
877		/* The target will handle the I/O */
878		return;
879	else if (r == DM_ENDIO_REQUEUE)
880		/* The target wants to requeue the I/O */
881		dm_requeue_unmapped_request(clone);
882	else {
883		DMWARN("unimplemented target endio return value: %d", r);
884		BUG();
885	}
886}
887
888/*
889 * Request completion handler for request-based dm
890 */
891static void dm_softirq_done(struct request *rq)
892{
893	bool mapped = true;
894	struct request *clone = rq->completion_data;
895	struct dm_rq_target_io *tio = clone->end_io_data;
896
897	if (rq->cmd_flags & REQ_FAILED)
898		mapped = false;
899
900	dm_done(clone, tio->error, mapped);
901}
902
903/*
904 * Complete the clone and the original request with the error status
905 * through softirq context.
906 */
907static void dm_complete_request(struct request *clone, int error)
908{
909	struct dm_rq_target_io *tio = clone->end_io_data;
910	struct request *rq = tio->orig;
911
912	tio->error = error;
913	rq->completion_data = clone;
914	blk_complete_request(rq);
915}
916
/*
 * Complete the clone that was never mapped, and the original request, with
 * the error status through softirq context.
 * The target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() function fails.
 */
923void dm_kill_unmapped_request(struct request *clone, int error)
924{
925	struct dm_rq_target_io *tio = clone->end_io_data;
926	struct request *rq = tio->orig;
927
928	rq->cmd_flags |= REQ_FAILED;
929	dm_complete_request(clone, error);
930}
931EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
932
933/*
934 * Called with the queue lock held
935 */
936static void end_clone_request(struct request *clone, int error)
937{
	/*
	 * Just clean up the bookkeeping of the queue in which the clone
	 * was dispatched.
	 * The clone is *NOT* actually freed here because it was allocated
	 * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
944	__blk_put_request(clone->q, clone);
945
	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock.  Otherwise, deadlock could occur because:
	 *     - another request may be submitted by the upper-level driver
	 *       of the stack during this completion
	 *     - that submission, which requires the queue lock, may be made
	 *       against this very queue
	 */
954	dm_complete_request(clone, error);
955}
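
/*
 * Summary of the request-based completion path above: clone bios finish
 * in end_clone_bio(), which advances the original request; the clone
 * request itself finishes in end_clone_request(), which defers to
 * dm_complete_request().  The real completion then runs from softirq
 * context via dm_softirq_done() -> dm_done(), which ends, requeues or
 * hands the original request back to the target as requested.
 */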
956
957/*
958 * Return maximum size of I/O possible at the supplied sector up to the current
959 * target boundary.
960 */
961static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
962{
963	sector_t target_offset = dm_target_offset(ti, sector);
964
965	return ti->len - target_offset;
966}
967
968static sector_t max_io_len(sector_t sector, struct dm_target *ti)
969{
970	sector_t len = max_io_len_target_boundary(sector, ti);
971
	/*
	 * Does the target need to split even further?
	 */
975	if (ti->split_io) {
976		sector_t boundary;
977		sector_t offset = dm_target_offset(ti, sector);
978		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
979			   - offset;
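		/*
		 * For example, with split_io = 8 and offset = 5:
		 * boundary = ((5 + 8) & ~7) - 5 = 8 - 5 = 3, i.e. only 3
		 * sectors remain before the next split_io-aligned boundary.
		 * (The mask arithmetic assumes split_io is a power of two.)
		 */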
980		if (len > boundary)
981			len = boundary;
982	}
983
984	return len;
985}
986
987static void __map_bio(struct dm_target *ti, struct bio *clone,
988		      struct dm_target_io *tio)
989{
990	int r;
991	sector_t sector;
992	struct mapped_device *md;
993
994	clone->bi_end_io = clone_endio;
995	clone->bi_private = tio;
996
	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything; the target has assumed ownership of
	 * this io.
	 */
1002	atomic_inc(&tio->io->io_count);
1003	sector = clone->bi_sector;
1004	r = ti->type->map(ti, clone, &tio->info);
1005	if (r == DM_MAPIO_REMAPPED) {
1006		/* the bio has been remapped so dispatch it */
1007
1008		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1009				      tio->io->bio->bi_bdev->bd_dev, sector);
1010
1011		generic_make_request(clone);
1012	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1013		/* error the io and bail out, or requeue it if needed */
1014		md = tio->io->md;
1015		dec_pending(tio->io, r);
1016		/*
1017		 * Store bio_set for cleanup.
1018		 */
1019		clone->bi_private = md->bs;
1020		bio_put(clone);
1021		free_tio(md, tio);
1022	} else if (r) {
1023		DMWARN("unimplemented target map return value: %d", r);
1024		BUG();
1025	}
1026}
1027
1028struct clone_info {
1029	struct mapped_device *md;
1030	struct dm_table *map;
1031	struct bio *bio;
1032	struct dm_io *io;
1033	sector_t sector;
1034	sector_t sector_count;
1035	unsigned short idx;
1036};
1037
1038static void dm_bio_destructor(struct bio *bio)
1039{
1040	struct bio_set *bs = bio->bi_private;
1041
1042	bio_free(bio, bs);
1043}
1044
1045/*
1046 * Creates a little bio that just does part of a bvec.
1047 */
1048static struct bio *split_bvec(struct bio *bio, sector_t sector,
1049			      unsigned short idx, unsigned int offset,
1050			      unsigned int len, struct bio_set *bs)
1051{
1052	struct bio *clone;
1053	struct bio_vec *bv = bio->bi_io_vec + idx;
1054
1055	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1056	clone->bi_destructor = dm_bio_destructor;
1057	*clone->bi_io_vec = *bv;
1058
1059	clone->bi_sector = sector;
1060	clone->bi_bdev = bio->bi_bdev;
1061	clone->bi_rw = bio->bi_rw;
1062	clone->bi_vcnt = 1;
1063	clone->bi_size = to_bytes(len);
1064	clone->bi_io_vec->bv_offset = offset;
1065	clone->bi_io_vec->bv_len = clone->bi_size;
1066	clone->bi_flags |= 1 << BIO_CLONED;
1067
1068	if (bio_integrity(bio)) {
1069		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1070		bio_integrity_trim(clone,
1071				   bio_sector_offset(bio, idx, offset), len);
1072	}
1073
1074	return clone;
1075}
1076
/*
 * Creates a bio that consists of a range of complete bvecs.
 */
1080static struct bio *clone_bio(struct bio *bio, sector_t sector,
1081			     unsigned short idx, unsigned short bv_count,
1082			     unsigned int len, struct bio_set *bs)
1083{
1084	struct bio *clone;
1085
1086	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1087	__bio_clone(clone, bio);
1088	clone->bi_destructor = dm_bio_destructor;
1089	clone->bi_sector = sector;
1090	clone->bi_idx = idx;
1091	clone->bi_vcnt = idx + bv_count;
1092	clone->bi_size = to_bytes(len);
1093	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1094
1095	if (bio_integrity(bio)) {
1096		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1097
1098		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1099			bio_integrity_trim(clone,
1100					   bio_sector_offset(bio, idx, 0), len);
1101	}
1102
1103	return clone;
1104}
1105
1106static struct dm_target_io *alloc_tio(struct clone_info *ci,
1107				      struct dm_target *ti)
1108{
1109	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1110
1111	tio->io = ci->io;
1112	tio->ti = ti;
1113	memset(&tio->info, 0, sizeof(tio->info));
1114
1115	return tio;
1116}
1117
1118static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1119				   unsigned request_nr, sector_t len)
1120{
1121	struct dm_target_io *tio = alloc_tio(ci, ti);
1122	struct bio *clone;
1123
1124	tio->info.target_request_nr = request_nr;
1125
1126	/*
1127	 * Discard requests require the bio's inline iovecs be initialized.
1128	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1129	 * and discard, so no need for concern about wasted bvec allocations.
1130	 */
1131	clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1132	__bio_clone(clone, ci->bio);
1133	clone->bi_destructor = dm_bio_destructor;
1134	if (len) {
1135		clone->bi_sector = ci->sector;
1136		clone->bi_size = to_bytes(len);
1137	}
1138
1139	__map_bio(ti, clone, tio);
1140}
1141
1142static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1143				    unsigned num_requests, sector_t len)
1144{
1145	unsigned request_nr;
1146
1147	for (request_nr = 0; request_nr < num_requests; request_nr++)
1148		__issue_target_request(ci, ti, request_nr, len);
1149}
1150
1151static int __clone_and_map_empty_flush(struct clone_info *ci)
1152{
1153	unsigned target_nr = 0;
1154	struct dm_target *ti;
1155
1156	BUG_ON(bio_has_data(ci->bio));
1157	while ((ti = dm_table_get_target(ci->map, target_nr++)))
1158		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1159
1160	return 0;
1161}
1162
1163/*
1164 * Perform all io with a single clone.
1165 */
1166static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1167{
1168	struct bio *clone, *bio = ci->bio;
1169	struct dm_target_io *tio;
1170
1171	tio = alloc_tio(ci, ti);
1172	clone = clone_bio(bio, ci->sector, ci->idx,
1173			  bio->bi_vcnt - ci->idx, ci->sector_count,
1174			  ci->md->bs);
1175	__map_bio(ti, clone, tio);
1176	ci->sector_count = 0;
1177}
1178
1179static int __clone_and_map_discard(struct clone_info *ci)
1180{
1181	struct dm_target *ti;
1182	sector_t len;
1183
1184	do {
1185		ti = dm_table_find_target(ci->map, ci->sector);
1186		if (!dm_target_is_valid(ti))
1187			return -EIO;
1188
1189		/*
1190		 * Even though the device advertised discard support,
1191		 * that does not mean every target supports it, and
1192		 * reconfiguration might also have changed that since the
1193		 * check was performed.
1194		 */
1195		if (!ti->num_discard_requests)
1196			return -EOPNOTSUPP;
1197
1198		len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1199
1200		__issue_target_requests(ci, ti, ti->num_discard_requests, len);
1201
1202		ci->sector += len;
1203	} while (ci->sector_count -= len);
1204
1205	return 0;
1206}
1207
1208static int __clone_and_map(struct clone_info *ci)
1209{
1210	struct bio *clone, *bio = ci->bio;
1211	struct dm_target *ti;
1212	sector_t len = 0, max;
1213	struct dm_target_io *tio;
1214
1215	if (unlikely(bio->bi_rw & REQ_DISCARD))
1216		return __clone_and_map_discard(ci);
1217
1218	ti = dm_table_find_target(ci->map, ci->sector);
1219	if (!dm_target_is_valid(ti))
1220		return -EIO;
1221
1222	max = max_io_len(ci->sector, ti);
1223
1224	if (ci->sector_count <= max) {
1225		/*
1226		 * Optimise for the simple case where we can do all of
1227		 * the remaining io with a single clone.
1228		 */
1229		__clone_and_map_simple(ci, ti);
1230
1231	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1232		/*
1233		 * There are some bvecs that don't span targets.
1234		 * Do as many of these as possible.
1235		 */
1236		int i;
1237		sector_t remaining = max;
1238		sector_t bv_len;
1239
1240		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1241			bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1242
1243			if (bv_len > remaining)
1244				break;
1245
1246			remaining -= bv_len;
1247			len += bv_len;
1248		}
1249
1250		tio = alloc_tio(ci, ti);
1251		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1252				  ci->md->bs);
1253		__map_bio(ti, clone, tio);
1254
1255		ci->sector += len;
1256		ci->sector_count -= len;
1257		ci->idx = i;
1258
1259	} else {
1260		/*
1261		 * Handle a bvec that must be split between two or more targets.
1262		 */
1263		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1264		sector_t remaining = to_sector(bv->bv_len);
1265		unsigned int offset = 0;
1266
1267		do {
1268			if (offset) {
1269				ti = dm_table_find_target(ci->map, ci->sector);
1270				if (!dm_target_is_valid(ti))
1271					return -EIO;
1272
1273				max = max_io_len(ci->sector, ti);
1274			}
1275
1276			len = min(remaining, max);
1277
1278			tio = alloc_tio(ci, ti);
1279			clone = split_bvec(bio, ci->sector, ci->idx,
1280					   bv->bv_offset + offset, len,
1281					   ci->md->bs);
1282
1283			__map_bio(ti, clone, tio);
1284
1285			ci->sector += len;
1286			ci->sector_count -= len;
1287			offset += to_bytes(len);
1288		} while (remaining -= len);
1289
1290		ci->idx++;
1291	}
1292
1293	return 0;
1294}
1295
1296/*
 * Split the bio into several clones and submit them to the targets.
1298 */
1299static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1300{
1301	struct clone_info ci;
1302	int error = 0;
1303
1304	ci.map = dm_get_live_table(md);
1305	if (unlikely(!ci.map)) {
1306		bio_io_error(bio);
1307		return;
1308	}
1309
1310	ci.md = md;
1311	ci.io = alloc_io(md);
1312	ci.io->error = 0;
1313	atomic_set(&ci.io->io_count, 1);
1314	ci.io->bio = bio;
1315	ci.io->md = md;
1316	spin_lock_init(&ci.io->endio_lock);
1317	ci.sector = bio->bi_sector;
1318	ci.idx = bio->bi_idx;
1319
1320	start_io_acct(ci.io);
1321	if (bio->bi_rw & REQ_FLUSH) {
1322		ci.bio = &ci.md->flush_bio;
1323		ci.sector_count = 0;
1324		error = __clone_and_map_empty_flush(&ci);
1325		/* dec_pending submits any data associated with flush */
1326	} else {
1327		ci.bio = bio;
1328		ci.sector_count = bio_sectors(bio);
1329		while (ci.sector_count && !error)
1330			error = __clone_and_map(&ci);
1331	}
1332
1333	/* drop the extra reference count */
1334	dec_pending(ci.io, error);
1335	dm_table_put(ci.map);
1336}
1337/*-----------------------------------------------------------------
1338 * CRUD END
1339 *---------------------------------------------------------------*/
1340
1341static int dm_merge_bvec(struct request_queue *q,
1342			 struct bvec_merge_data *bvm,
1343			 struct bio_vec *biovec)
1344{
1345	struct mapped_device *md = q->queuedata;
1346	struct dm_table *map = dm_get_live_table(md);
1347	struct dm_target *ti;
1348	sector_t max_sectors;
1349	int max_size = 0;
1350
1351	if (unlikely(!map))
1352		goto out;
1353
1354	ti = dm_table_find_target(map, bvm->bi_sector);
1355	if (!dm_target_is_valid(ti))
1356		goto out_table;
1357
1358	/*
1359	 * Find maximum amount of I/O that won't need splitting
1360	 */
1361	max_sectors = min(max_io_len(bvm->bi_sector, ti),
1362			  (sector_t) BIO_MAX_SECTORS);
1363	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1364	if (max_size < 0)
1365		max_size = 0;
1366
	/*
	 * merge_bvec_fn() returns the number of bytes
	 * it can accept at this offset;
	 * max_size is the precomputed maximal io size.
	 */
1372	if (max_size && ti->type->merge)
1373		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support the merge method and some of the
	 * devices provided their merge_bvec method (we know this by looking
	 * at queue_max_hw_sectors), then we can't allow bios with multiple
	 * vector entries.  So always set max_size to 0, and the code below
	 * allows just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;
1384
1385out_table:
1386	dm_table_put(map);
1387
1388out:
1389	/*
1390	 * Always allow an entire first page
1391	 */
1392	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1393		max_size = biovec->bv_len;
1394
1395	return max_size;
1396}
1397
1398/*
1399 * The request function that just remaps the bio built up by
1400 * dm_merge_bvec.
1401 */
1402static void _dm_request(struct request_queue *q, struct bio *bio)
1403{
1404	int rw = bio_data_dir(bio);
1405	struct mapped_device *md = q->queuedata;
1406	int cpu;
1407
1408	down_read(&md->io_lock);
1409
1410	cpu = part_stat_lock();
1411	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1412	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1413	part_stat_unlock();
1414
1415	/* if we're suspended, we have to queue this io for later */
1416	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1417		up_read(&md->io_lock);
1418
1419		if (bio_rw(bio) != READA)
1420			queue_io(md, bio);
1421		else
1422			bio_io_error(bio);
1423		return;
1424	}
1425
1426	__split_and_process_bio(md, bio);
1427	up_read(&md->io_lock);
1428	return;
1429}
1430
1431static int dm_request_based(struct mapped_device *md)
1432{
1433	return blk_queue_stackable(md->queue);
1434}
1435
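/*
 * Generic entry point for all bios submitted to a dm device: for
 * request-based dm the bio is handed to the block layer's own queueing
 * via blk_queue_bio(), while bio-based dm remaps and resubmits it in
 * _dm_request() above.
 */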
1436static void dm_request(struct request_queue *q, struct bio *bio)
1437{
1438	struct mapped_device *md = q->queuedata;
1439
1440	if (dm_request_based(md))
1441		blk_queue_bio(q, bio);
1442	else
1443		_dm_request(q, bio);
1444}
1445
1446void dm_dispatch_request(struct request *rq)
1447{
1448	int r;
1449
1450	if (blk_queue_io_stat(rq->q))
1451		rq->cmd_flags |= REQ_IO_STAT;
1452
1453	rq->start_time = jiffies;
1454	r = blk_insert_cloned_request(rq->q, rq);
1455	if (r)
1456		dm_complete_request(rq, r);
1457}
1458EXPORT_SYMBOL_GPL(dm_dispatch_request);
1459
1460static void dm_rq_bio_destructor(struct bio *bio)
1461{
1462	struct dm_rq_clone_bio_info *info = bio->bi_private;
1463	struct mapped_device *md = info->tio->md;
1464
1465	free_bio_info(info);
1466	bio_free(bio, md->bs);
1467}
1468
1469static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1470				 void *data)
1471{
1472	struct dm_rq_target_io *tio = data;
1473	struct mapped_device *md = tio->md;
1474	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1475
1476	if (!info)
1477		return -ENOMEM;
1478
1479	info->orig = bio_orig;
1480	info->tio = tio;
1481	bio->bi_end_io = end_clone_bio;
1482	bio->bi_private = info;
1483	bio->bi_destructor = dm_rq_bio_destructor;
1484
1485	return 0;
1486}
1487
1488static int setup_clone(struct request *clone, struct request *rq,
1489		       struct dm_rq_target_io *tio)
1490{
1491	int r;
1492
1493	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1494			      dm_rq_bio_constructor, tio);
1495	if (r)
1496		return r;
1497
1498	clone->cmd = rq->cmd;
1499	clone->cmd_len = rq->cmd_len;
1500	clone->sense = rq->sense;
1501	clone->buffer = rq->buffer;
1502	clone->end_io = end_clone_request;
1503	clone->end_io_data = tio;
1504
1505	return 0;
1506}
1507
1508static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1509				gfp_t gfp_mask)
1510{
1511	struct request *clone;
1512	struct dm_rq_target_io *tio;
1513
1514	tio = alloc_rq_tio(md, gfp_mask);
1515	if (!tio)
1516		return NULL;
1517
1518	tio->md = md;
1519	tio->ti = NULL;
1520	tio->orig = rq;
1521	tio->error = 0;
1522	memset(&tio->info, 0, sizeof(tio->info));
1523
1524	clone = &tio->clone;
1525	if (setup_clone(clone, rq, tio)) {
1526		/* -ENOMEM */
1527		free_rq_tio(tio);
1528		return NULL;
1529	}
1530
1531	return clone;
1532}
1533
1534/*
1535 * Called with the queue lock held.
1536 */
1537static int dm_prep_fn(struct request_queue *q, struct request *rq)
1538{
1539	struct mapped_device *md = q->queuedata;
1540	struct request *clone;
1541
1542	if (unlikely(rq->special)) {
1543		DMWARN("Already has something in rq->special.");
1544		return BLKPREP_KILL;
1545	}
1546
1547	clone = clone_rq(rq, md, GFP_ATOMIC);
1548	if (!clone)
1549		return BLKPREP_DEFER;
1550
1551	rq->special = clone;
1552	rq->cmd_flags |= REQ_DONTPREP;
1553
1554	return BLKPREP_OK;
1555}
1556
1557/*
1558 * Returns:
1559 * 0  : the request has been processed (not requeued)
1560 * !0 : the request has been requeued
1561 */
1562static int map_request(struct dm_target *ti, struct request *clone,
1563		       struct mapped_device *md)
1564{
1565	int r, requeued = 0;
1566	struct dm_rq_target_io *tio = clone->end_io_data;
1567
	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count taken by the device opener,
	 * because the device may be closed during request completion
	 * when all bios are completed.
	 * See the comment in rq_completed() too.
	 */
1575	dm_get(md);
1576
1577	tio->ti = ti;
1578	r = ti->type->map_rq(ti, clone, &tio->info);
1579	switch (r) {
1580	case DM_MAPIO_SUBMITTED:
1581		/* The target has taken the I/O to submit by itself later */
1582		break;
1583	case DM_MAPIO_REMAPPED:
1584		/* The target has remapped the I/O so dispatch it */
1585		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1586				     blk_rq_pos(tio->orig));
1587		dm_dispatch_request(clone);
1588		break;
1589	case DM_MAPIO_REQUEUE:
1590		/* The target wants to requeue the I/O */
1591		dm_requeue_unmapped_request(clone);
1592		requeued = 1;
1593		break;
1594	default:
1595		if (r > 0) {
1596			DMWARN("unimplemented target map return value: %d", r);
1597			BUG();
1598		}
1599
1600		/* The target wants to complete the I/O */
1601		dm_kill_unmapped_request(clone, r);
1602		break;
1603	}
1604
1605	return requeued;
1606}
1607
1608/*
1609 * q->request_fn for request-based dm.
1610 * Called with the queue lock held.
1611 */
1612static void dm_request_fn(struct request_queue *q)
1613{
1614	struct mapped_device *md = q->queuedata;
1615	struct dm_table *map = dm_get_live_table(md);
1616	struct dm_target *ti;
1617	struct request *rq, *clone;
1618	sector_t pos;
1619
	/*
	 * For suspend, check blk_queue_stopped() and increment
	 * ->pending within a single queue_lock critical section, so that
	 * the number of in-flight I/Os is not incremented after the queue
	 * has been stopped in dm_suspend().
	 */
1626	while (!blk_queue_stopped(q)) {
1627		rq = blk_peek_request(q);
1628		if (!rq)
1629			goto delay_and_out;
1630
1631		/* always use block 0 to find the target for flushes for now */
1632		pos = 0;
1633		if (!(rq->cmd_flags & REQ_FLUSH))
1634			pos = blk_rq_pos(rq);
1635
1636		ti = dm_table_find_target(map, pos);
1637		BUG_ON(!dm_target_is_valid(ti));
1638
1639		if (ti->type->busy && ti->type->busy(ti))
1640			goto delay_and_out;
1641
1642		blk_start_request(rq);
1643		clone = rq->special;
1644		atomic_inc(&md->pending[rq_data_dir(clone)]);
1645
1646		spin_unlock(q->queue_lock);
1647		if (map_request(ti, clone, md))
1648			goto requeued;
1649
1650		BUG_ON(!irqs_disabled());
1651		spin_lock(q->queue_lock);
1652	}
1653
1654	goto out;
1655
1656requeued:
1657	BUG_ON(!irqs_disabled());
1658	spin_lock(q->queue_lock);
1659
1660delay_and_out:
1661	blk_delay_queue(q, HZ / 10);
1662out:
1663	dm_table_put(map);
1664
1665	return;
1666}
1667
1668int dm_underlying_device_busy(struct request_queue *q)
1669{
1670	return blk_lld_busy(q);
1671}
1672EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1673
1674static int dm_lld_busy(struct request_queue *q)
1675{
1676	int r;
1677	struct mapped_device *md = q->queuedata;
1678	struct dm_table *map = dm_get_live_table(md);
1679
1680	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1681		r = 1;
1682	else
1683		r = dm_table_any_busy_target(map);
1684
1685	dm_table_put(map);
1686
1687	return r;
1688}
1689
1690static int dm_any_congested(void *congested_data, int bdi_bits)
1691{
1692	int r = bdi_bits;
1693	struct mapped_device *md = congested_data;
1694	struct dm_table *map;
1695
1696	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1697		map = dm_get_live_table(md);
1698		if (map) {
			/*
			 * Request-based dm cares only about its own queue
			 * when queried for the congestion status of the
			 * request_queue.
			 */
1703			if (dm_request_based(md))
1704				r = md->queue->backing_dev_info.state &
1705				    bdi_bits;
1706			else
1707				r = dm_table_any_congested(map, bdi_bits);
1708
1709			dm_table_put(map);
1710		}
1711	}
1712
1713	return r;
1714}
1715
1716/*-----------------------------------------------------------------
1717 * An IDR is used to keep track of allocated minor numbers.
1718 *---------------------------------------------------------------*/
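/*
 * Allocation below uses the two-step idr_pre_get()/idr_get_new*() pattern:
 * memory is preallocated before taking _minor_lock and the insertion
 * itself is done under the lock.
 */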
1719static void free_minor(int minor)
1720{
1721	spin_lock(&_minor_lock);
1722	idr_remove(&_minor_idr, minor);
1723	spin_unlock(&_minor_lock);
1724}
1725
1726/*
1727 * See if the device with a specific minor # is free.
1728 */
1729static int specific_minor(int minor)
1730{
1731	int r, m;
1732
1733	if (minor >= (1 << MINORBITS))
1734		return -EINVAL;
1735
1736	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1737	if (!r)
1738		return -ENOMEM;
1739
1740	spin_lock(&_minor_lock);
1741
1742	if (idr_find(&_minor_idr, minor)) {
1743		r = -EBUSY;
1744		goto out;
1745	}
1746
1747	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1748	if (r)
1749		goto out;
1750
1751	if (m != minor) {
1752		idr_remove(&_minor_idr, m);
1753		r = -EBUSY;
1754		goto out;
1755	}
1756
1757out:
1758	spin_unlock(&_minor_lock);
1759	return r;
1760}
1761
1762static int next_free_minor(int *minor)
1763{
1764	int r, m;
1765
1766	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1767	if (!r)
1768		return -ENOMEM;
1769
1770	spin_lock(&_minor_lock);
1771
1772	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1773	if (r)
1774		goto out;
1775
1776	if (m >= (1 << MINORBITS)) {
1777		idr_remove(&_minor_idr, m);
1778		r = -ENOSPC;
1779		goto out;
1780	}
1781
1782	*minor = m;
1783
1784out:
1785	spin_unlock(&_minor_lock);
1786	return r;
1787}
1788
1789static const struct block_device_operations dm_blk_dops;
1790
1791static void dm_wq_work(struct work_struct *work);
1792
1793static void dm_init_md_queue(struct mapped_device *md)
1794{
1795	/*
1796	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices.  The type of this dm device has not been decided yet;
	 * it is decided when the first table is loaded.
1799	 * To prevent problematic device stacking, clear the queue flag
1800	 * for request stacking support until then.
1801	 *
1802	 * This queue is new, so no concurrency on the queue_flags.
1803	 */
1804	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1805
1806	md->queue->queuedata = md;
1807	md->queue->backing_dev_info.congested_fn = dm_any_congested;
1808	md->queue->backing_dev_info.congested_data = md;
1809	blk_queue_make_request(md->queue, dm_request);
1810	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1811	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1812}
1813
1814/*
1815 * Allocate and initialise a blank device with a given minor.
1816 */
1817static struct mapped_device *alloc_dev(int minor)
1818{
1819	int r;
1820	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1821	void *old_md;
1822
1823	if (!md) {
1824		DMWARN("unable to allocate device, out of memory.");
1825		return NULL;
1826	}
1827
1828	if (!try_module_get(THIS_MODULE))
1829		goto bad_module_get;
1830
1831	/* get a minor number for the dev */
1832	if (minor == DM_ANY_MINOR)
1833		r = next_free_minor(&minor);
1834	else
1835		r = specific_minor(minor);
1836	if (r < 0)
1837		goto bad_minor;
1838
1839	md->type = DM_TYPE_NONE;
1840	init_rwsem(&md->io_lock);
1841	mutex_init(&md->suspend_lock);
1842	mutex_init(&md->type_lock);
1843	spin_lock_init(&md->deferred_lock);
1844	rwlock_init(&md->map_lock);
1845	atomic_set(&md->holders, 1);
1846	atomic_set(&md->open_count, 0);
1847	atomic_set(&md->event_nr, 0);
1848	atomic_set(&md->uevent_seq, 0);
1849	INIT_LIST_HEAD(&md->uevent_list);
1850	spin_lock_init(&md->uevent_lock);
1851
1852	md->queue = blk_alloc_queue(GFP_KERNEL);
1853	if (!md->queue)
1854		goto bad_queue;
1855
1856	dm_init_md_queue(md);
1857
1858	md->disk = alloc_disk(1);
1859	if (!md->disk)
1860		goto bad_disk;
1861
1862	atomic_set(&md->pending[0], 0);
1863	atomic_set(&md->pending[1], 0);
1864	init_waitqueue_head(&md->wait);
1865	INIT_WORK(&md->work, dm_wq_work);
1866	init_waitqueue_head(&md->eventq);
1867
1868	md->disk->major = _major;
1869	md->disk->first_minor = minor;
1870	md->disk->fops = &dm_blk_dops;
1871	md->disk->queue = md->queue;
1872	md->disk->private_data = md;
1873	sprintf(md->disk->disk_name, "dm-%d", minor);
1874	add_disk(md->disk);
1875	format_dev_t(md->name, MKDEV(_major, minor));
1876
1877	md->wq = alloc_workqueue("kdmflush",
1878				 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1879	if (!md->wq)
1880		goto bad_thread;
1881
1882	md->bdev = bdget_disk(md->disk, 0);
1883	if (!md->bdev)
1884		goto bad_bdev;
1885
1886	bio_init(&md->flush_bio);
1887	md->flush_bio.bi_bdev = md->bdev;
1888	md->flush_bio.bi_rw = WRITE_FLUSH;
1889
1890	/* Populate the mapping, nobody knows we exist yet */
1891	spin_lock(&_minor_lock);
1892	old_md = idr_replace(&_minor_idr, md, minor);
1893	spin_unlock(&_minor_lock);
1894
1895	BUG_ON(old_md != MINOR_ALLOCED);
1896
1897	return md;
1898
1899bad_bdev:
1900	destroy_workqueue(md->wq);
1901bad_thread:
1902	del_gendisk(md->disk);
1903	put_disk(md->disk);
1904bad_disk:
1905	blk_cleanup_queue(md->queue);
1906bad_queue:
1907	free_minor(minor);
1908bad_minor:
1909	module_put(THIS_MODULE);
1910bad_module_get:
1911	kfree(md);
1912	return NULL;
1913}
1914
1915static void unlock_fs(struct mapped_device *md);
1916
1917static void free_dev(struct mapped_device *md)
1918{
1919	int minor = MINOR(disk_devt(md->disk));
1920
1921	unlock_fs(md);
1922	bdput(md->bdev);
1923	destroy_workqueue(md->wq);
1924	if (md->tio_pool)
1925		mempool_destroy(md->tio_pool);
1926	if (md->io_pool)
1927		mempool_destroy(md->io_pool);
1928	if (md->bs)
1929		bioset_free(md->bs);
1930	blk_integrity_unregister(md->disk);
1931	del_gendisk(md->disk);
1932	free_minor(minor);
1933
1934	spin_lock(&_minor_lock);
1935	md->disk->private_data = NULL;
1936	spin_unlock(&_minor_lock);
1937
1938	put_disk(md->disk);
1939	blk_cleanup_queue(md->queue);
1940	module_put(THIS_MODULE);
1941	kfree(md);
1942}
1943
1944static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1945{
1946	struct dm_md_mempools *p;
1947
1948	if (md->io_pool && md->tio_pool && md->bs)
1949		/* the md already has necessary mempools */
1950		goto out;
1951
1952	p = dm_table_get_md_mempools(t);
1953	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1954
1955	md->io_pool = p->io_pool;
1956	p->io_pool = NULL;
1957	md->tio_pool = p->tio_pool;
1958	p->tio_pool = NULL;
1959	md->bs = p->bs;
1960	p->bs = NULL;
1961
1962out:
	/* mempool bind completed; the table no longer needs its mempools */
1964	dm_table_free_md_mempools(t);
1965}
1966
1967/*
1968 * Bind a table to the device.
1969 */
1970static void event_callback(void *context)
1971{
1972	unsigned long flags;
1973	LIST_HEAD(uevents);
1974	struct mapped_device *md = (struct mapped_device *) context;
1975
1976	spin_lock_irqsave(&md->uevent_lock, flags);
1977	list_splice_init(&md->uevent_list, &uevents);
1978	spin_unlock_irqrestore(&md->uevent_lock, flags);
1979
1980	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1981
1982	atomic_inc(&md->event_nr);
1983	wake_up(&md->eventq);
1984}
1985
1986/*
1987 * Protected by md->suspend_lock obtained by dm_swap_table().
1988 */
1989static void __set_size(struct mapped_device *md, sector_t size)
1990{
1991	set_capacity(md->disk, size);
1992
1993	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1994}
1995
1996/*
 * Return 1 if the queue has a compulsory merge_bvec_fn.
1998 *
1999 * If this function returns 0, then the device is either a non-dm
2000 * device without a merge_bvec_fn, or it is a dm device that is
2001 * able to split any bios it receives that are too big.
2002 */
2003int dm_queue_merge_is_compulsory(struct request_queue *q)
2004{
2005	struct mapped_device *dev_md;
2006
2007	if (!q->merge_bvec_fn)
2008		return 0;
2009
2010	if (q->make_request_fn == dm_request) {
2011		dev_md = q->queuedata;
2012		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2013			return 0;
2014	}
2015
2016	return 1;
2017}
2018
2019static int dm_device_merge_is_compulsory(struct dm_target *ti,
2020					 struct dm_dev *dev, sector_t start,
2021					 sector_t len, void *data)
2022{
2023	struct block_device *bdev = dev->bdev;
2024	struct request_queue *q = bdev_get_queue(bdev);
2025
2026	return dm_queue_merge_is_compulsory(q);
2027}
2028
2029/*
2030 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2031 * on the properties of the underlying devices.
2032 */
2033static int dm_table_merge_is_optional(struct dm_table *table)
2034{
2035	unsigned i = 0;
2036	struct dm_target *ti;
2037
2038	while (i < dm_table_get_num_targets(table)) {
2039		ti = dm_table_get_target(table, i++);
2040
2041		if (ti->type->iterate_devices &&
2042		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2043			return 0;
2044	}
2045
2046	return 1;
2047}
2048
2049/*
2050 * Returns old map, which caller must destroy.
2051 */
2052static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2053			       struct queue_limits *limits)
2054{
2055	struct dm_table *old_map;
2056	struct request_queue *q = md->queue;
2057	sector_t size;
2058	unsigned long flags;
2059	int merge_is_optional;
2060
2061	size = dm_table_get_size(t);
2062
2063	/*
2064	 * Wipe any geometry if the size of the table changed.
2065	 */
2066	if (size != get_capacity(md->disk))
2067		memset(&md->geometry, 0, sizeof(md->geometry));
2068
2069	__set_size(md, size);
2070
2071	dm_table_event_callback(t, event_callback, md);
2072
	/*
	 * The queue hasn't been stopped yet if the old table type wasn't
	 * request-based during suspension.  So stop it now to prevent
	 * I/O from being mapped before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may start running as soon as they are set.
	 */
2080	if (dm_table_request_based(t) && !blk_queue_stopped(q))
2081		stop_queue(q);
2082
2083	__bind_mempools(md, t);
2084
2085	merge_is_optional = dm_table_merge_is_optional(t);
2086
2087	write_lock_irqsave(&md->map_lock, flags);
2088	old_map = md->map;
2089	md->map = t;
2090	md->immutable_target_type = dm_table_get_immutable_target_type(t);
2091
2092	dm_table_set_restrictions(t, q, limits);
2093	if (merge_is_optional)
2094		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2095	else
2096		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2097	write_unlock_irqrestore(&md->map_lock, flags);
2098
2099	return old_map;
2100}
2101
2102/*
2103 * Returns unbound table for the caller to free.
2104 */
2105static struct dm_table *__unbind(struct mapped_device *md)
2106{
2107	struct dm_table *map = md->map;
2108	unsigned long flags;
2109
2110	if (!map)
2111		return NULL;
2112
2113	dm_table_event_callback(map, NULL, NULL);
2114	write_lock_irqsave(&md->map_lock, flags);
2115	md->map = NULL;
2116	write_unlock_irqrestore(&md->map_lock, flags);
2117
2118	return map;
2119}
2120
2121/*
2122 * Constructor for a new device.
2123 */
2124int dm_create(int minor, struct mapped_device **result)
2125{
2126	struct mapped_device *md;
2127
2128	md = alloc_dev(minor);
2129	if (!md)
2130		return -ENXIO;
2131
2132	dm_sysfs_init(md);
2133
2134	*result = md;
2135	return 0;
2136}
2137
2138/*
2139 * Functions to manage md->type.
2140 * All are required to hold md->type_lock.
2141 */
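/*
 * A caller might, for example (illustrative sketch; new_type is a
 * placeholder):
 *
 *	dm_lock_md_type(md);
 *	if (dm_get_md_type(md) == DM_TYPE_NONE)
 *		dm_set_md_type(md, new_type);
 *	dm_unlock_md_type(md);
 */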
2142void dm_lock_md_type(struct mapped_device *md)
2143{
2144	mutex_lock(&md->type_lock);
2145}
2146
2147void dm_unlock_md_type(struct mapped_device *md)
2148{
2149	mutex_unlock(&md->type_lock);
2150}
2151
2152void dm_set_md_type(struct mapped_device *md, unsigned type)
2153{
2154	md->type = type;
2155}
2156
2157unsigned dm_get_md_type(struct mapped_device *md)
2158{
2159	return md->type;
2160}
2161
2162struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2163{
2164	return md->immutable_target_type;
2165}
2166
2167/*
2168 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2169 */
2170static int dm_init_request_based_queue(struct mapped_device *md)
2171{
2172	struct request_queue *q = NULL;
2173
2174	if (md->queue->elevator)
2175		return 1;
2176
2177	/* Fully initialize the queue */
2178	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2179	if (!q)
2180		return 0;
2181
2182	md->queue = q;
2183	dm_init_md_queue(md);
2184	blk_queue_softirq_done(md->queue, dm_softirq_done);
2185	blk_queue_prep_rq(md->queue, dm_prep_fn);
2186	blk_queue_lld_busy(md->queue, dm_lld_busy);
2187
2188	elv_register_queue(md->queue);
2189
2190	return 1;
2191}
2192
/*
 * Set up the DM device's queue based on md's type.
 */
2196int dm_setup_md_queue(struct mapped_device *md)
2197{
2198	if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2199	    !dm_init_request_based_queue(md)) {
2200		DMWARN("Cannot initialize queue for request-based mapped device");
2201		return -EINVAL;
2202	}
2203
2204	return 0;
2205}
2206
2207static struct mapped_device *dm_find_md(dev_t dev)
2208{
2209	struct mapped_device *md;
2210	unsigned minor = MINOR(dev);
2211
2212	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2213		return NULL;
2214
2215	spin_lock(&_minor_lock);
2216
2217	md = idr_find(&_minor_idr, minor);
2218	if (md && (md == MINOR_ALLOCED ||
2219		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
2220		   dm_deleting_md(md) ||
2221		   test_bit(DMF_FREEING, &md->flags))) {
2222		md = NULL;
2223		goto out;
2224	}
2225
2226out:
2227	spin_unlock(&_minor_lock);
2228
2229	return md;
2230}
2231
2232struct mapped_device *dm_get_md(dev_t dev)
2233{
2234	struct mapped_device *md = dm_find_md(dev);
2235
2236	if (md)
2237		dm_get(md);
2238
2239	return md;
2240}
2241EXPORT_SYMBOL_GPL(dm_get_md);
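
/*
 * Illustrative sketch only: dm_get_md() returns the mapped_device with
 * its reference count already raised, so each successful lookup must
 * be balanced by dm_put().  'dev' is a caller-supplied dev_t.
 *
 *	struct mapped_device *md = dm_get_md(dev);
 *
 *	if (!md)
 *		return -ENXIO;
 *	... use md ...
 *	dm_put(md);
 */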
2242
2243void *dm_get_mdptr(struct mapped_device *md)
2244{
2245	return md->interface_ptr;
2246}
2247
2248void dm_set_mdptr(struct mapped_device *md, void *ptr)
2249{
2250	md->interface_ptr = ptr;
2251}
2252
2253void dm_get(struct mapped_device *md)
2254{
2255	atomic_inc(&md->holders);
2256	BUG_ON(test_bit(DMF_FREEING, &md->flags));
2257}
2258
2259const char *dm_device_name(struct mapped_device *md)
2260{
2261	return md->name;
2262}
2263EXPORT_SYMBOL_GPL(dm_device_name);
2264
2265static void __dm_destroy(struct mapped_device *md, bool wait)
2266{
2267	struct dm_table *map;
2268
2269	might_sleep();
2270
2271	spin_lock(&_minor_lock);
2272	map = dm_get_live_table(md);
2273	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2274	set_bit(DMF_FREEING, &md->flags);
2275	spin_unlock(&_minor_lock);
2276
2277	if (!dm_suspended_md(md)) {
2278		dm_table_presuspend_targets(map);
2279		dm_table_postsuspend_targets(map);
2280	}
2281
2282	/*
2283	 * There may still be references around, for example from I/O
2284	 * requests that have not yet completed.  Wait for all of them to
2285	 * disappear.  No one may increment the reference count of the
2286	 * mapped_device once its state becomes DMF_FREEING.
2287	 */
2288	if (wait)
2289		while (atomic_read(&md->holders))
2290			msleep(1);
2291	else if (atomic_read(&md->holders))
2292		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2293		       dm_device_name(md), atomic_read(&md->holders));
2294
2295	dm_sysfs_exit(md);
2296	dm_table_put(map);
2297	dm_table_destroy(__unbind(md));
2298	free_dev(md);
2299}
2300
2301void dm_destroy(struct mapped_device *md)
2302{
2303	__dm_destroy(md, true);
2304}
2305
2306void dm_destroy_immediate(struct mapped_device *md)
2307{
2308	__dm_destroy(md, false);
2309}
2310
2311void dm_put(struct mapped_device *md)
2312{
2313	atomic_dec(&md->holders);
2314}
2315EXPORT_SYMBOL_GPL(dm_put);
2316
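/*
 * Wait until no I/O is in flight on this device.  'interruptible' is
 * the task state to wait in (TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE);
 * returns -EINTR if an interruptible wait is broken by a signal,
 * 0 once the device is idle.
 */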
2317static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2318{
2319	int r = 0;
2320	DECLARE_WAITQUEUE(wait, current);
2321
2322	add_wait_queue(&md->wait, &wait);
2323
2324	while (1) {
2325		set_current_state(interruptible);
2326
2327		if (!md_in_flight(md))
2328			break;
2329
2330		if (interruptible == TASK_INTERRUPTIBLE &&
2331		    signal_pending(current)) {
2332			r = -EINTR;
2333			break;
2334		}
2335
2336		io_schedule();
2337	}
2338	set_current_state(TASK_RUNNING);
2339
2340	remove_wait_queue(&md->wait, &wait);
2341
2342	return r;
2343}
2344
2345/*
2346 * Process the deferred bios
2347 */
2348static void dm_wq_work(struct work_struct *work)
2349{
2350	struct mapped_device *md = container_of(work, struct mapped_device,
2351						work);
2352	struct bio *c;
2353
2354	down_read(&md->io_lock);
2355
2356	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2357		spin_lock_irq(&md->deferred_lock);
2358		c = bio_list_pop(&md->deferred);
2359		spin_unlock_irq(&md->deferred_lock);
2360
2361		if (!c)
2362			break;
2363
2364		up_read(&md->io_lock);
2365
2366		if (dm_request_based(md))
2367			generic_make_request(c);
2368		else
2369			__split_and_process_bio(md, c);
2370
2371		down_read(&md->io_lock);
2372	}
2373
2374	up_read(&md->io_lock);
2375}
2376
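/*
 * Clear the suspend block flag and kick the per-device workqueue so
 * that dm_wq_work() drains the bios deferred while I/O was blocked.
 */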
2377static void dm_queue_flush(struct mapped_device *md)
2378{
2379	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2380	smp_mb__after_clear_bit();
2381	queue_work(md->wq, &md->work);
2382}
2383
2384/*
2385 * Swap in a new table, returning the old one for the caller to destroy.
2386 */
2387struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2388{
2389	struct dm_table *map = ERR_PTR(-EINVAL);
2390	struct queue_limits limits;
2391	int r;
2392
2393	mutex_lock(&md->suspend_lock);
2394
2395	/* device must be suspended */
2396	if (!dm_suspended_md(md))
2397		goto out;
2398
2399	r = dm_calculate_queue_limits(table, &limits);
2400	if (r) {
2401		map = ERR_PTR(r);
2402		goto out;
2403	}
2404
2405	map = __bind(md, table, &limits);
2406
2407out:
2408	mutex_unlock(&md->suspend_lock);
2409	return map;
2410}
2411
2412/*
2413 * Functions to lock and unlock any filesystem running on the
2414 * device.
2415 */
2416static int lock_fs(struct mapped_device *md)
2417{
2418	int r;
2419
2420	WARN_ON(md->frozen_sb);
2421
2422	md->frozen_sb = freeze_bdev(md->bdev);
2423	if (IS_ERR(md->frozen_sb)) {
2424		r = PTR_ERR(md->frozen_sb);
2425		md->frozen_sb = NULL;
2426		return r;
2427	}
2428
2429	set_bit(DMF_FROZEN, &md->flags);
2430
2431	return 0;
2432}
2433
2434static void unlock_fs(struct mapped_device *md)
2435{
2436	if (!test_bit(DMF_FROZEN, &md->flags))
2437		return;
2438
2439	thaw_bdev(md->bdev, md->frozen_sb);
2440	md->frozen_sb = NULL;
2441	clear_bit(DMF_FROZEN, &md->flags);
2442}
2443
2444/*
2445 * We need to be able to change a mapping table under a mounted
2446 * filesystem.  For example, we might want to move some data in
2447 * the background.  Before the table can be swapped with
2448 * dm_swap_table(), dm_suspend() must be called to flush any in-flight
2449 * bios and ensure that any further I/O gets deferred.
2450 */
2451/*
2452 * Suspend mechanism in request-based dm.
2453 *
2454 * 1. Flush all I/Os by lock_fs() if needed.
2455 * 2. Stop dispatching any I/O by stopping the request_queue.
2456 * 3. Wait for all in-flight I/Os to be completed or requeued.
2457 *
2458 * To abort suspend, start the request_queue.
2459 */
2460int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2461{
2462	struct dm_table *map = NULL;
2463	int r = 0;
2464	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2465	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2466
2467	mutex_lock(&md->suspend_lock);
2468
2469	if (dm_suspended_md(md)) {
2470		r = -EINVAL;
2471		goto out_unlock;
2472	}
2473
2474	map = dm_get_live_table(md);
2475
2476	/*
2477	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2478	 * This flag is cleared before dm_suspend returns.
2479	 */
2480	if (noflush)
2481		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2482
2483	/* This does not get reverted if there's an error later. */
2484	dm_table_presuspend_targets(map);
2485
2486	/*
2487	 * Flush I/O to the device.
2488	 * Any I/O submitted after lock_fs() may not be flushed.
2489	 * noflush takes precedence over do_lockfs.
2490	 * (lock_fs() flushes I/Os and waits for them to complete.)
2491	 */
2492	if (!noflush && do_lockfs) {
2493		r = lock_fs(md);
2494		if (r)
2495			goto out;
2496	}
2497
2498	/*
2499	 * Here we must make sure that no processes are submitting requests
2500	 * to target drivers, i.e. that no one is executing
2501	 * __split_and_process_bio, which is called from dm_request and
2502	 * dm_wq_work.
2503	 *
2504	 * To get all processes out of __split_and_process_bio in dm_request,
2505	 * we take the write lock.  To prevent any process from reentering
2506	 * __split_and_process_bio from dm_request, and to quiesce the thread
2507	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2508	 * flush_workqueue(md->wq).
2509	 */
2510	down_write(&md->io_lock);
2511	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2512	up_write(&md->io_lock);
2513
2514	/*
2515	 * Stop md->queue before flushing md->wq in case request-based
2516	 * dm defers requests to md->wq from md->queue.
2517	 */
2518	if (dm_request_based(md))
2519		stop_queue(md->queue);
2520
2521	flush_workqueue(md->wq);
2522
2523	/*
2524	 * At this point no more requests are entering target request routines.
2525	 * We call dm_wait_for_completion to wait for all existing requests
2526	 * to finish.
2527	 */
2528	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2529
2530	down_write(&md->io_lock);
2531	if (noflush)
2532		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2533	up_write(&md->io_lock);
2534
2535	/* were we interrupted? */
2536	if (r < 0) {
2537		dm_queue_flush(md);
2538
2539		if (dm_request_based(md))
2540			start_queue(md->queue);
2541
2542		unlock_fs(md);
2543		goto out; /* pushback list is already flushed, so skip flush */
2544	}
2545
2546	/*
2547	 * If dm_wait_for_completion returned 0, the device is completely
2548	 * quiescent now. There is no request-processing activity. All new
2549	 * requests are being added to the md->deferred list.
2550	 */
2551
2552	set_bit(DMF_SUSPENDED, &md->flags);
2553
2554	dm_table_postsuspend_targets(map);
2555
2556out:
2557	dm_table_put(map);
2558
2559out_unlock:
2560	mutex_unlock(&md->suspend_lock);
2561	return r;
2562}
2563
2564int dm_resume(struct mapped_device *md)
2565{
2566	int r = -EINVAL;
2567	struct dm_table *map = NULL;
2568
2569	mutex_lock(&md->suspend_lock);
2570	if (!dm_suspended_md(md))
2571		goto out;
2572
2573	map = dm_get_live_table(md);
2574	if (!map || !dm_table_get_size(map))
2575		goto out;
2576
2577	r = dm_table_resume_targets(map);
2578	if (r)
2579		goto out;
2580
2581	dm_queue_flush(md);
2582
2583	/*
2584	 * Flushing deferred I/Os must be done after targets are resumed
2585	 * so that the targets can map them correctly.
2586	 * Request-based dm queues the deferred I/Os in its request_queue.
2587	 */
2588	if (dm_request_based(md))
2589		start_queue(md->queue);
2590
2591	unlock_fs(md);
2592
2593	clear_bit(DMF_SUSPENDED, &md->flags);
2594
2595	r = 0;
2596out:
2597	dm_table_put(map);
2598	mutex_unlock(&md->suspend_lock);
2599
2600	return r;
2601}
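
/*
 * Illustrative sketch only: the classic table-replacement sequence a
 * caller such as dm-ioctl builds from the primitives above.  Error
 * handling is trimmed; 'new_table' is a table the caller constructed.
 *
 *	struct dm_table *old_map;
 *	int r;
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	if (r)
 *		return r;
 *
 *	old_map = dm_swap_table(md, new_table);	... needs a suspended md ...
 *	if (IS_ERR(old_map))
 *		return PTR_ERR(old_map);
 *	if (old_map)
 *		dm_table_destroy(old_map);	... caller frees the old table ...
 *
 *	return dm_resume(md);
 */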
2602
2603/*-----------------------------------------------------------------
2604 * Event notification.
2605 *---------------------------------------------------------------*/
2606int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2607		       unsigned cookie)
2608{
2609	char udev_cookie[DM_COOKIE_LENGTH];
2610	char *envp[] = { udev_cookie, NULL };
2611
2612	if (!cookie)
2613		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2614	else {
2615		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2616			 DM_COOKIE_ENV_VAR_NAME, cookie);
2617		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2618					  action, envp);
2619	}
2620}
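
/*
 * Illustrative sketch only: emitting a CHANGE uevent with a udev
 * cookie.  A non-zero cookie reaches udev as DM_COOKIE=<value> in the
 * event environment; a zero cookie sends a plain uevent instead.
 *
 *	dm_kobject_uevent(md, KOBJ_CHANGE, cookie);
 */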
2621
2622uint32_t dm_next_uevent_seq(struct mapped_device *md)
2623{
2624	return atomic_add_return(1, &md->uevent_seq);
2625}
2626
2627uint32_t dm_get_event_nr(struct mapped_device *md)
2628{
2629	return atomic_read(&md->event_nr);
2630}
2631
2632int dm_wait_event(struct mapped_device *md, int event_nr)
2633{
2634	return wait_event_interruptible(md->eventq,
2635			(event_nr != atomic_read(&md->event_nr)));
2636}
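
/*
 * Illustrative sketch only: the usual event polling pattern.  Sample
 * the counter with dm_get_event_nr(), hand it to userspace, then block
 * in dm_wait_event() until the counter differs from the sampled value.
 *
 *	uint32_t event_nr = dm_get_event_nr(md);
 *	int r;
 *
 *	... report event_nr to the caller ...
 *
 *	r = dm_wait_event(md, event_nr);
 *	if (r)
 *		return r;	... interrupted by a signal ...
 */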
2637
2638void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2639{
2640	unsigned long flags;
2641
2642	spin_lock_irqsave(&md->uevent_lock, flags);
2643	list_add(elist, &md->uevent_list);
2644	spin_unlock_irqrestore(&md->uevent_lock, flags);
2645}
2646
2647/*
2648 * The gendisk is only valid as long as you hold a reference
2649 * on 'md'.
2650 */
2651struct gendisk *dm_disk(struct mapped_device *md)
2652{
2653	return md->disk;
2654}
2655
2656struct kobject *dm_kobject(struct mapped_device *md)
2657{
2658	return &md->kobj;
2659}
2660
2661/*
2662 * struct mapped_device should not be exported outside of dm.c,
2663 * so use this check to verify that kobj is part of the md structure.
2664 */
2665struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2666{
2667	struct mapped_device *md;
2668
2669	md = container_of(kobj, struct mapped_device, kobj);
2670	if (&md->kobj != kobj)
2671		return NULL;
2672
2673	if (test_bit(DMF_FREEING, &md->flags) ||
2674	    dm_deleting_md(md))
2675		return NULL;
2676
2677	dm_get(md);
2678	return md;
2679}
2680
2681int dm_suspended_md(struct mapped_device *md)
2682{
2683	return test_bit(DMF_SUSPENDED, &md->flags);
2684}
2685
2686int dm_suspended(struct dm_target *ti)
2687{
2688	return dm_suspended_md(dm_table_get_md(ti->table));
2689}
2690EXPORT_SYMBOL_GPL(dm_suspended);
2691
2692int dm_noflush_suspending(struct dm_target *ti)
2693{
2694	return __noflush_suspending(dm_table_get_md(ti->table));
2695}
2696EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2697
2698struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2699{
2700	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2701	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
2702
2703	if (!pools)
2704		return NULL;
2705
2706	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2707			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2708			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2709	if (!pools->io_pool)
2710		goto free_pools_and_out;
2711
2712	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2713			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2714			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2715	if (!pools->tio_pool)
2716		goto free_io_pool_and_out;
2717
2718	pools->bs = bioset_create(pool_size, 0);
2719	if (!pools->bs)
2720		goto free_tio_pool_and_out;
2721
2722	if (integrity && bioset_integrity_create(pools->bs, pool_size))
2723		goto free_bioset_and_out;
2724
2725	return pools;
2726
2727free_bioset_and_out:
2728	bioset_free(pools->bs);
2729
2730free_tio_pool_and_out:
2731	mempool_destroy(pools->tio_pool);
2732
2733free_io_pool_and_out:
2734	mempool_destroy(pools->io_pool);
2735
2736free_pools_and_out:
2737	kfree(pools);
2738
2739	return NULL;
2740}
2741
2742void dm_free_md_mempools(struct dm_md_mempools *pools)
2743{
2744	if (!pools)
2745		return;
2746
2747	if (pools->io_pool)
2748		mempool_destroy(pools->io_pool);
2749
2750	if (pools->tio_pool)
2751		mempool_destroy(pools->tio_pool);
2752
2753	if (pools->bs)
2754		bioset_free(pools->bs);
2755
2756	kfree(pools);
2757}
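
/*
 * Illustrative sketch only: mempools are sized per table type (here
 * bio-based, without integrity support) and handed over when a table
 * is bound; pools that never get attached are released with
 * dm_free_md_mempools().
 *
 *	struct dm_md_mempools *pools;
 *
 *	pools = dm_alloc_md_mempools(DM_TYPE_BIO_BASED, 0);
 *	if (!pools)
 *		return -ENOMEM;
 *	... attach to the mapped_device (see __bind_mempools()) ...
 *	dm_free_md_mempools(pools);	... only if never attached ...
 */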
2758
2759static const struct block_device_operations dm_blk_dops = {
2760	.open = dm_blk_open,
2761	.release = dm_blk_close,
2762	.ioctl = dm_blk_ioctl,
2763	.getgeo = dm_blk_getgeo,
2764	.owner = THIS_MODULE
2765};
2766
2767EXPORT_SYMBOL(dm_get_mapinfo);
2768
2769/*
2770 * module hooks
2771 */
2772module_init(dm_init);
2773module_exit(dm_exit);
2774
2775module_param(major, uint, 0);
2776MODULE_PARM_DESC(major, "The major number of the device mapper");
2777MODULE_DESCRIPTION(DM_NAME " driver");
2778MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2779MODULE_LICENSE("GPL");
2780