dm-snap.c revision 985903bb3a6d98623360ab6c855417f638840029
/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include <linux/blkdev.h>
#include <linux/device-mapper.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
#include <linux/workqueue.h>

#include "dm-exception-store.h"

#define DM_MSG_PREFIX "snapshots"

/*
 * The percentage increment we will wake up users at
 */
#define WAKE_UP_PERCENT 5

/*
 * kcopyd priority of snapshot operations
 */
#define SNAPSHOT_COPY_PRIORITY 2

/*
 * Reserve 1MB for each snapshot initially (with minimum of 1 page).
 */
#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
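
/*
 * Illustrative arithmetic: with 4 KiB pages (PAGE_SHIFT == 12),
 * SNAPSHOT_PAGES above works out to 256 pages; the "?: 1" fallback
 * only matters if PAGE_SIZE were ever larger than 1 MiB.
 */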

/*
 * The size of the mempool used to track chunks in use.
 */
#define MIN_IOS 256

#define DM_TRACKED_CHUNK_HASH_SIZE	16
#define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
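
/*
 * The tracked-chunk hash above keys in-flight reads against the origin
 * on the low bits of the chunk number, so that pending_complete() can
 * wait (via __chunk_is_tracked()) for conflicting reads to drain before
 * an exception is made visible.
 */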

struct dm_exception_table {
	uint32_t hash_mask;
	unsigned hash_shift;
	struct list_head *table;
};

struct dm_snapshot {
	struct rw_semaphore lock;

	struct dm_dev *origin;

	/* List of snapshots per Origin */
	struct list_head list;

	/* You can't use a snapshot if this is 0 (e.g. if full) */
	int valid;

	/* Origin writes don't trigger exceptions until this is set */
	int active;

	mempool_t *pending_pool;

	atomic_t pending_exceptions_count;

	struct dm_exception_table pending;
	struct dm_exception_table complete;

	/*
	 * pe_lock protects all pending_exception operations and access
	 * as well as the snapshot_bios list.
	 */
	spinlock_t pe_lock;

	/* The on disk metadata handler */
	struct dm_exception_store *store;

	struct dm_kcopyd_client *kcopyd_client;

	/* Queue of snapshot writes for ksnapd to flush */
	struct bio_list queued_bios;
	struct work_struct queued_bios_work;

	/* Chunks with outstanding reads */
	mempool_t *tracked_chunk_pool;
	spinlock_t tracked_chunk_lock;
	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
};

static struct workqueue_struct *ksnapd;
static void flush_queued_bios(struct work_struct *work);

static sector_t chunk_to_sector(struct dm_exception_store *store,
				chunk_t chunk)
{
	return chunk << store->chunk_shift;
}

static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
{
	/*
	 * There is only ever one instance of a particular block
	 * device so we can compare pointers safely.
	 */
	return lhs == rhs;
}

struct dm_snap_pending_exception {
	struct dm_exception e;

	/*
	 * Origin buffers waiting for this to complete are held
	 * in a bio list
	 */
	struct bio_list origin_bios;
	struct bio_list snapshot_bios;

	/*
	 * Short-term queue of pending exceptions prior to submission.
	 */
	struct list_head list;

	/*
	 * The primary pending_exception is the one that holds
	 * the ref_count and the list of origin_bios for a
	 * group of pending_exceptions.  It is always last to get freed.
	 * These fields get set up when writing to the origin.
	 */
	struct dm_snap_pending_exception *primary_pe;

	/*
	 * Number of pending_exceptions processing this chunk.
	 * When this drops to zero we must complete the origin bios.
	 * If incrementing or decrementing this, hold pe->snap->lock for
	 * the sibling concerned and not pe->primary_pe->snap->lock unless
	 * they are the same.
	 */
	atomic_t ref_count;

	/* Pointer back to snapshot context */
	struct dm_snapshot *snap;

	/*
	 * 1 indicates the exception has already been sent to
	 * kcopyd.
	 */
	int started;
};

/*
 * Hash table mapping origin volumes to lists of snapshots and
 * a lock to protect it
 */
static struct kmem_cache *exception_cache;
static struct kmem_cache *pending_cache;

struct dm_snap_tracked_chunk {
	struct hlist_node node;
	chunk_t chunk;
};

static struct kmem_cache *tracked_chunk_cache;

static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
						 chunk_t chunk)
{
	struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
							GFP_NOIO);
	unsigned long flags;

	c->chunk = chunk;

	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
	hlist_add_head(&c->node,
		       &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);

	return c;
}

static void stop_tracking_chunk(struct dm_snapshot *s,
				struct dm_snap_tracked_chunk *c)
{
	unsigned long flags;

	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
	hlist_del(&c->node);
	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);

	mempool_free(c, s->tracked_chunk_pool);
}

static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
{
	struct dm_snap_tracked_chunk *c;
	struct hlist_node *hn;
	int found = 0;

	spin_lock_irq(&s->tracked_chunk_lock);

	hlist_for_each_entry(c, hn,
	    &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
		if (c->chunk == chunk) {
			found = 1;
			break;
		}
	}

	spin_unlock_irq(&s->tracked_chunk_lock);

	return found;
}

/*
 * One of these per registered origin, held in the snapshot_origins hash
 */
struct origin {
	/* The origin device */
	struct block_device *bdev;

	struct list_head hash_list;

	/* List of snapshots for this origin */
	struct list_head snapshots;
};

/*
 * Size of the hash table for origin volumes. If we make this
 * the size of the minors list then it should be nearly perfect
 */
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK      0xFF
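/*
 * origin_hash() below uses the low eight bits of the origin's dev_t,
 * so ORIGIN_MASK must stay equal to ORIGIN_HASH_SIZE - 1.
 */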
static struct list_head *_origins;
static struct rw_semaphore _origins_lock;

static int init_origin_hash(void)
{
	int i;

	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
			   GFP_KERNEL);
	if (!_origins) {
		DMERR("unable to allocate memory");
		return -ENOMEM;
	}

	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
		INIT_LIST_HEAD(_origins + i);
	init_rwsem(&_origins_lock);

	return 0;
}

static void exit_origin_hash(void)
{
	kfree(_origins);
}

static unsigned origin_hash(struct block_device *bdev)
{
	return bdev->bd_dev & ORIGIN_MASK;
}

static struct origin *__lookup_origin(struct block_device *origin)
{
	struct list_head *ol;
	struct origin *o;

	ol = &_origins[origin_hash(origin)];
	list_for_each_entry (o, ol, hash_list)
		if (bdev_equal(o->bdev, origin))
			return o;

	return NULL;
}

static void __insert_origin(struct origin *o)
{
	struct list_head *sl = &_origins[origin_hash(o->bdev)];
	list_add_tail(&o->hash_list, sl);
}

/*
 * Make a note of the snapshot and its origin so we can look it
 * up when the origin has a write on it.
 */
static int register_snapshot(struct dm_snapshot *snap)
{
	struct dm_snapshot *l;
	struct origin *o, *new_o;
	struct block_device *bdev = snap->origin->bdev;

	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
	if (!new_o)
		return -ENOMEM;

	down_write(&_origins_lock);
	o = __lookup_origin(bdev);

	if (o)
		kfree(new_o);
	else {
		/* New origin */
		o = new_o;

		/* Initialise the struct */
		INIT_LIST_HEAD(&o->snapshots);
		o->bdev = bdev;

		__insert_origin(o);
	}

	/* Sort the list according to chunk size, largest first and smallest last */
	list_for_each_entry(l, &o->snapshots, list)
		if (l->store->chunk_size < snap->store->chunk_size)
			break;
	list_add_tail(&snap->list, &l->list);

	up_write(&_origins_lock);
	return 0;
}

static void unregister_snapshot(struct dm_snapshot *s)
{
	struct origin *o;

	down_write(&_origins_lock);
	o = __lookup_origin(s->origin->bdev);

	list_del(&s->list);
	if (list_empty(&o->snapshots)) {
		list_del(&o->hash_list);
		kfree(o);
	}

	up_write(&_origins_lock);
}

/*
 * Implementation of the exception hash tables.
 * The lowest hash_shift bits of the chunk number are ignored, allowing
 * some consecutive chunks to be grouped together.
 */
static int dm_exception_table_init(struct dm_exception_table *et,
				   uint32_t size, unsigned hash_shift)
{
	unsigned int i;

	et->hash_shift = hash_shift;
	et->hash_mask = size - 1;
	et->table = dm_vcalloc(size, sizeof(struct list_head));
	if (!et->table)
		return -ENOMEM;

	for (i = 0; i < size; i++)
		INIT_LIST_HEAD(et->table + i);

	return 0;
}

static void dm_exception_table_exit(struct dm_exception_table *et,
				    struct kmem_cache *mem)
{
	struct list_head *slot;
	struct dm_exception *ex, *next;
	int i, size;

	size = et->hash_mask + 1;
	for (i = 0; i < size; i++) {
		slot = et->table + i;

		list_for_each_entry_safe (ex, next, slot, hash_list)
			kmem_cache_free(mem, ex);
	}

	vfree(et->table);
}

static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
{
	return (chunk >> et->hash_shift) & et->hash_mask;
}
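
/*
 * Illustrative example of the bucketing done by exception_hash() above:
 * with hash_shift == DM_CHUNK_CONSECUTIVE_BITS, chunks
 * 0 .. (1 << hash_shift) - 1 all land in bucket 0, the next group in
 * bucket 1, and so on, so runs of consecutive chunks share a bucket and
 * can be merged by dm_insert_exception().
 */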

static void dm_remove_exception(struct dm_exception *e)
{
	list_del(&e->hash_list);
}

/*
 * Return the exception data for a sector, or NULL if not
 * remapped.
 */
static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
						chunk_t chunk)
{
	struct list_head *slot;
	struct dm_exception *e;

	slot = &et->table[exception_hash(et, chunk)];
	list_for_each_entry (e, slot, hash_list)
		if (chunk >= e->old_chunk &&
		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
			return e;

	return NULL;
}

static struct dm_exception *alloc_completed_exception(void)
{
	struct dm_exception *e;

	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
	if (!e)
		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

	return e;
}

static void free_completed_exception(struct dm_exception *e)
{
	kmem_cache_free(exception_cache, e);
}

static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
{
	struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
							     GFP_NOIO);

	atomic_inc(&s->pending_exceptions_count);
	pe->snap = s;

	return pe;
}

static void free_pending_exception(struct dm_snap_pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;

	mempool_free(pe, s->pending_pool);
	smp_mb__before_atomic_dec();
	atomic_dec(&s->pending_exceptions_count);
}

static void dm_insert_exception(struct dm_exception_table *eh,
				struct dm_exception *new_e)
{
	struct list_head *l;
	struct dm_exception *e = NULL;

	l = &eh->table[exception_hash(eh, new_e->old_chunk)];

	/* Add immediately if this table doesn't support consecutive chunks */
	if (!eh->hash_shift)
		goto out;

	/* List is ordered by old_chunk */
	list_for_each_entry_reverse(e, l, hash_list) {
		/* Insert after an existing chunk? */
		if (new_e->old_chunk == (e->old_chunk +
					 dm_consecutive_chunk_count(e) + 1) &&
		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
					 dm_consecutive_chunk_count(e) + 1)) {
			dm_consecutive_chunk_count_inc(e);
			free_completed_exception(new_e);
			return;
		}

		/* Insert before an existing chunk? */
		if (new_e->old_chunk == (e->old_chunk - 1) &&
		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
			dm_consecutive_chunk_count_inc(e);
			e->old_chunk--;
			e->new_chunk--;
			free_completed_exception(new_e);
			return;
		}

		if (new_e->old_chunk > e->old_chunk)
			break;
	}

out:
	list_add(&new_e->hash_list, e ? &e->hash_list : l);
}
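
/*
 * Example of the merging done by dm_insert_exception() above
 * (illustrative chunk numbers): if the table already holds the mapping
 * old_chunk 10 -> new_chunk 20 with a consecutive count of 0, inserting
 * 11 -> 21 simply bumps that count instead of adding a new entry, and
 * inserting 9 -> 19 extends the run downwards.
 */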

/*
 * Callback used by the exception stores to load exceptions when
 * initialising.
 */
static int dm_add_exception(void *context, chunk_t old, chunk_t new)
{
	struct dm_snapshot *s = context;
	struct dm_exception *e;

	e = alloc_completed_exception();
	if (!e)
		return -ENOMEM;

	e->old_chunk = old;

	/* Consecutive_count is implicitly initialised to zero */
	e->new_chunk = new;

	dm_insert_exception(&s->complete, e);

	return 0;
}

#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

/*
 * Return the minimum chunk size of all snapshots that have the
 * specified origin.  Return zero if the origin has no snapshots.
 */
static sector_t __minimum_chunk_size(struct origin *o)
{
	struct dm_snapshot *snap;
	unsigned chunk_size = 0;

	if (o)
		list_for_each_entry(snap, &o->snapshots, list)
			chunk_size = min_not_zero(chunk_size,
						  snap->store->chunk_size);

	return chunk_size;
}

/*
 * Hard coded magic.
 */
static int calc_max_buckets(void)
{
	/* use a fixed size of 2MB */
	unsigned long mem = 2 * 1024 * 1024;
	mem /= sizeof(struct list_head);

	return mem;
}
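
/*
 * Roughly, calc_max_buckets() above allows 2 MiB / sizeof(struct
 * list_head) buckets: with 16-byte list heads (two pointers on 64-bit)
 * that is 131072 buckets, or 262144 on a 32-bit build.
 */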

/*
 * Allocate room for a suitable hash table.
 */
static int init_hash_tables(struct dm_snapshot *s)
{
	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;

	/*
	 * Calculate based on the size of the original volume or
	 * the COW volume...
	 */
	cow_dev_size = get_dev_size(s->store->cow->bdev);
	origin_dev_size = get_dev_size(s->origin->bdev);
	max_buckets = calc_max_buckets();

	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
	hash_size = min(hash_size, max_buckets);

	if (hash_size < 64)
		hash_size = 64;
	hash_size = rounddown_pow_of_two(hash_size);
	if (dm_exception_table_init(&s->complete, hash_size,
				    DM_CHUNK_CONSECUTIVE_BITS))
		return -ENOMEM;

	/*
	 * Allocate hash table for in-flight exceptions
	 * Make this smaller than the real hash table
	 */
	hash_size >>= 3;
	if (hash_size < 64)
		hash_size = 64;

	if (dm_exception_table_init(&s->pending, hash_size, 0)) {
		dm_exception_table_exit(&s->complete, exception_cache);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
 */
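/*
 * Illustrative dmsetup table line (device names are examples only):
 *
 *   0 2097152 snapshot /dev/vg/base /dev/vg/base-cow P 16
 *
 * i.e. a persistent ("P") snapshot of /dev/vg/base, storing exceptions
 * on /dev/vg/base-cow in 16-sector (8 KiB) chunks.
 */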
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_snapshot *s;
	int i;
	int r = -EINVAL;
	char *origin_path;
	struct dm_exception_store *store;
	unsigned args_used;

	if (argc != 4) {
		ti->error = "requires exactly 4 arguments";
		r = -EINVAL;
		goto bad_args;
	}

	origin_path = argv[0];
	argv++;
	argc--;

	r = dm_exception_store_create(ti, argc, argv, &args_used, &store);
	if (r) {
		ti->error = "Couldn't create exception store";
		r = -EINVAL;
		goto bad_args;
	}

	argv += args_used;
	argc -= args_used;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s) {
		ti->error = "Cannot allocate snapshot context private "
		    "structure";
		r = -ENOMEM;
		goto bad_snap;
	}

	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
	if (r) {
		ti->error = "Cannot get origin device";
		goto bad_origin;
	}

	s->store = store;
	s->valid = 1;
	s->active = 0;
	atomic_set(&s->pending_exceptions_count, 0);
	init_rwsem(&s->lock);
	spin_lock_init(&s->pe_lock);

	/* Allocate hash table for COW data */
	if (init_hash_tables(s)) {
		ti->error = "Unable to allocate hash table space";
		r = -ENOMEM;
		goto bad_hash_tables;
	}

	r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
	if (r) {
		ti->error = "Could not create kcopyd client";
		goto bad_kcopyd;
	}

	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
	if (!s->pending_pool) {
		ti->error = "Could not allocate mempool for pending exceptions";
		goto bad_pending_pool;
	}

	s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
							 tracked_chunk_cache);
	if (!s->tracked_chunk_pool) {
		ti->error = "Could not allocate tracked_chunk mempool for "
			    "tracking reads";
		goto bad_tracked_chunk_pool;
	}

	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);

	spin_lock_init(&s->tracked_chunk_lock);

	/* Metadata must only be loaded into one table at once */
	r = s->store->type->read_metadata(s->store, dm_add_exception,
					  (void *)s);
	if (r < 0) {
		ti->error = "Failed to read snapshot metadata";
		goto bad_load_and_register;
	} else if (r > 0) {
		s->valid = 0;
		DMWARN("Snapshot is marked invalid.");
	}

	bio_list_init(&s->queued_bios);
	INIT_WORK(&s->queued_bios_work, flush_queued_bios);

	if (!s->store->chunk_size) {
		ti->error = "Chunk size not set";
		goto bad_load_and_register;
	}

	/* Add snapshot to the list of snapshots for this origin */
	/* Exceptions aren't triggered till snapshot_resume() is called */
	if (register_snapshot(s)) {
		r = -EINVAL;
		ti->error = "Cannot register snapshot origin";
		goto bad_load_and_register;
	}

	ti->private = s;
	ti->split_io = s->store->chunk_size;
	ti->num_flush_requests = 1;

	return 0;

bad_load_and_register:
	mempool_destroy(s->tracked_chunk_pool);

bad_tracked_chunk_pool:
	mempool_destroy(s->pending_pool);

bad_pending_pool:
	dm_kcopyd_client_destroy(s->kcopyd_client);

bad_kcopyd:
	dm_exception_table_exit(&s->pending, pending_cache);
	dm_exception_table_exit(&s->complete, exception_cache);

bad_hash_tables:
	dm_put_device(ti, s->origin);

bad_origin:
	kfree(s);

bad_snap:
	dm_exception_store_destroy(store);

bad_args:
	return r;
}

static void __free_exceptions(struct dm_snapshot *s)
{
	dm_kcopyd_client_destroy(s->kcopyd_client);
	s->kcopyd_client = NULL;

	dm_exception_table_exit(&s->pending, pending_cache);
	dm_exception_table_exit(&s->complete, exception_cache);
}

static void snapshot_dtr(struct dm_target *ti)
{
#ifdef CONFIG_DM_DEBUG
	int i;
#endif
	struct dm_snapshot *s = ti->private;

	flush_workqueue(ksnapd);

	/* Prevent further origin writes from using this snapshot. */
	/* After this returns there can be no new kcopyd jobs. */
	unregister_snapshot(s);

	while (atomic_read(&s->pending_exceptions_count))
		msleep(1);
	/*
	 * Ensure instructions in mempool_destroy aren't reordered
	 * before atomic_read.
	 */
	smp_mb();

#ifdef CONFIG_DM_DEBUG
	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
#endif

	mempool_destroy(s->tracked_chunk_pool);

	__free_exceptions(s);

	mempool_destroy(s->pending_pool);

	dm_put_device(ti, s->origin);

	dm_exception_store_destroy(s->store);

	kfree(s);
}

/*
 * Flush a list of buffers.
 */
static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		generic_make_request(bio);
		bio = n;
	}
}

static void flush_queued_bios(struct work_struct *work)
{
	struct dm_snapshot *s =
		container_of(work, struct dm_snapshot, queued_bios_work);
	struct bio *queued_bios;
	unsigned long flags;

	spin_lock_irqsave(&s->pe_lock, flags);
	queued_bios = bio_list_get(&s->queued_bios);
	spin_unlock_irqrestore(&s->pe_lock, flags);

	flush_bios(queued_bios);
}

/*
 * Error a list of buffers.
 */
static void error_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		bio_io_error(bio);
		bio = n;
	}
}

static void __invalidate_snapshot(struct dm_snapshot *s, int err)
{
	if (!s->valid)
		return;

	if (err == -EIO)
		DMERR("Invalidating snapshot: Error reading/writing.");
	else if (err == -ENOMEM)
		DMERR("Invalidating snapshot: Unable to allocate exception.");

	if (s->store->type->drop_snapshot)
		s->store->type->drop_snapshot(s->store);

	s->valid = 0;

	dm_table_event(s->store->ti->table);
}

static void get_pending_exception(struct dm_snap_pending_exception *pe)
{
	atomic_inc(&pe->ref_count);
}

static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
{
	struct dm_snap_pending_exception *primary_pe;
	struct bio *origin_bios = NULL;

	primary_pe = pe->primary_pe;

	/*
	 * If this pe is involved in a write to the origin and
	 * it is the last sibling to complete then release
	 * the bios for the original write to the origin.
	 */
	if (primary_pe &&
	    atomic_dec_and_test(&primary_pe->ref_count)) {
		origin_bios = bio_list_get(&primary_pe->origin_bios);
		free_pending_exception(primary_pe);
	}

	/*
	 * Free the pe if it's not linked to an origin write or if
	 * it's not itself a primary pe.
	 */
	if (!primary_pe || primary_pe != pe)
		free_pending_exception(pe);

	return origin_bios;
}

static void pending_complete(struct dm_snap_pending_exception *pe, int success)
{
	struct dm_exception *e;
	struct dm_snapshot *s = pe->snap;
	struct bio *origin_bios = NULL;
	struct bio *snapshot_bios = NULL;
	int error = 0;

	if (!success) {
		/* Read/write error - snapshot is unusable */
		down_write(&s->lock);
		__invalidate_snapshot(s, -EIO);
		error = 1;
		goto out;
	}

	e = alloc_completed_exception();
	if (!e) {
		down_write(&s->lock);
		__invalidate_snapshot(s, -ENOMEM);
		error = 1;
		goto out;
	}
	*e = pe->e;

	down_write(&s->lock);
	if (!s->valid) {
		free_completed_exception(e);
		error = 1;
		goto out;
	}

	/*
	 * Check for conflicting reads. This is extremely improbable,
	 * so msleep(1) is sufficient and there is no need for a wait queue.
	 */
	while (__chunk_is_tracked(s, pe->e.old_chunk))
		msleep(1);

	/*
	 * Add a proper exception, and remove the
	 * in-flight exception from the list.
	 */
	dm_insert_exception(&s->complete, e);

 out:
	dm_remove_exception(&pe->e);
	snapshot_bios = bio_list_get(&pe->snapshot_bios);
	origin_bios = put_pending_exception(pe);

	up_write(&s->lock);

	/* Submit any pending write bios */
	if (error)
		error_bios(snapshot_bios);
	else
		flush_bios(snapshot_bios);

	flush_bios(origin_bios);
}

static void commit_callback(void *context, int success)
{
	struct dm_snap_pending_exception *pe = context;

	pending_complete(pe, success);
}

/*
 * Called when the copy I/O has finished.  kcopyd actually runs
 * this code so don't block.
 */
static void copy_callback(int read_err, unsigned long write_err, void *context)
{
	struct dm_snap_pending_exception *pe = context;
	struct dm_snapshot *s = pe->snap;

	if (read_err || write_err)
		pending_complete(pe, 0);

	else
		/* Update the metadata if we are persistent */
		s->store->type->commit_exception(s->store, &pe->e,
						 commit_callback, pe);
}

/*
 * Dispatches the copy operation to kcopyd.
 */
static void start_copy(struct dm_snap_pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;
	struct dm_io_region src, dest;
	struct block_device *bdev = s->origin->bdev;
	sector_t dev_size;

	dev_size = get_dev_size(bdev);

	src.bdev = bdev;
	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);

	dest.bdev = s->store->cow->bdev;
	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
	dest.count = src.count;

	/* Hand over to kcopyd */
	dm_kcopyd_copy(s->kcopyd_client,
		    &src, 1, &dest, 0, copy_callback, pe);
}

static struct dm_snap_pending_exception *
__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
{
	struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);

	if (!e)
		return NULL;

	return container_of(e, struct dm_snap_pending_exception, e);
}

/*
 * Looks to see if this snapshot already has a pending exception
 * for this chunk, otherwise it allocates a new one and inserts
 * it into the pending table.
 *
 * NOTE: a write lock must be held on snap->lock before calling
 * this.
 */
static struct dm_snap_pending_exception *
__find_pending_exception(struct dm_snapshot *s,
			 struct dm_snap_pending_exception *pe, chunk_t chunk)
{
	struct dm_snap_pending_exception *pe2;

	pe2 = __lookup_pending_exception(s, chunk);
	if (pe2) {
		free_pending_exception(pe);
		return pe2;
	}

	pe->e.old_chunk = chunk;
	bio_list_init(&pe->origin_bios);
	bio_list_init(&pe->snapshot_bios);
	pe->primary_pe = NULL;
	atomic_set(&pe->ref_count, 0);
	pe->started = 0;

	if (s->store->type->prepare_exception(s->store, &pe->e)) {
		free_pending_exception(pe);
		return NULL;
	}

	get_pending_exception(pe);
	dm_insert_exception(&s->pending, &pe->e);

	return pe;
}

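/*
 * Redirect a bio to the chunk's copy on the COW device, preserving the
 * offset within the chunk; e->new_chunk may describe a run of
 * consecutive chunks, hence the (chunk - e->old_chunk) adjustment.
 */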
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
			    struct bio *bio, chunk_t chunk)
{
	bio->bi_bdev = s->store->cow->bdev;
	bio->bi_sector = chunk_to_sector(s->store,
					 dm_chunk_number(e->new_chunk) +
					 (chunk - e->old_chunk)) +
					 (bio->bi_sector &
					  s->store->chunk_mask);
}

static int snapshot_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct dm_exception *e;
	struct dm_snapshot *s = ti->private;
	int r = DM_MAPIO_REMAPPED;
	chunk_t chunk;
	struct dm_snap_pending_exception *pe = NULL;

	if (unlikely(bio_empty_barrier(bio))) {
		bio->bi_bdev = s->store->cow->bdev;
		return DM_MAPIO_REMAPPED;
	}

	chunk = sector_to_chunk(s->store, bio->bi_sector);

	/* Full snapshots are not usable */
	/* To get here the table must be live so s->active is always set. */
	if (!s->valid)
		return -EIO;

	/* FIXME: should only take write lock if we need
	 * to copy an exception */
	down_write(&s->lock);

	if (!s->valid) {
		r = -EIO;
		goto out_unlock;
	}

	/* If the block is already remapped - use that, else remap it */
	e = dm_lookup_exception(&s->complete, chunk);
	if (e) {
		remap_exception(s, e, bio, chunk);
		goto out_unlock;
	}

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writeable.
	 */
	if (bio_rw(bio) == WRITE) {
		pe = __lookup_pending_exception(s, chunk);
		if (!pe) {
			up_write(&s->lock);
			pe = alloc_pending_exception(s);
			down_write(&s->lock);

			if (!s->valid) {
				free_pending_exception(pe);
				r = -EIO;
				goto out_unlock;
			}

			e = dm_lookup_exception(&s->complete, chunk);
			if (e) {
				free_pending_exception(pe);
				remap_exception(s, e, bio, chunk);
				goto out_unlock;
			}

			pe = __find_pending_exception(s, pe, chunk);
			if (!pe) {
				__invalidate_snapshot(s, -ENOMEM);
				r = -EIO;
				goto out_unlock;
			}
		}

		remap_exception(s, &pe->e, bio, chunk);
		bio_list_add(&pe->snapshot_bios, bio);

		r = DM_MAPIO_SUBMITTED;

		if (!pe->started) {
			/* this is protected by snap->lock */
			pe->started = 1;
			up_write(&s->lock);
			start_copy(pe);
			goto out;
		}
	} else {
		bio->bi_bdev = s->origin->bdev;
		map_context->ptr = track_chunk(s, chunk);
	}

 out_unlock:
	up_write(&s->lock);
 out:
	return r;
}

static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
			   int error, union map_info *map_context)
{
	struct dm_snapshot *s = ti->private;
	struct dm_snap_tracked_chunk *c = map_context->ptr;

	if (c)
		stop_tracking_chunk(s, c);

	return 0;
}

static void snapshot_resume(struct dm_target *ti)
{
	struct dm_snapshot *s = ti->private;

	down_write(&s->lock);
	s->active = 1;
	up_write(&s->lock);
}

static int snapshot_status(struct dm_target *ti, status_type_t type,
			   char *result, unsigned int maxlen)
{
	unsigned sz = 0;
	struct dm_snapshot *snap = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:

		down_write(&snap->lock);

		if (!snap->valid)
			DMEMIT("Invalid");
		else {
			if (snap->store->type->usage) {
				sector_t total_sectors, sectors_allocated,
					 metadata_sectors;
				snap->store->type->usage(snap->store,
							 &total_sectors,
							 &sectors_allocated,
							 &metadata_sectors);
				DMEMIT("%llu/%llu %llu",
				       (unsigned long long)sectors_allocated,
				       (unsigned long long)total_sectors,
				       (unsigned long long)metadata_sectors);
			}
			else
				DMEMIT("Unknown");
		}

		up_write(&snap->lock);

		break;

	case STATUSTYPE_TABLE:
		/*
		 * kdevname returns a static pointer so we need
		 * to make private copies if the output is to
		 * make sense.
		 */
		DMEMIT("%s", snap->origin->name);
		snap->store->type->status(snap->store, type, result + sz,
					  maxlen - sz);
		break;
	}

	return 0;
}

static int snapshot_iterate_devices(struct dm_target *ti,
				    iterate_devices_callout_fn fn, void *data)
{
	struct dm_snapshot *snap = ti->private;

	return fn(ti, snap->origin, 0, ti->len, data);
}


/*-----------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------*/
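/*
 * Trigger copy-out in every valid, active snapshot of this origin that
 * does not already have the chunk remapped.  The original bio is held
 * on the primary pending exception and only released once all the
 * per-snapshot copies have completed, in which case DM_MAPIO_SUBMITTED
 * is returned; if no snapshot needs a copy the write passes straight
 * through as DM_MAPIO_REMAPPED.
 */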
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
	int r = DM_MAPIO_REMAPPED, first = 0;
	struct dm_snapshot *snap;
	struct dm_exception *e;
	struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL;
	chunk_t chunk;
	LIST_HEAD(pe_queue);

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {

		down_write(&snap->lock);

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			goto next_snapshot;

		/* Nothing to do if writing beyond end of snapshot */
		if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table))
			goto next_snapshot;

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap->store, bio->bi_sector);

		/*
		 * Check exception table to see if block
		 * is already remapped in this snapshot
		 * and trigger an exception if not.
		 *
		 * ref_count is initialised to 1 so pending_complete()
		 * won't destroy the primary_pe while we're inside this loop.
		 */
		e = dm_lookup_exception(&snap->complete, chunk);
		if (e)
			goto next_snapshot;

		pe = __lookup_pending_exception(snap, chunk);
		if (!pe) {
			up_write(&snap->lock);
			pe = alloc_pending_exception(snap);
			down_write(&snap->lock);

			if (!snap->valid) {
				free_pending_exception(pe);
				goto next_snapshot;
			}

			e = dm_lookup_exception(&snap->complete, chunk);
			if (e) {
				free_pending_exception(pe);
				goto next_snapshot;
			}

			pe = __find_pending_exception(snap, pe, chunk);
			if (!pe) {
				__invalidate_snapshot(snap, -ENOMEM);
				goto next_snapshot;
			}
		}

		if (!primary_pe) {
			/*
			 * Either every pe here has the same primary_pe
			 * or none has one yet.
			 */
			if (pe->primary_pe)
				primary_pe = pe->primary_pe;
			else {
				primary_pe = pe;
				first = 1;
			}

			bio_list_add(&primary_pe->origin_bios, bio);

			r = DM_MAPIO_SUBMITTED;
		}

		if (!pe->primary_pe) {
			pe->primary_pe = primary_pe;
			get_pending_exception(primary_pe);
		}

		if (!pe->started) {
			pe->started = 1;
			list_add_tail(&pe->list, &pe_queue);
		}

 next_snapshot:
		up_write(&snap->lock);
	}

	if (!primary_pe)
		return r;

	/*
	 * If this is the first time we're processing this chunk and
	 * ref_count is now 1 it means all the pending exceptions
	 * got completed while we were in the loop above, so it falls to
	 * us here to remove the primary_pe and submit any origin_bios.
	 */

	if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
		flush_bios(bio_list_get(&primary_pe->origin_bios));
		free_pending_exception(primary_pe);
		/* If we got here, pe_queue is necessarily empty. */
		return r;
	}

	/*
	 * Now that we have a complete pe list we can start the copying.
	 */
	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
		start_copy(pe);

	return r;
}

/*
 * Called on a write from the origin driver.
 */
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
	struct origin *o;
	int r = DM_MAPIO_REMAPPED;

	down_read(&_origins_lock);
	o = __lookup_origin(origin->bdev);
	if (o)
		r = __origin_write(&o->snapshots, bio);
	up_read(&_origins_lock);

	return r;
}

/*
 * Origin: maps a linear range of a device, with hooks for snapshotting.
 */

/*
 * Construct an origin mapping: <dev_path>
 * The context for an origin is merely a 'struct dm_dev *'
 * pointing to the real device.
 */
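/*
 * Illustrative dmsetup table line (device name is an example only):
 *
 *   0 2097152 snapshot-origin /dev/vg/base
 */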
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct dm_dev *dev;

	if (argc != 1) {
		ti->error = "origin: incorrect number of arguments";
		return -EINVAL;
	}

	r = dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &dev);
	if (r) {
		ti->error = "Cannot get target device";
		return r;
	}

	ti->private = dev;
	ti->num_flush_requests = 1;

	return 0;
}

static void origin_dtr(struct dm_target *ti)
{
	struct dm_dev *dev = ti->private;
	dm_put_device(ti, dev);
}

static int origin_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct dm_dev *dev = ti->private;
	bio->bi_bdev = dev->bdev;

	if (unlikely(bio_empty_barrier(bio)))
		return DM_MAPIO_REMAPPED;

	/* Only tell snapshots if this is a write */
	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
}

/*
 * Set the target "split_io" field to the minimum of all the snapshots'
 * chunk sizes.
 */
static void origin_resume(struct dm_target *ti)
{
	struct dm_dev *dev = ti->private;

	down_read(&_origins_lock);

	ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev));

	up_read(&_origins_lock);
}

static int origin_status(struct dm_target *ti, status_type_t type, char *result,
			 unsigned int maxlen)
{
	struct dm_dev *dev = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s", dev->name);
		break;
	}

	return 0;
}

static int origin_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct dm_dev *dev = ti->private;

	return fn(ti, dev, 0, ti->len, data);
}

static struct target_type origin_target = {
	.name    = "snapshot-origin",
	.version = {1, 7, 0},
	.module  = THIS_MODULE,
	.ctr     = origin_ctr,
	.dtr     = origin_dtr,
	.map     = origin_map,
	.resume  = origin_resume,
	.status  = origin_status,
	.iterate_devices = origin_iterate_devices,
};

static struct target_type snapshot_target = {
	.name    = "snapshot",
	.version = {1, 8, 0},
	.module  = THIS_MODULE,
	.ctr     = snapshot_ctr,
	.dtr     = snapshot_dtr,
	.map     = snapshot_map,
	.end_io  = snapshot_end_io,
	.resume  = snapshot_resume,
	.status  = snapshot_status,
	.iterate_devices = snapshot_iterate_devices,
};

static int __init dm_snapshot_init(void)
{
	int r;

	r = dm_exception_store_init();
	if (r) {
		DMERR("Failed to initialize exception stores");
		return r;
	}

	r = dm_register_target(&snapshot_target);
	if (r) {
		DMERR("snapshot target register failed %d", r);
		goto bad_register_snapshot_target;
	}

	r = dm_register_target(&origin_target);
	if (r < 0) {
		DMERR("Origin target register failed %d", r);
		goto bad1;
	}

	r = init_origin_hash();
	if (r) {
		DMERR("init_origin_hash failed.");
		goto bad2;
	}

	exception_cache = KMEM_CACHE(dm_exception, 0);
	if (!exception_cache) {
		DMERR("Couldn't create exception cache.");
		r = -ENOMEM;
		goto bad3;
	}

	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
	if (!pending_cache) {
		DMERR("Couldn't create pending cache.");
		r = -ENOMEM;
		goto bad4;
	}

	tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
	if (!tracked_chunk_cache) {
		DMERR("Couldn't create cache to track chunks in use.");
		r = -ENOMEM;
		goto bad5;
	}

	ksnapd = create_singlethread_workqueue("ksnapd");
	if (!ksnapd) {
		DMERR("Failed to create ksnapd workqueue.");
		r = -ENOMEM;
		goto bad_pending_pool;
	}

	return 0;

bad_pending_pool:
	kmem_cache_destroy(tracked_chunk_cache);
bad5:
	kmem_cache_destroy(pending_cache);
bad4:
	kmem_cache_destroy(exception_cache);
bad3:
	exit_origin_hash();
bad2:
	dm_unregister_target(&origin_target);
bad1:
	dm_unregister_target(&snapshot_target);

bad_register_snapshot_target:
	dm_exception_store_exit();
	return r;
}

static void __exit dm_snapshot_exit(void)
{
	destroy_workqueue(ksnapd);

	dm_unregister_target(&snapshot_target);
	dm_unregister_target(&origin_target);

	exit_origin_hash();
	kmem_cache_destroy(pending_cache);
	kmem_cache_destroy(exception_cache);
	kmem_cache_destroy(tracked_chunk_cache);

	dm_exception_store_exit();
}

/* Module hooks */
module_init(dm_snapshot_init);
module_exit(dm_snapshot_exit);

MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");
