1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9#include "dm-bio-record.h"
10#include "dm-cache-metadata.h"
11
12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h>
14#include <linux/init.h>
15#include <linux/mempool.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18#include <linux/vmalloc.h>
19
20#define DM_MSG_PREFIX "cache"
21
22DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23	"A percentage of time allocated for copying to and/or from cache");
24
25/*----------------------------------------------------------------*/
26
27/*
28 * Glossary:
29 *
30 * oblock: index of an origin block
31 * cblock: index of a cache block
32 * promotion: movement of a block from origin to cache
33 * demotion: movement of a block from cache to origin
34 * migration: movement of a block between the origin and cache device,
35 *	      either direction
36 */
37
38/*----------------------------------------------------------------*/
39
40static size_t bitset_size_in_bytes(unsigned nr_entries)
41{
42	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43}
44
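/*
 * The dirty and discard bitsets have one bit per block, so they can get
 * large; hence vzalloc rather than kmalloc.
 */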
45static unsigned long *alloc_bitset(unsigned nr_entries)
46{
47	size_t s = bitset_size_in_bytes(nr_entries);
48	return vzalloc(s);
49}
50
51static void clear_bitset(void *bitset, unsigned nr_entries)
52{
53	size_t s = bitset_size_in_bytes(nr_entries);
54	memset(bitset, 0, s);
55}
56
57static void free_bitset(unsigned long *bits)
58{
59	vfree(bits);
60}
61
62/*----------------------------------------------------------------*/
63
64/*
65 * There are a couple of places where we let a bio run, but want to do some
66 * work before calling its endio function.  We do this by temporarily
67 * changing the endio fn.
68 */
69struct dm_hook_info {
70	bio_end_io_t *bi_end_io;
71	void *bi_private;
72};
73
74static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
75			bio_end_io_t *bi_end_io, void *bi_private)
76{
77	h->bi_end_io = bio->bi_end_io;
78	h->bi_private = bio->bi_private;
79
80	bio->bi_end_io = bi_end_io;
81	bio->bi_private = bi_private;
82}
83
84static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
85{
86	bio->bi_end_io = h->bi_end_io;
87	bio->bi_private = h->bi_private;
88
89	/*
90	 * Must bump bi_remaining to allow bio to complete with
91	 * restored bi_end_io.
92	 */
93	atomic_inc(&bio->bi_remaining);
94}
95
96/*----------------------------------------------------------------*/
97
98#define PRISON_CELLS 1024
99#define MIGRATION_POOL_SIZE 128
100#define COMMIT_PERIOD HZ
101#define MIGRATION_COUNT_WINDOW 10
102
103/*
104 * The block size of the device holding cache data must be
105 * between 32KB and 1GB.
106 */
107#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
108#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
109
110/*
111 * FIXME: the cache is read/write for the time being.
112 */
113enum cache_metadata_mode {
114	CM_WRITE,		/* metadata may be changed */
115	CM_READ_ONLY,		/* metadata may not be changed */
116};
117
118enum cache_io_mode {
119	/*
120	 * Data is written to cached blocks only.  These blocks are marked
121	 * dirty.  If you lose the cache device you will lose data.
122	 * Potential performance increase for both reads and writes.
123	 */
124	CM_IO_WRITEBACK,
125
126	/*
127	 * Data is written to both cache and origin.  Blocks are never
128	 * dirty.  Potential performance benefit for reads only.
129	 */
130	CM_IO_WRITETHROUGH,
131
132	/*
133	 * A degraded mode useful for various cache coherency situations
134	 * (eg, rolling back snapshots).  Reads and writes always go to the
135	 * origin.  If a write goes to a cached oblock, then the cache
136	 * block is invalidated.
137	 */
138	CM_IO_PASSTHROUGH
139};
140
141struct cache_features {
142	enum cache_metadata_mode mode;
143	enum cache_io_mode io_mode;
144};
145
146struct cache_stats {
147	atomic_t read_hit;
148	atomic_t read_miss;
149	atomic_t write_hit;
150	atomic_t write_miss;
151	atomic_t demotion;
152	atomic_t promotion;
153	atomic_t copies_avoided;
154	atomic_t cache_cell_clash;
155	atomic_t commit_count;
156	atomic_t discard_count;
157};
158
159/*
160 * Defines a half-open range of cblocks: begin to (end - 1) are in the
161 * range; end is the one-past-the-end value.
162 */
163struct cblock_range {
164	dm_cblock_t begin;
165	dm_cblock_t end;
166};
167
168struct invalidation_request {
169	struct list_head list;
170	struct cblock_range *cblocks;
171
172	atomic_t complete;
173	int err;
174
175	wait_queue_head_t result_wait;
176};
177
178struct cache {
179	struct dm_target *ti;
180	struct dm_target_callbacks callbacks;
181
182	struct dm_cache_metadata *cmd;
183
184	/*
185	 * Metadata is written to this device.
186	 */
187	struct dm_dev *metadata_dev;
188
189	/*
190	 * The slower of the two data devices.  Typically a spindle.
191	 */
192	struct dm_dev *origin_dev;
193
194	/*
195	 * The faster of the two data devices.  Typically an SSD.
196	 */
197	struct dm_dev *cache_dev;
198
199	/*
200	 * Size of the origin device in _complete_ blocks and native sectors.
201	 */
202	dm_oblock_t origin_blocks;
203	sector_t origin_sectors;
204
205	/*
206	 * Size of the cache device in blocks.
207	 */
208	dm_cblock_t cache_size;
209
210	/*
211	 * Fields for converting from sectors to blocks.
212	 */
213	uint32_t sectors_per_block;
214	int sectors_per_block_shift;
215
216	spinlock_t lock;
217	struct bio_list deferred_bios;
218	struct bio_list deferred_flush_bios;
219	struct bio_list deferred_writethrough_bios;
220	struct list_head quiesced_migrations;
221	struct list_head completed_migrations;
222	struct list_head need_commit_migrations;
223	sector_t migration_threshold;
224	wait_queue_head_t migration_wait;
225	atomic_t nr_migrations;
226
227	wait_queue_head_t quiescing_wait;
228	atomic_t quiescing;
229	atomic_t quiescing_ack;
230
231	/*
232	 * cache_size entries, dirty if set
233	 */
234	atomic_t nr_dirty;
235	unsigned long *dirty_bitset;
236
237	/*
238	 * origin_blocks entries, discarded if set.
239	 */
240	dm_oblock_t discard_nr_blocks;
241	unsigned long *discard_bitset;
242
243	/*
244	 * Rather than reconstructing the table line for the status we just
245	 * save it and regurgitate.
246	 */
247	unsigned nr_ctr_args;
248	const char **ctr_args;
249
250	struct dm_kcopyd_client *copier;
251	struct workqueue_struct *wq;
252	struct work_struct worker;
253
254	struct delayed_work waker;
255	unsigned long last_commit_jiffies;
256
257	struct dm_bio_prison *prison;
258	struct dm_deferred_set *all_io_ds;
259
260	mempool_t *migration_pool;
261	struct dm_cache_migration *next_migration;
262
263	struct dm_cache_policy *policy;
264	unsigned policy_nr_args;
265
266	bool need_tick_bio:1;
267	bool sized:1;
268	bool invalidate:1;
269	bool commit_requested:1;
270	bool loaded_mappings:1;
271	bool loaded_discards:1;
272
273	/*
274	 * Cache features such as write-through.
275	 */
276	struct cache_features features;
277
278	struct cache_stats stats;
279
280	/*
281	 * Invalidation fields.
282	 */
283	spinlock_t invalidation_lock;
284	struct list_head invalidation_requests;
285};
286
287struct per_bio_data {
288	bool tick:1;
289	unsigned req_nr:2;
290	struct dm_deferred_entry *all_io_entry;
291	struct dm_hook_info hook_info;
292
293	/*
294	 * writethrough fields.  These MUST remain at the end of this
295	 * structure and the 'cache' member must be the first of them, as it
296	 * is used to determine the offset of the writethrough fields.
297	 */
298	struct cache *cache;
299	dm_cblock_t cblock;
300	struct dm_bio_details bio_details;
301};
302
303struct dm_cache_migration {
304	struct list_head list;
305	struct cache *cache;
306
307	unsigned long start_jiffies;
308	dm_oblock_t old_oblock;
309	dm_oblock_t new_oblock;
310	dm_cblock_t cblock;
311
312	bool err:1;
313	bool writeback:1;
314	bool demote:1;
315	bool promote:1;
316	bool requeue_holder:1;
317	bool invalidate:1;
318
319	struct dm_bio_prison_cell *old_ocell;
320	struct dm_bio_prison_cell *new_ocell;
321};
322
323/*
324 * Processing a bio in the worker thread may require these memory
325 * allocations.  We prealloc to avoid deadlocks (the same worker thread
326 * frees them back to the mempool).
327 */
328struct prealloc {
329	struct dm_cache_migration *mg;
330	struct dm_bio_prison_cell *cell1;
331	struct dm_bio_prison_cell *cell2;
332};
333
334static void wake_worker(struct cache *cache)
335{
336	queue_work(cache->wq, &cache->worker);
337}
338
339/*----------------------------------------------------------------*/
340
341static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
342{
343	/* FIXME: change to use a local slab. */
344	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
345}
346
347static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
348{
349	dm_bio_prison_free_cell(cache->prison, cell);
350}
351
352static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
353{
354	if (!p->mg) {
355		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
356		if (!p->mg)
357			return -ENOMEM;
358	}
359
360	if (!p->cell1) {
361		p->cell1 = alloc_prison_cell(cache);
362		if (!p->cell1)
363			return -ENOMEM;
364	}
365
366	if (!p->cell2) {
367		p->cell2 = alloc_prison_cell(cache);
368		if (!p->cell2)
369			return -ENOMEM;
370	}
371
372	return 0;
373}
374
375static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
376{
377	if (p->cell2)
378		free_prison_cell(cache, p->cell2);
379
380	if (p->cell1)
381		free_prison_cell(cache, p->cell1);
382
383	if (p->mg)
384		mempool_free(p->mg, cache->migration_pool);
385}
386
387static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
388{
389	struct dm_cache_migration *mg = p->mg;
390
391	BUG_ON(!mg);
392	p->mg = NULL;
393
394	return mg;
395}
396
397/*
398 * There must be a free cell within the prealloc struct to return.  If not,
399 * this function will BUG() rather than returning NULL.
400 */
401static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
402{
403	struct dm_bio_prison_cell *r = NULL;
404
405	if (p->cell1) {
406		r = p->cell1;
407		p->cell1 = NULL;
408
409	} else if (p->cell2) {
410		r = p->cell2;
411		p->cell2 = NULL;
412	} else
413		BUG();
414
415	return r;
416}
417
418/*
419 * You can't have more than two cells in a prealloc struct.  BUG() will be
420 * called if you try to overfill.
421 */
422static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
423{
424	if (!p->cell2)
425		p->cell2 = cell;
426
427	else if (!p->cell1)
428		p->cell1 = cell;
429
430	else
431		BUG();
432}
433
434/*----------------------------------------------------------------*/
435
436static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
437{
438	key->virtual = 0;
439	key->dev = 0;
440	key->block = from_oblock(oblock);
441}
442
443/*
444 * The caller hands in a preallocated cell, and a free function for it.
445 * The cell will be freed if there's an error, or if it wasn't used because
446 * a cell with that key already exists.
447 */
448typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
449
450static int bio_detain(struct cache *cache, dm_oblock_t oblock,
451		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
452		      cell_free_fn free_fn, void *free_context,
453		      struct dm_bio_prison_cell **cell_result)
454{
455	int r;
456	struct dm_cell_key key;
457
458	build_key(oblock, &key);
459	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
460	if (r)
461		free_fn(free_context, cell_prealloc);
462
463	return r;
464}
465
466static int get_cell(struct cache *cache,
467		    dm_oblock_t oblock,
468		    struct prealloc *structs,
469		    struct dm_bio_prison_cell **cell_result)
470{
471	int r;
472	struct dm_cell_key key;
473	struct dm_bio_prison_cell *cell_prealloc;
474
475	cell_prealloc = prealloc_get_cell(structs);
476
477	build_key(oblock, &key);
478	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
479	if (r)
480		prealloc_put_cell(structs, cell_prealloc);
481
482	return r;
483}
484
485/*----------------------------------------------------------------*/
486
487static bool is_dirty(struct cache *cache, dm_cblock_t b)
488{
489	return test_bit(from_cblock(b), cache->dirty_bitset);
490}
491
492static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
493{
494	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
495		atomic_inc(&cache->nr_dirty);
496		policy_set_dirty(cache->policy, oblock);
497	}
498}
499
500static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
501{
502	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
503		policy_clear_dirty(cache->policy, oblock);
504		if (atomic_dec_return(&cache->nr_dirty) == 0)
505			dm_table_event(cache->ti->table);
506	}
507}
508
509/*----------------------------------------------------------------*/
510
511static bool block_size_is_power_of_two(struct cache *cache)
512{
513	return cache->sectors_per_block_shift >= 0;
514}
515
516/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
517#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
518__always_inline
519#endif
520static dm_block_t block_div(dm_block_t b, uint32_t n)
521{
522	do_div(b, n);
523
524	return b;
525}
526
527static void set_discard(struct cache *cache, dm_oblock_t b)
528{
529	unsigned long flags;
530
531	atomic_inc(&cache->stats.discard_count);
532
533	spin_lock_irqsave(&cache->lock, flags);
534	set_bit(from_oblock(b), cache->discard_bitset);
535	spin_unlock_irqrestore(&cache->lock, flags);
536}
537
538static void clear_discard(struct cache *cache, dm_oblock_t b)
539{
540	unsigned long flags;
541
542	spin_lock_irqsave(&cache->lock, flags);
543	clear_bit(from_oblock(b), cache->discard_bitset);
544	spin_unlock_irqrestore(&cache->lock, flags);
545}
546
547static bool is_discarded(struct cache *cache, dm_oblock_t b)
548{
549	int r;
550	unsigned long flags;
551
552	spin_lock_irqsave(&cache->lock, flags);
553	r = test_bit(from_oblock(b), cache->discard_bitset);
554	spin_unlock_irqrestore(&cache->lock, flags);
555
556	return r;
557}
558
559static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
560{
561	int r;
562	unsigned long flags;
563
564	spin_lock_irqsave(&cache->lock, flags);
565	r = test_bit(from_oblock(b), cache->discard_bitset);
566	spin_unlock_irqrestore(&cache->lock, flags);
567
568	return r;
569}
570
571/*----------------------------------------------------------------*/
572
573static void load_stats(struct cache *cache)
574{
575	struct dm_cache_statistics stats;
576
577	dm_cache_metadata_get_stats(cache->cmd, &stats);
578	atomic_set(&cache->stats.read_hit, stats.read_hits);
579	atomic_set(&cache->stats.read_miss, stats.read_misses);
580	atomic_set(&cache->stats.write_hit, stats.write_hits);
581	atomic_set(&cache->stats.write_miss, stats.write_misses);
582}
583
584static void save_stats(struct cache *cache)
585{
586	struct dm_cache_statistics stats;
587
588	stats.read_hits = atomic_read(&cache->stats.read_hit);
589	stats.read_misses = atomic_read(&cache->stats.read_miss);
590	stats.write_hits = atomic_read(&cache->stats.write_hit);
591	stats.write_misses = atomic_read(&cache->stats.write_miss);
592
593	dm_cache_metadata_set_stats(cache->cmd, &stats);
594}
595
596/*----------------------------------------------------------------
597 * Per bio data
598 *--------------------------------------------------------------*/
599
600/*
601 * If using writeback, leave out struct per_bio_data's writethrough fields.
602 */
603#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
604#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
605
606static bool writethrough_mode(struct cache_features *f)
607{
608	return f->io_mode == CM_IO_WRITETHROUGH;
609}
610
611static bool writeback_mode(struct cache_features *f)
612{
613	return f->io_mode == CM_IO_WRITEBACK;
614}
615
616static bool passthrough_mode(struct cache_features *f)
617{
618	return f->io_mode == CM_IO_PASSTHROUGH;
619}
620
621static size_t get_per_bio_data_size(struct cache *cache)
622{
623	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
624}
625
626static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
627{
628	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
629	BUG_ON(!pb);
630	return pb;
631}
632
633static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
634{
635	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
636
637	pb->tick = false;
638	pb->req_nr = dm_bio_get_target_bio_nr(bio);
639	pb->all_io_entry = NULL;
640
641	return pb;
642}
643
644/*----------------------------------------------------------------
645 * Remapping
646 *--------------------------------------------------------------*/
647static void remap_to_origin(struct cache *cache, struct bio *bio)
648{
649	bio->bi_bdev = cache->origin_dev->bdev;
650}
651
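/*
 * Remap a bio to the cache device.  The new sector is the cache block's
 * start (cblock * sectors_per_block) plus the bio's offset within the
 * block.  For example (illustrative numbers only): with 64-sector blocks,
 * a bio at offset 7 within a block mapped to cblock 3 lands at sector
 * 3 * 64 + 7 = 199 on the cache device.
 */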
652static void remap_to_cache(struct cache *cache, struct bio *bio,
653			   dm_cblock_t cblock)
654{
655	sector_t bi_sector = bio->bi_iter.bi_sector;
656	sector_t block = from_cblock(cblock);
657
658	bio->bi_bdev = cache->cache_dev->bdev;
659	if (!block_size_is_power_of_two(cache))
660		bio->bi_iter.bi_sector =
661			(block * cache->sectors_per_block) +
662			sector_div(bi_sector, cache->sectors_per_block);
663	else
664		bio->bi_iter.bi_sector =
665			(block << cache->sectors_per_block_shift) |
666			(bi_sector & (cache->sectors_per_block - 1));
667}
668
669static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
670{
671	unsigned long flags;
672	size_t pb_data_size = get_per_bio_data_size(cache);
673	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
674
675	spin_lock_irqsave(&cache->lock, flags);
676	if (cache->need_tick_bio &&
677	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
678		pb->tick = true;
679		cache->need_tick_bio = false;
680	}
681	spin_unlock_irqrestore(&cache->lock, flags);
682}
683
684static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
685				  dm_oblock_t oblock)
686{
687	check_if_tick_bio_needed(cache, bio);
688	remap_to_origin(cache, bio);
689	if (bio_data_dir(bio) == WRITE)
690		clear_discard(cache, oblock);
691}
692
693static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
694				 dm_oblock_t oblock, dm_cblock_t cblock)
695{
696	check_if_tick_bio_needed(cache, bio);
697	remap_to_cache(cache, bio, cblock);
698	if (bio_data_dir(bio) == WRITE) {
699		set_dirty(cache, oblock, cblock);
700		clear_discard(cache, oblock);
701	}
702}
703
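/*
 * Convert a bio's start sector into the origin block it falls within.
 */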
704static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
705{
706	sector_t block_nr = bio->bi_iter.bi_sector;
707
708	if (!block_size_is_power_of_two(cache))
709		(void) sector_div(block_nr, cache->sectors_per_block);
710	else
711		block_nr >>= cache->sectors_per_block_shift;
712
713	return to_oblock(block_nr);
714}
715
716static int bio_triggers_commit(struct cache *cache, struct bio *bio)
717{
718	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
719}
720
721/*
722 * You must increment the deferred set whilst the prison cell is held.  To
723 * encourage this, we ask for 'cell' to be passed in.
724 */
725static void inc_ds(struct cache *cache, struct bio *bio,
726		   struct dm_bio_prison_cell *cell)
727{
728	size_t pb_data_size = get_per_bio_data_size(cache);
729	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
730
731	BUG_ON(!cell);
732	BUG_ON(pb->all_io_entry);
733
734	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
735}
736
737static void issue(struct cache *cache, struct bio *bio)
738{
739	unsigned long flags;
740
741	if (!bio_triggers_commit(cache, bio)) {
742		generic_make_request(bio);
743		return;
744	}
745
746	/*
747	 * Batch together any bios that trigger commits and then issue a
748	 * single commit for them in do_worker().
749	 */
750	spin_lock_irqsave(&cache->lock, flags);
751	cache->commit_requested = true;
752	bio_list_add(&cache->deferred_flush_bios, bio);
753	spin_unlock_irqrestore(&cache->lock, flags);
754}
755
756static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
757{
758	inc_ds(cache, bio, cell);
759	issue(cache, bio);
760}
761
762static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
763{
764	unsigned long flags;
765
766	spin_lock_irqsave(&cache->lock, flags);
767	bio_list_add(&cache->deferred_writethrough_bios, bio);
768	spin_unlock_irqrestore(&cache->lock, flags);
769
770	wake_worker(cache);
771}
772
773static void writethrough_endio(struct bio *bio, int err)
774{
775	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
776
777	dm_unhook_bio(&pb->hook_info, bio);
778
779	if (err) {
780		bio_endio(bio, err);
781		return;
782	}
783
784	dm_bio_restore(&pb->bio_details, bio);
785	remap_to_cache(pb->cache, bio, pb->cblock);
786
787	/*
788	 * We can't issue this bio directly, since we're in interrupt
789	 * context.  So it gets put on a bio list for processing by the
790	 * worker thread.
791	 */
792	defer_writethrough_bio(pb->cache, bio);
793}
794
795/*
796 * When running in writethrough mode we need to send writes to clean blocks
797 * to both the cache and origin devices.  In future we'd like to clone the
798 * bio and submit the copies in parallel, but for now we issue them in
799 * series as this is easier.
800 */
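/*
 * The sequence for a writethrough write to a clean, cached block is:
 * the bio is remapped to the origin and issued; writethrough_endio()
 * then restores the bio, remaps it to the cache device and defers it,
 * and the worker re-issues it via process_deferred_writethrough_bios().
 */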
801static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
802				       dm_oblock_t oblock, dm_cblock_t cblock)
803{
804	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
805
806	pb->cache = cache;
807	pb->cblock = cblock;
808	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
809	dm_bio_record(&pb->bio_details, bio);
810
811	remap_to_origin_clear_discard(pb->cache, bio, oblock);
812}
813
814/*----------------------------------------------------------------
815 * Migration processing
816 *
817 * Migration covers moving data from the origin device to the cache, or
818 * vice versa.
819 *--------------------------------------------------------------*/
820static void free_migration(struct dm_cache_migration *mg)
821{
822	mempool_free(mg, mg->cache->migration_pool);
823}
824
825static void inc_nr_migrations(struct cache *cache)
826{
827	atomic_inc(&cache->nr_migrations);
828}
829
830static void dec_nr_migrations(struct cache *cache)
831{
832	atomic_dec(&cache->nr_migrations);
833
834	/*
835	 * Wake the worker in case we're suspending the target.
836	 */
837	wake_up(&cache->migration_wait);
838}
839
840static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
841			 bool holder)
842{
843	(holder ? dm_cell_release : dm_cell_release_no_holder)
844		(cache->prison, cell, &cache->deferred_bios);
845	free_prison_cell(cache, cell);
846}
847
848static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
849		       bool holder)
850{
851	unsigned long flags;
852
853	spin_lock_irqsave(&cache->lock, flags);
854	__cell_defer(cache, cell, holder);
855	spin_unlock_irqrestore(&cache->lock, flags);
856
857	wake_worker(cache);
858}
859
860static void cleanup_migration(struct dm_cache_migration *mg)
861{
862	struct cache *cache = mg->cache;
863	free_migration(mg);
864	dec_nr_migrations(cache);
865}
866
867static void migration_failure(struct dm_cache_migration *mg)
868{
869	struct cache *cache = mg->cache;
870
871	if (mg->writeback) {
872		DMWARN_LIMIT("writeback failed; couldn't copy block");
873		set_dirty(cache, mg->old_oblock, mg->cblock);
874		cell_defer(cache, mg->old_ocell, false);
875
876	} else if (mg->demote) {
877		DMWARN_LIMIT("demotion failed; couldn't copy block");
878		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
879
880		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
881		if (mg->promote)
882			cell_defer(cache, mg->new_ocell, true);
883	} else {
884		DMWARN_LIMIT("promotion failed; couldn't copy block");
885		policy_remove_mapping(cache->policy, mg->new_oblock);
886		cell_defer(cache, mg->new_ocell, true);
887	}
888
889	cleanup_migration(mg);
890}
891
892static void migration_success_pre_commit(struct dm_cache_migration *mg)
893{
894	unsigned long flags;
895	struct cache *cache = mg->cache;
896
897	if (mg->writeback) {
898		clear_dirty(cache, mg->old_oblock, mg->cblock);
899		cell_defer(cache, mg->old_ocell, false);
900		cleanup_migration(mg);
901		return;
902
903	} else if (mg->demote) {
904		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
905			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
906			policy_force_mapping(cache->policy, mg->new_oblock,
907					     mg->old_oblock);
908			if (mg->promote)
909				cell_defer(cache, mg->new_ocell, true);
910			cleanup_migration(mg);
911			return;
912		}
913	} else {
914		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
915			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
916			policy_remove_mapping(cache->policy, mg->new_oblock);
917			cleanup_migration(mg);
918			return;
919		}
920	}
921
922	spin_lock_irqsave(&cache->lock, flags);
923	list_add_tail(&mg->list, &cache->need_commit_migrations);
924	cache->commit_requested = true;
925	spin_unlock_irqrestore(&cache->lock, flags);
926}
927
928static void migration_success_post_commit(struct dm_cache_migration *mg)
929{
930	unsigned long flags;
931	struct cache *cache = mg->cache;
932
933	if (mg->writeback) {
934		DMWARN("writeback unexpectedly triggered commit");
935		return;
936
937	} else if (mg->demote) {
938		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
939
940		if (mg->promote) {
941			mg->demote = false;
942
943			spin_lock_irqsave(&cache->lock, flags);
944			list_add_tail(&mg->list, &cache->quiesced_migrations);
945			spin_unlock_irqrestore(&cache->lock, flags);
946
947		} else {
948			if (mg->invalidate)
949				policy_remove_mapping(cache->policy, mg->old_oblock);
950			cleanup_migration(mg);
951		}
952
953	} else {
954		clear_dirty(cache, mg->new_oblock, mg->cblock);
955		if (mg->requeue_holder)
956			cell_defer(cache, mg->new_ocell, true);
957		else {
958			bio_endio(mg->new_ocell->holder, 0);
959			cell_defer(cache, mg->new_ocell, false);
960		}
961		cleanup_migration(mg);
962	}
963}
964
965static void copy_complete(int read_err, unsigned long write_err, void *context)
966{
967	unsigned long flags;
968	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
969	struct cache *cache = mg->cache;
970
971	if (read_err || write_err)
972		mg->err = true;
973
974	spin_lock_irqsave(&cache->lock, flags);
975	list_add_tail(&mg->list, &cache->completed_migrations);
976	spin_unlock_irqrestore(&cache->lock, flags);
977
978	wake_worker(cache);
979}
980
981static void issue_copy_real(struct dm_cache_migration *mg)
982{
983	int r;
984	struct dm_io_region o_region, c_region;
985	struct cache *cache = mg->cache;
986	sector_t cblock = from_cblock(mg->cblock);
987
988	o_region.bdev = cache->origin_dev->bdev;
989	o_region.count = cache->sectors_per_block;
990
991	c_region.bdev = cache->cache_dev->bdev;
992	c_region.sector = cblock * cache->sectors_per_block;
993	c_region.count = cache->sectors_per_block;
994
995	if (mg->writeback || mg->demote) {
996		/* demote */
997		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
998		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
999	} else {
1000		/* promote */
1001		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1002		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1003	}
1004
1005	if (r < 0) {
1006		DMERR_LIMIT("issuing migration failed");
1007		migration_failure(mg);
1008	}
1009}
1010
1011static void overwrite_endio(struct bio *bio, int err)
1012{
1013	struct dm_cache_migration *mg = bio->bi_private;
1014	struct cache *cache = mg->cache;
1015	size_t pb_data_size = get_per_bio_data_size(cache);
1016	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1017	unsigned long flags;
1018
1019	dm_unhook_bio(&pb->hook_info, bio);
1020
1021	if (err)
1022		mg->err = true;
1023
1024	mg->requeue_holder = false;
1025
1026	spin_lock_irqsave(&cache->lock, flags);
1027	list_add_tail(&mg->list, &cache->completed_migrations);
1028	spin_unlock_irqrestore(&cache->lock, flags);
1029
1030	wake_worker(cache);
1031}
1032
1033static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1034{
1035	size_t pb_data_size = get_per_bio_data_size(mg->cache);
1036	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1037
1038	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1039	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1040
1041	/*
1042	 * No need to inc_ds() here, since the cell will be held for the
1043	 * duration of the io.
1044	 */
1045	generic_make_request(bio);
1046}
1047
1048static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1049{
1050	return (bio_data_dir(bio) == WRITE) &&
1051		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1052}
1053
1054static void avoid_copy(struct dm_cache_migration *mg)
1055{
1056	atomic_inc(&mg->cache->stats.copies_avoided);
1057	migration_success_pre_commit(mg);
1058}
1059
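/*
 * Skip the actual copy when it isn't needed: a clean or discarded block
 * needs no writeback/demotion copy, and a write that covers the whole
 * block can simply overwrite the cache block directly.
 */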
1060static void issue_copy(struct dm_cache_migration *mg)
1061{
1062	bool avoid;
1063	struct cache *cache = mg->cache;
1064
1065	if (mg->writeback || mg->demote)
1066		avoid = !is_dirty(cache, mg->cblock) ||
1067			is_discarded_oblock(cache, mg->old_oblock);
1068	else {
1069		struct bio *bio = mg->new_ocell->holder;
1070
1071		avoid = is_discarded_oblock(cache, mg->new_oblock);
1072
1073		if (!avoid && bio_writes_complete_block(cache, bio)) {
1074			issue_overwrite(mg, bio);
1075			return;
1076		}
1077	}
1078
1079	avoid ? avoid_copy(mg) : issue_copy_real(mg);
1080}
1081
1082static void complete_migration(struct dm_cache_migration *mg)
1083{
1084	if (mg->err)
1085		migration_failure(mg);
1086	else
1087		migration_success_pre_commit(mg);
1088}
1089
1090static void process_migrations(struct cache *cache, struct list_head *head,
1091			       void (*fn)(struct dm_cache_migration *))
1092{
1093	unsigned long flags;
1094	struct list_head list;
1095	struct dm_cache_migration *mg, *tmp;
1096
1097	INIT_LIST_HEAD(&list);
1098	spin_lock_irqsave(&cache->lock, flags);
1099	list_splice_init(head, &list);
1100	spin_unlock_irqrestore(&cache->lock, flags);
1101
1102	list_for_each_entry_safe(mg, tmp, &list, list)
1103		fn(mg);
1104}
1105
1106static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1107{
1108	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1109}
1110
1111static void queue_quiesced_migration(struct dm_cache_migration *mg)
1112{
1113	unsigned long flags;
1114	struct cache *cache = mg->cache;
1115
1116	spin_lock_irqsave(&cache->lock, flags);
1117	__queue_quiesced_migration(mg);
1118	spin_unlock_irqrestore(&cache->lock, flags);
1119
1120	wake_worker(cache);
1121}
1122
1123static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1124{
1125	unsigned long flags;
1126	struct dm_cache_migration *mg, *tmp;
1127
1128	spin_lock_irqsave(&cache->lock, flags);
1129	list_for_each_entry_safe(mg, tmp, work, list)
1130		__queue_quiesced_migration(mg);
1131	spin_unlock_irqrestore(&cache->lock, flags);
1132
1133	wake_worker(cache);
1134}
1135
1136static void check_for_quiesced_migrations(struct cache *cache,
1137					  struct per_bio_data *pb)
1138{
1139	struct list_head work;
1140
1141	if (!pb->all_io_entry)
1142		return;
1143
1144	INIT_LIST_HEAD(&work);
1145	dm_deferred_entry_dec(pb->all_io_entry, &work);
1146
1147	if (!list_empty(&work))
1148		queue_quiesced_migrations(cache, &work);
1149}
1150
1151static void quiesce_migration(struct dm_cache_migration *mg)
1152{
1153	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1154		queue_quiesced_migration(mg);
1155}
1156
1157static void promote(struct cache *cache, struct prealloc *structs,
1158		    dm_oblock_t oblock, dm_cblock_t cblock,
1159		    struct dm_bio_prison_cell *cell)
1160{
1161	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1162
1163	mg->err = false;
1164	mg->writeback = false;
1165	mg->demote = false;
1166	mg->promote = true;
1167	mg->requeue_holder = true;
1168	mg->invalidate = false;
1169	mg->cache = cache;
1170	mg->new_oblock = oblock;
1171	mg->cblock = cblock;
1172	mg->old_ocell = NULL;
1173	mg->new_ocell = cell;
1174	mg->start_jiffies = jiffies;
1175
1176	inc_nr_migrations(cache);
1177	quiesce_migration(mg);
1178}
1179
1180static void writeback(struct cache *cache, struct prealloc *structs,
1181		      dm_oblock_t oblock, dm_cblock_t cblock,
1182		      struct dm_bio_prison_cell *cell)
1183{
1184	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1185
1186	mg->err = false;
1187	mg->writeback = true;
1188	mg->demote = false;
1189	mg->promote = false;
1190	mg->requeue_holder = true;
1191	mg->invalidate = false;
1192	mg->cache = cache;
1193	mg->old_oblock = oblock;
1194	mg->cblock = cblock;
1195	mg->old_ocell = cell;
1196	mg->new_ocell = NULL;
1197	mg->start_jiffies = jiffies;
1198
1199	inc_nr_migrations(cache);
1200	quiesce_migration(mg);
1201}
1202
1203static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1204				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1205				dm_cblock_t cblock,
1206				struct dm_bio_prison_cell *old_ocell,
1207				struct dm_bio_prison_cell *new_ocell)
1208{
1209	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1210
1211	mg->err = false;
1212	mg->writeback = false;
1213	mg->demote = true;
1214	mg->promote = true;
1215	mg->requeue_holder = true;
1216	mg->invalidate = false;
1217	mg->cache = cache;
1218	mg->old_oblock = old_oblock;
1219	mg->new_oblock = new_oblock;
1220	mg->cblock = cblock;
1221	mg->old_ocell = old_ocell;
1222	mg->new_ocell = new_ocell;
1223	mg->start_jiffies = jiffies;
1224
1225	inc_nr_migrations(cache);
1226	quiesce_migration(mg);
1227}
1228
1229/*
1230 * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1231 * block are thrown away.
1232 */
1233static void invalidate(struct cache *cache, struct prealloc *structs,
1234		       dm_oblock_t oblock, dm_cblock_t cblock,
1235		       struct dm_bio_prison_cell *cell)
1236{
1237	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1238
1239	mg->err = false;
1240	mg->writeback = false;
1241	mg->demote = true;
1242	mg->promote = false;
1243	mg->requeue_holder = true;
1244	mg->invalidate = true;
1245	mg->cache = cache;
1246	mg->old_oblock = oblock;
1247	mg->cblock = cblock;
1248	mg->old_ocell = cell;
1249	mg->new_ocell = NULL;
1250	mg->start_jiffies = jiffies;
1251
1252	inc_nr_migrations(cache);
1253	quiesce_migration(mg);
1254}
1255
1256/*----------------------------------------------------------------
1257 * bio processing
1258 *--------------------------------------------------------------*/
1259static void defer_bio(struct cache *cache, struct bio *bio)
1260{
1261	unsigned long flags;
1262
1263	spin_lock_irqsave(&cache->lock, flags);
1264	bio_list_add(&cache->deferred_bios, bio);
1265	spin_unlock_irqrestore(&cache->lock, flags);
1266
1267	wake_worker(cache);
1268}
1269
1270static void process_flush_bio(struct cache *cache, struct bio *bio)
1271{
1272	size_t pb_data_size = get_per_bio_data_size(cache);
1273	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1274
1275	BUG_ON(bio->bi_iter.bi_size);
1276	if (!pb->req_nr)
1277		remap_to_origin(cache, bio);
1278	else
1279		remap_to_cache(cache, bio, 0);
1280
1281	/*
1282	 * REQ_FLUSH is not directed at any particular block so we don't
1283	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
1284	 * by dm-core.
1285	 */
1286	issue(cache, bio);
1287}
1288
1289/*
1290 * People generally discard large parts of a device, eg, the whole device
1291 * when formatting.  Splitting these large discards up into cache block
1292 * sized ios and then quiescing (always necessary for discard) takes too
1293 * long.
1294 *
1295 * We keep it simple, and allow any size of discard to come in, and just
1296 * mark off blocks on the discard bitset.  No passdown occurs!
1297 *
1298 * To implement passdown we need to change the bio_prison such that a cell
1299 * can have a key that spans many blocks.
1300 */
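/*
 * Note that the start block is rounded up and the end block rounded down,
 * so origin blocks only partially covered by the discard are left unmarked.
 */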
1301static void process_discard_bio(struct cache *cache, struct bio *bio)
1302{
1303	dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
1304						  cache->sectors_per_block);
1305	dm_block_t end_block = bio_end_sector(bio);
1306	dm_block_t b;
1307
1308	end_block = block_div(end_block, cache->sectors_per_block);
1309
1310	for (b = start_block; b < end_block; b++)
1311		set_discard(cache, to_oblock(b));
1312
1313	bio_endio(bio, 0);
1314}
1315
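/*
 * Returns true if starting another migration would keep the total volume
 * of in-flight copies (in sectors) below cache->migration_threshold.
 */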
1316static bool spare_migration_bandwidth(struct cache *cache)
1317{
1318	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1319		cache->sectors_per_block;
1320	return current_volume < cache->migration_threshold;
1321}
1322
1323static void inc_hit_counter(struct cache *cache, struct bio *bio)
1324{
1325	atomic_inc(bio_data_dir(bio) == READ ?
1326		   &cache->stats.read_hit : &cache->stats.write_hit);
1327}
1328
1329static void inc_miss_counter(struct cache *cache, struct bio *bio)
1330{
1331	atomic_inc(bio_data_dir(bio) == READ ?
1332		   &cache->stats.read_miss : &cache->stats.write_miss);
1333}
1334
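/*
 * The main per-bio mapping path run from the worker: look the block up in
 * the policy and act on the result - HIT remaps to the cache (or to the
 * origin in passthrough mode, invalidating on writes), MISS remaps to the
 * origin, NEW triggers a promotion and REPLACE a demotion followed by a
 * promotion.
 */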
1335static void process_bio(struct cache *cache, struct prealloc *structs,
1336			struct bio *bio)
1337{
1338	int r;
1339	bool release_cell = true;
1340	dm_oblock_t block = get_bio_block(cache, bio);
1341	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1342	struct policy_result lookup_result;
1343	bool discarded_block = is_discarded_oblock(cache, block);
1344	bool passthrough = passthrough_mode(&cache->features);
1345	bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1346
1347	/*
1348	 * Check to see if that block is currently migrating.
1349	 */
1350	cell_prealloc = prealloc_get_cell(structs);
1351	r = bio_detain(cache, block, bio, cell_prealloc,
1352		       (cell_free_fn) prealloc_put_cell,
1353		       structs, &new_ocell);
1354	if (r > 0)
1355		return;
1356
1357	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1358		       bio, &lookup_result);
1359
1360	if (r == -EWOULDBLOCK)
1361		/* migration has been denied */
1362		lookup_result.op = POLICY_MISS;
1363
1364	switch (lookup_result.op) {
1365	case POLICY_HIT:
1366		if (passthrough) {
1367			inc_miss_counter(cache, bio);
1368
1369			/*
1370			 * Passthrough always maps to the origin,
1371			 * invalidating any cache blocks that are written
1372			 * to.
1373			 */
1374
1375			if (bio_data_dir(bio) == WRITE) {
1376				atomic_inc(&cache->stats.demotion);
1377				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1378				release_cell = false;
1379
1380			} else {
1381				/* FIXME: factor out issue_origin() */
1382				remap_to_origin_clear_discard(cache, bio, block);
1383				inc_and_issue(cache, bio, new_ocell);
1384			}
1385		} else {
1386			inc_hit_counter(cache, bio);
1387
1388			if (bio_data_dir(bio) == WRITE &&
1389			    writethrough_mode(&cache->features) &&
1390			    !is_dirty(cache, lookup_result.cblock)) {
1391				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1392				inc_and_issue(cache, bio, new_ocell);
1393
1394			} else  {
1395				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1396				inc_and_issue(cache, bio, new_ocell);
1397			}
1398		}
1399
1400		break;
1401
1402	case POLICY_MISS:
1403		inc_miss_counter(cache, bio);
1404		remap_to_origin_clear_discard(cache, bio, block);
1405		inc_and_issue(cache, bio, new_ocell);
1406		break;
1407
1408	case POLICY_NEW:
1409		atomic_inc(&cache->stats.promotion);
1410		promote(cache, structs, block, lookup_result.cblock, new_ocell);
1411		release_cell = false;
1412		break;
1413
1414	case POLICY_REPLACE:
1415		cell_prealloc = prealloc_get_cell(structs);
1416		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1417			       (cell_free_fn) prealloc_put_cell,
1418			       structs, &old_ocell);
1419		if (r > 0) {
1420			/*
1421			 * We have to be careful to avoid lock inversion of
1422			 * the cells.  So we back off, and wait for the
1423			 * old_ocell to become free.
1424			 */
1425			policy_force_mapping(cache->policy, block,
1426					     lookup_result.old_oblock);
1427			atomic_inc(&cache->stats.cache_cell_clash);
1428			break;
1429		}
1430		atomic_inc(&cache->stats.demotion);
1431		atomic_inc(&cache->stats.promotion);
1432
1433		demote_then_promote(cache, structs, lookup_result.old_oblock,
1434				    block, lookup_result.cblock,
1435				    old_ocell, new_ocell);
1436		release_cell = false;
1437		break;
1438
1439	default:
1440		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1441			    (unsigned) lookup_result.op);
1442		bio_io_error(bio);
1443	}
1444
1445	if (release_cell)
1446		cell_defer(cache, new_ocell, false);
1447}
1448
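/*
 * Commit at least once every COMMIT_PERIOD.  The first comparison handles
 * jiffies wrap-around.
 */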
1449static int need_commit_due_to_time(struct cache *cache)
1450{
1451	return jiffies < cache->last_commit_jiffies ||
1452	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1453}
1454
1455static int commit_if_needed(struct cache *cache)
1456{
1457	int r = 0;
1458
1459	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1460	    dm_cache_changed_this_transaction(cache->cmd)) {
1461		atomic_inc(&cache->stats.commit_count);
1462		cache->commit_requested = false;
1463		r = dm_cache_commit(cache->cmd, false);
1464		cache->last_commit_jiffies = jiffies;
1465	}
1466
1467	return r;
1468}
1469
1470static void process_deferred_bios(struct cache *cache)
1471{
1472	unsigned long flags;
1473	struct bio_list bios;
1474	struct bio *bio;
1475	struct prealloc structs;
1476
1477	memset(&structs, 0, sizeof(structs));
1478	bio_list_init(&bios);
1479
1480	spin_lock_irqsave(&cache->lock, flags);
1481	bio_list_merge(&bios, &cache->deferred_bios);
1482	bio_list_init(&cache->deferred_bios);
1483	spin_unlock_irqrestore(&cache->lock, flags);
1484
1485	while (!bio_list_empty(&bios)) {
1486		/*
1487		 * If we've got no free migration structs, and processing
1488		 * this bio might require one, we pause until there are some
1489		 * prepared mappings to process.
1490		 */
1491		if (prealloc_data_structs(cache, &structs)) {
1492			spin_lock_irqsave(&cache->lock, flags);
1493			bio_list_merge(&cache->deferred_bios, &bios);
1494			spin_unlock_irqrestore(&cache->lock, flags);
1495			break;
1496		}
1497
1498		bio = bio_list_pop(&bios);
1499
1500		if (bio->bi_rw & REQ_FLUSH)
1501			process_flush_bio(cache, bio);
1502		else if (bio->bi_rw & REQ_DISCARD)
1503			process_discard_bio(cache, bio);
1504		else
1505			process_bio(cache, &structs, bio);
1506	}
1507
1508	prealloc_free_structs(cache, &structs);
1509}
1510
1511static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1512{
1513	unsigned long flags;
1514	struct bio_list bios;
1515	struct bio *bio;
1516
1517	bio_list_init(&bios);
1518
1519	spin_lock_irqsave(&cache->lock, flags);
1520	bio_list_merge(&bios, &cache->deferred_flush_bios);
1521	bio_list_init(&cache->deferred_flush_bios);
1522	spin_unlock_irqrestore(&cache->lock, flags);
1523
1524	/*
1525	 * These bios have already been through inc_ds()
1526	 */
1527	while ((bio = bio_list_pop(&bios)))
1528		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1529}
1530
1531static void process_deferred_writethrough_bios(struct cache *cache)
1532{
1533	unsigned long flags;
1534	struct bio_list bios;
1535	struct bio *bio;
1536
1537	bio_list_init(&bios);
1538
1539	spin_lock_irqsave(&cache->lock, flags);
1540	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1541	bio_list_init(&cache->deferred_writethrough_bios);
1542	spin_unlock_irqrestore(&cache->lock, flags);
1543
1544	/*
1545	 * These bios have already been through inc_ds()
1546	 */
1547	while ((bio = bio_list_pop(&bios)))
1548		generic_make_request(bio);
1549}
1550
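/*
 * Use any spare migration bandwidth to write dirty cache blocks back to
 * the origin, as directed by the policy.
 */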
1551static void writeback_some_dirty_blocks(struct cache *cache)
1552{
1553	int r = 0;
1554	dm_oblock_t oblock;
1555	dm_cblock_t cblock;
1556	struct prealloc structs;
1557	struct dm_bio_prison_cell *old_ocell;
1558
1559	memset(&structs, 0, sizeof(structs));
1560
1561	while (spare_migration_bandwidth(cache)) {
1562		if (prealloc_data_structs(cache, &structs))
1563			break;
1564
1565		r = policy_writeback_work(cache->policy, &oblock, &cblock);
1566		if (r)
1567			break;
1568
1569		r = get_cell(cache, oblock, &structs, &old_ocell);
1570		if (r) {
1571			policy_set_dirty(cache->policy, oblock);
1572			break;
1573		}
1574
1575		writeback(cache, &structs, oblock, cblock, old_ocell);
1576	}
1577
1578	prealloc_free_structs(cache, &structs);
1579}
1580
1581/*----------------------------------------------------------------
1582 * Invalidations.
1583 * Dropping something from the cache *without* writing back.
1584 *--------------------------------------------------------------*/
1585
1586static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1587{
1588	int r = 0;
1589	uint64_t begin = from_cblock(req->cblocks->begin);
1590	uint64_t end = from_cblock(req->cblocks->end);
1591
1592	while (begin != end) {
1593		r = policy_remove_cblock(cache->policy, to_cblock(begin));
1594		if (!r) {
1595			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1596			if (r)
1597				break;
1598
1599		} else if (r == -ENODATA) {
1600			/* harmless, already unmapped */
1601			r = 0;
1602
1603		} else {
1604			DMERR("policy_remove_cblock failed");
1605			break;
1606		}
1607
1608		begin++;
1609	}
1610
1611	cache->commit_requested = true;
1612
1613	req->err = r;
1614	atomic_set(&req->complete, 1);
1615
1616	wake_up(&req->result_wait);
1617}
1618
1619static void process_invalidation_requests(struct cache *cache)
1620{
1621	struct list_head list;
1622	struct invalidation_request *req, *tmp;
1623
1624	INIT_LIST_HEAD(&list);
1625	spin_lock(&cache->invalidation_lock);
1626	list_splice_init(&cache->invalidation_requests, &list);
1627	spin_unlock(&cache->invalidation_lock);
1628
1629	list_for_each_entry_safe (req, tmp, &list, list)
1630		process_invalidation_request(cache, req);
1631}
1632
1633/*----------------------------------------------------------------
1634 * Main worker loop
1635 *--------------------------------------------------------------*/
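/*
 * Quiescing: whoever wants to quiesce bumps the 'quiescing' count; the
 * worker acknowledges via ack_quiescing() once it has stopped taking on
 * new bios, and start_quiescing() waits for that acknowledgement.
 */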
1636static bool is_quiescing(struct cache *cache)
1637{
1638	return atomic_read(&cache->quiescing);
1639}
1640
1641static void ack_quiescing(struct cache *cache)
1642{
1643	if (is_quiescing(cache)) {
1644		atomic_inc(&cache->quiescing_ack);
1645		wake_up(&cache->quiescing_wait);
1646	}
1647}
1648
1649static void wait_for_quiescing_ack(struct cache *cache)
1650{
1651	wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1652}
1653
1654static void start_quiescing(struct cache *cache)
1655{
1656	atomic_inc(&cache->quiescing);
1657	wait_for_quiescing_ack(cache);
1658}
1659
1660static void stop_quiescing(struct cache *cache)
1661{
1662	atomic_set(&cache->quiescing, 0);
1663	atomic_set(&cache->quiescing_ack, 0);
1664}
1665
1666static void wait_for_migrations(struct cache *cache)
1667{
1668	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1669}
1670
1671static void stop_worker(struct cache *cache)
1672{
1673	cancel_delayed_work(&cache->waker);
1674	flush_workqueue(cache->wq);
1675}
1676
1677static void requeue_deferred_io(struct cache *cache)
1678{
1679	struct bio *bio;
1680	struct bio_list bios;
1681
1682	bio_list_init(&bios);
1683	bio_list_merge(&bios, &cache->deferred_bios);
1684	bio_list_init(&cache->deferred_bios);
1685
1686	while ((bio = bio_list_pop(&bios)))
1687		bio_endio(bio, DM_ENDIO_REQUEUE);
1688}
1689
1690static int more_work(struct cache *cache)
1691{
1692	if (is_quiescing(cache))
1693		return !list_empty(&cache->quiesced_migrations) ||
1694			!list_empty(&cache->completed_migrations) ||
1695			!list_empty(&cache->need_commit_migrations);
1696	else
1697		return !bio_list_empty(&cache->deferred_bios) ||
1698			!bio_list_empty(&cache->deferred_flush_bios) ||
1699			!bio_list_empty(&cache->deferred_writethrough_bios) ||
1700			!list_empty(&cache->quiesced_migrations) ||
1701			!list_empty(&cache->completed_migrations) ||
1702			!list_empty(&cache->need_commit_migrations) ||
1703			cache->invalidate;
1704}
1705
1706static void do_worker(struct work_struct *ws)
1707{
1708	struct cache *cache = container_of(ws, struct cache, worker);
1709
1710	do {
1711		if (!is_quiescing(cache)) {
1712			writeback_some_dirty_blocks(cache);
1713			process_deferred_writethrough_bios(cache);
1714			process_deferred_bios(cache);
1715			process_invalidation_requests(cache);
1716		}
1717
1718		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1719		process_migrations(cache, &cache->completed_migrations, complete_migration);
1720
1721		if (commit_if_needed(cache)) {
1722			process_deferred_flush_bios(cache, false);
1723			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
1724
1725			/*
1726			 * FIXME: rollback metadata or just go into a
1727			 * failure mode and error everything
1728			 */
1729		} else {
1730			process_deferred_flush_bios(cache, true);
1731			process_migrations(cache, &cache->need_commit_migrations,
1732					   migration_success_post_commit);
1733		}
1734
1735		ack_quiescing(cache);
1736
1737	} while (more_work(cache));
1738}
1739
1740/*
1741 * We want to commit periodically so that not too much
1742 * unwritten metadata builds up.
1743 */
1744static void do_waker(struct work_struct *ws)
1745{
1746	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1747	policy_tick(cache->policy);
1748	wake_worker(cache);
1749	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1750}
1751
1752/*----------------------------------------------------------------*/
1753
1754static int is_congested(struct dm_dev *dev, int bdi_bits)
1755{
1756	struct request_queue *q = bdev_get_queue(dev->bdev);
1757	return bdi_congested(&q->backing_dev_info, bdi_bits);
1758}
1759
1760static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1761{
1762	struct cache *cache = container_of(cb, struct cache, callbacks);
1763
1764	return is_congested(cache->origin_dev, bdi_bits) ||
1765		is_congested(cache->cache_dev, bdi_bits);
1766}
1767
1768/*----------------------------------------------------------------
1769 * Target methods
1770 *--------------------------------------------------------------*/
1771
1772/*
1773 * This function gets called on the error paths of the constructor, so we
1774 * have to cope with a partially initialised struct.
1775 */
1776static void destroy(struct cache *cache)
1777{
1778	unsigned i;
1779
1780	if (cache->next_migration)
1781		mempool_free(cache->next_migration, cache->migration_pool);
1782
1783	if (cache->migration_pool)
1784		mempool_destroy(cache->migration_pool);
1785
1786	if (cache->all_io_ds)
1787		dm_deferred_set_destroy(cache->all_io_ds);
1788
1789	if (cache->prison)
1790		dm_bio_prison_destroy(cache->prison);
1791
1792	if (cache->wq)
1793		destroy_workqueue(cache->wq);
1794
1795	if (cache->dirty_bitset)
1796		free_bitset(cache->dirty_bitset);
1797
1798	if (cache->discard_bitset)
1799		free_bitset(cache->discard_bitset);
1800
1801	if (cache->copier)
1802		dm_kcopyd_client_destroy(cache->copier);
1803
1804	if (cache->cmd)
1805		dm_cache_metadata_close(cache->cmd);
1806
1807	if (cache->metadata_dev)
1808		dm_put_device(cache->ti, cache->metadata_dev);
1809
1810	if (cache->origin_dev)
1811		dm_put_device(cache->ti, cache->origin_dev);
1812
1813	if (cache->cache_dev)
1814		dm_put_device(cache->ti, cache->cache_dev);
1815
1816	if (cache->policy)
1817		dm_cache_policy_destroy(cache->policy);
1818
1819	for (i = 0; i < cache->nr_ctr_args ; i++)
1820		kfree(cache->ctr_args[i]);
1821	kfree(cache->ctr_args);
1822
1823	kfree(cache);
1824}
1825
1826static void cache_dtr(struct dm_target *ti)
1827{
1828	struct cache *cache = ti->private;
1829
1830	destroy(cache);
1831}
1832
1833static sector_t get_dev_size(struct dm_dev *dev)
1834{
1835	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1836}
1837
1838/*----------------------------------------------------------------*/
1839
1840/*
1841 * Construct a cache device mapping.
1842 *
1843 * cache <metadata dev> <cache dev> <origin dev> <block size>
1844 *       <#feature args> [<feature arg>]*
1845 *       <policy> <#policy args> [<policy arg>]*
1846 *
1847 * metadata dev    : fast device holding the persistent metadata
1848 * cache dev	   : fast device holding cached data blocks
1849 * origin dev	   : slow device holding original data blocks
1850 * block size	   : cache unit size in sectors
1851 *
1852 * #feature args   : number of feature arguments passed
1853 * feature args    : writethrough or passthrough.  (The default is writeback.)
1854 *
1855 * policy	   : the replacement policy to use
1856 * #policy args    : an even number of policy arguments corresponding
1857 *		     to key/value pairs passed to the policy
1858 * policy args	   : key/value pairs passed to the policy
1859 *		     E.g. 'sequential_threshold 1024'
1860 *		     See cache-policies.txt for details.
1861 *
1862 * Optional feature arguments are:
1863 *   writethrough  : write through caching that prohibits cache block
1864 *		     content from being different from origin block content.
1865 *		     Without this argument, the default behaviour is to write
1866 *		     back cache block contents later for performance reasons,
1867 *		     so they may differ from the corresponding origin blocks.
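 *   passthrough   : a degraded mode in which all reads and writes go to
 *		     the origin; a write to a cached block invalidates that
 *		     cache block.  Intended for when the cache contents may
 *		     not be coherent with the origin (eg, after rolling back
 *		     a snapshot).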
1868 */
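/*
 * An illustrative table line (device names and sizes are examples only):
 *
 *   0 409600 cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 1 writeback default 0
 */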
1869struct cache_args {
1870	struct dm_target *ti;
1871
1872	struct dm_dev *metadata_dev;
1873
1874	struct dm_dev *cache_dev;
1875	sector_t cache_sectors;
1876
1877	struct dm_dev *origin_dev;
1878	sector_t origin_sectors;
1879
1880	uint32_t block_size;
1881
1882	const char *policy_name;
1883	int policy_argc;
1884	const char **policy_argv;
1885
1886	struct cache_features features;
1887};
1888
1889static void destroy_cache_args(struct cache_args *ca)
1890{
1891	if (ca->metadata_dev)
1892		dm_put_device(ca->ti, ca->metadata_dev);
1893
1894	if (ca->cache_dev)
1895		dm_put_device(ca->ti, ca->cache_dev);
1896
1897	if (ca->origin_dev)
1898		dm_put_device(ca->ti, ca->origin_dev);
1899
1900	kfree(ca);
1901}
1902
1903static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1904{
1905	if (!as->argc) {
1906		*error = "Insufficient args";
1907		return false;
1908	}
1909
1910	return true;
1911}
1912
1913static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1914			      char **error)
1915{
1916	int r;
1917	sector_t metadata_dev_size;
1918	char b[BDEVNAME_SIZE];
1919
1920	if (!at_least_one_arg(as, error))
1921		return -EINVAL;
1922
1923	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1924			  &ca->metadata_dev);
1925	if (r) {
1926		*error = "Error opening metadata device";
1927		return r;
1928	}
1929
1930	metadata_dev_size = get_dev_size(ca->metadata_dev);
1931	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1932		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1933		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1934
1935	return 0;
1936}
1937
1938static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1939			   char **error)
1940{
1941	int r;
1942
1943	if (!at_least_one_arg(as, error))
1944		return -EINVAL;
1945
1946	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1947			  &ca->cache_dev);
1948	if (r) {
1949		*error = "Error opening cache device";
1950		return r;
1951	}
1952	ca->cache_sectors = get_dev_size(ca->cache_dev);
1953
1954	return 0;
1955}
1956
1957static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1958			    char **error)
1959{
1960	int r;
1961
1962	if (!at_least_one_arg(as, error))
1963		return -EINVAL;
1964
1965	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1966			  &ca->origin_dev);
1967	if (r) {
1968		*error = "Error opening origin device";
1969		return r;
1970	}
1971
1972	ca->origin_sectors = get_dev_size(ca->origin_dev);
1973	if (ca->ti->len > ca->origin_sectors) {
1974		*error = "Device size larger than cached device";
1975		return -EINVAL;
1976	}
1977
1978	return 0;
1979}
1980
1981static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1982			    char **error)
1983{
1984	unsigned long block_size;
1985
1986	if (!at_least_one_arg(as, error))
1987		return -EINVAL;
1988
1989	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
1990	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1991	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1992	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1993		*error = "Invalid data block size";
1994		return -EINVAL;
1995	}
1996
1997	if (block_size > ca->cache_sectors) {
1998		*error = "Data block size is larger than the cache device";
1999		return -EINVAL;
2000	}
2001
2002	ca->block_size = block_size;
2003
2004	return 0;
2005}
2006
2007static void init_features(struct cache_features *cf)
2008{
2009	cf->mode = CM_WRITE;
2010	cf->io_mode = CM_IO_WRITEBACK;
2011}
2012
2013static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2014			  char **error)
2015{
2016	static struct dm_arg _args[] = {
2017		{0, 1, "Invalid number of cache feature arguments"},
2018	};
2019
2020	int r;
2021	unsigned argc;
2022	const char *arg;
2023	struct cache_features *cf = &ca->features;
2024
2025	init_features(cf);
2026
2027	r = dm_read_arg_group(_args, as, &argc, error);
2028	if (r)
2029		return -EINVAL;
2030
2031	while (argc--) {
2032		arg = dm_shift_arg(as);
2033
2034		if (!strcasecmp(arg, "writeback"))
2035			cf->io_mode = CM_IO_WRITEBACK;
2036
2037		else if (!strcasecmp(arg, "writethrough"))
2038			cf->io_mode = CM_IO_WRITETHROUGH;
2039
2040		else if (!strcasecmp(arg, "passthrough"))
2041			cf->io_mode = CM_IO_PASSTHROUGH;
2042
2043		else {
2044			*error = "Unrecognised cache feature requested";
2045			return -EINVAL;
2046		}
2047	}
2048
2049	return 0;
2050}
2051
2052static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2053			char **error)
2054{
2055	static struct dm_arg _args[] = {
2056		{0, 1024, "Invalid number of policy arguments"},
2057	};
2058
2059	int r;
2060
2061	if (!at_least_one_arg(as, error))
2062		return -EINVAL;
2063
2064	ca->policy_name = dm_shift_arg(as);
2065
2066	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2067	if (r)
2068		return -EINVAL;
2069
2070	ca->policy_argv = (const char **)as->argv;
2071	dm_consume_args(as, ca->policy_argc);
2072
2073	return 0;
2074}
2075
2076static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2077			    char **error)
2078{
2079	int r;
2080	struct dm_arg_set as;
2081
2082	as.argc = argc;
2083	as.argv = argv;
2084
2085	r = parse_metadata_dev(ca, &as, error);
2086	if (r)
2087		return r;
2088
2089	r = parse_cache_dev(ca, &as, error);
2090	if (r)
2091		return r;
2092
2093	r = parse_origin_dev(ca, &as, error);
2094	if (r)
2095		return r;
2096
2097	r = parse_block_size(ca, &as, error);
2098	if (r)
2099		return r;
2100
2101	r = parse_features(ca, &as, error);
2102	if (r)
2103		return r;
2104
2105	r = parse_policy(ca, &as, error);
2106	if (r)
2107		return r;
2108
2109	return 0;
2110}
2111
2112/*----------------------------------------------------------------*/
2113
2114static struct kmem_cache *migration_cache;
2115
2116#define NOT_CORE_OPTION 1
2117
2118static int process_config_option(struct cache *cache, const char *key, const char *value)
2119{
2120	unsigned long tmp;
2121
2122	if (!strcasecmp(key, "migration_threshold")) {
2123		if (kstrtoul(value, 10, &tmp))
2124			return -EINVAL;
2125
2126		cache->migration_threshold = tmp;
2127		return 0;
2128	}
2129
2130	return NOT_CORE_OPTION;
2131}
2132
2133static int set_config_value(struct cache *cache, const char *key, const char *value)
2134{
2135	int r = process_config_option(cache, key, value);
2136
2137	if (r == NOT_CORE_OPTION)
2138		r = policy_set_config_value(cache->policy, key, value);
2139
2140	if (r)
2141		DMWARN("bad config value for %s: %s", key, value);
2142
2143	return r;
2144}
2145
2146static int set_config_values(struct cache *cache, int argc, const char **argv)
2147{
2148	int r = 0;
2149
2150	if (argc & 1) {
2151		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2152		return -EINVAL;
2153	}
2154
2155	while (argc) {
2156		r = set_config_value(cache, argv[0], argv[1]);
2157		if (r)
2158			break;
2159
2160		argc -= 2;
2161		argv += 2;
2162	}
2163
2164	return r;
2165}
2166
2167static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2168			       char **error)
2169{
2170	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2171							   cache->cache_size,
2172							   cache->origin_sectors,
2173							   cache->sectors_per_block);
2174	if (IS_ERR(p)) {
2175		*error = "Error creating cache's policy";
2176		return PTR_ERR(p);
2177	}
2178	cache->policy = p;
2179
2180	return 0;
2181}
2182
2183#define DEFAULT_MIGRATION_THRESHOLD 2048
2184
2185static int cache_create(struct cache_args *ca, struct cache **result)
2186{
2187	int r = 0;
2188	char **error = &ca->ti->error;
2189	struct cache *cache;
2190	struct dm_target *ti = ca->ti;
2191	dm_block_t origin_blocks;
2192	struct dm_cache_metadata *cmd;
2193	bool may_format = ca->features.mode == CM_WRITE;
2194
2195	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2196	if (!cache)
2197		return -ENOMEM;
2198
2199	cache->ti = ca->ti;
2200	ti->private = cache;
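	/*
	 * Two flush bios are needed so that one copy can be sent to the
	 * origin device and the other to the cache device.
	 */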
2201	ti->num_flush_bios = 2;
2202	ti->flush_supported = true;
2203
2204	ti->num_discard_bios = 1;
2205	ti->discards_supported = true;
2206	ti->discard_zeroes_data_unsupported = true;
2207	/* Discard bios must be split on a block boundary */
2208	ti->split_discard_bios = true;
2209
2210	cache->features = ca->features;
2211	ti->per_bio_data_size = get_per_bio_data_size(cache);
2212
2213	cache->callbacks.congested_fn = cache_is_congested;
2214	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2215
2216	cache->metadata_dev = ca->metadata_dev;
2217	cache->origin_dev = ca->origin_dev;
2218	cache->cache_dev = ca->cache_dev;
2219
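	/*
	 * The cache now owns these device references; clear them in ca so
	 * destroy_cache_args() doesn't drop them.
	 */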
2220	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2221
2222	/* FIXME: factor out this whole section */
2223	origin_blocks = cache->origin_sectors = ca->origin_sectors;
2224	origin_blocks = block_div(origin_blocks, ca->block_size);
2225	cache->origin_blocks = to_oblock(origin_blocks);
2226
2227	cache->sectors_per_block = ca->block_size;
2228	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2229		r = -EINVAL;
2230		goto bad;
2231	}
2232
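	/*
	 * A negative sectors_per_block_shift indicates the block size is
	 * not a power of two, so block arithmetic has to fall back to
	 * (slower) division instead of shifts.
	 */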
2233	if (ca->block_size & (ca->block_size - 1)) {
2234		dm_block_t cache_size = ca->cache_sectors;
2235
2236		cache->sectors_per_block_shift = -1;
2237		cache_size = block_div(cache_size, ca->block_size);
2238		cache->cache_size = to_cblock(cache_size);
2239	} else {
2240		cache->sectors_per_block_shift = __ffs(ca->block_size);
2241		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
2242	}
2243
2244	r = create_cache_policy(cache, ca, error);
2245	if (r)
2246		goto bad;
2247
2248	cache->policy_nr_args = ca->policy_argc;
2249	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2250
2251	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2252	if (r) {
2253		*error = "Error setting cache policy's config values";
2254		goto bad;
2255	}
2256
2257	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2258				     ca->block_size, may_format,
2259				     dm_cache_policy_get_hint_size(cache->policy));
2260	if (IS_ERR(cmd)) {
2261		*error = "Error creating metadata object";
2262		r = PTR_ERR(cmd);
2263		goto bad;
2264	}
2265	cache->cmd = cmd;
2266
2267	if (passthrough_mode(&cache->features)) {
2268		bool all_clean;
2269
2270		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2271		if (r) {
2272			*error = "dm_cache_metadata_all_clean() failed";
2273			goto bad;
2274		}
2275
2276		if (!all_clean) {
2277			*error = "Cannot enter passthrough mode unless all blocks are clean";
2278			r = -EINVAL;
2279			goto bad;
2280		}
2281	}
2282
2283	spin_lock_init(&cache->lock);
2284	bio_list_init(&cache->deferred_bios);
2285	bio_list_init(&cache->deferred_flush_bios);
2286	bio_list_init(&cache->deferred_writethrough_bios);
2287	INIT_LIST_HEAD(&cache->quiesced_migrations);
2288	INIT_LIST_HEAD(&cache->completed_migrations);
2289	INIT_LIST_HEAD(&cache->need_commit_migrations);
2290	atomic_set(&cache->nr_migrations, 0);
2291	init_waitqueue_head(&cache->migration_wait);
2292
2293	init_waitqueue_head(&cache->quiescing_wait);
2294	atomic_set(&cache->quiescing, 0);
2295	atomic_set(&cache->quiescing_ack, 0);
2296
2297	r = -ENOMEM;
2298	atomic_set(&cache->nr_dirty, 0);
2299	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2300	if (!cache->dirty_bitset) {
2301		*error = "could not allocate dirty bitset";
2302		goto bad;
2303	}
2304	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2305
2306	cache->discard_nr_blocks = cache->origin_blocks;
2307	cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
2308	if (!cache->discard_bitset) {
2309		*error = "could not allocate discard bitset";
2310		goto bad;
2311	}
2312	clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
2313
2314	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2315	if (IS_ERR(cache->copier)) {
2316		*error = "could not create kcopyd client";
2317		r = PTR_ERR(cache->copier);
2318		goto bad;
2319	}
2320
2321	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2322	if (!cache->wq) {
2323		*error = "could not create workqueue for metadata object";
2324		goto bad;
2325	}
2326	INIT_WORK(&cache->worker, do_worker);
2327	INIT_DELAYED_WORK(&cache->waker, do_waker);
2328	cache->last_commit_jiffies = jiffies;
2329
2330	cache->prison = dm_bio_prison_create(PRISON_CELLS);
2331	if (!cache->prison) {
2332		*error = "could not create bio prison";
2333		goto bad;
2334	}
2335
2336	cache->all_io_ds = dm_deferred_set_create();
2337	if (!cache->all_io_ds) {
2338		*error = "could not create all_io deferred set";
2339		goto bad;
2340	}
2341
2342	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2343							 migration_cache);
2344	if (!cache->migration_pool) {
2345		*error = "Error creating cache's migration mempool";
2346		goto bad;
2347	}
2348
2349	cache->next_migration = NULL;
2350
2351	cache->need_tick_bio = true;
2352	cache->sized = false;
2353	cache->invalidate = false;
2354	cache->commit_requested = false;
2355	cache->loaded_mappings = false;
2356	cache->loaded_discards = false;
2357
2358	load_stats(cache);
2359
2360	atomic_set(&cache->stats.demotion, 0);
2361	atomic_set(&cache->stats.promotion, 0);
2362	atomic_set(&cache->stats.copies_avoided, 0);
2363	atomic_set(&cache->stats.cache_cell_clash, 0);
2364	atomic_set(&cache->stats.commit_count, 0);
2365	atomic_set(&cache->stats.discard_count, 0);
2366
2367	spin_lock_init(&cache->invalidation_lock);
2368	INIT_LIST_HEAD(&cache->invalidation_requests);
2369
2370	*result = cache;
2371	return 0;
2372
2373bad:
2374	destroy(cache);
2375	return r;
2376}
2377
2378static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2379{
2380	unsigned i;
2381	const char **copy;
2382
2383	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2384	if (!copy)
2385		return -ENOMEM;
2386	for (i = 0; i < argc; i++) {
2387		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2388		if (!copy[i]) {
2389			while (i--)
2390				kfree(copy[i]);
2391			kfree(copy);
2392			return -ENOMEM;
2393		}
2394	}
2395
2396	cache->nr_ctr_args = argc;
2397	cache->ctr_args = copy;
2398
2399	return 0;
2400}
2401
2402static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2403{
2404	int r = -EINVAL;
2405	struct cache_args *ca;
2406	struct cache *cache = NULL;
2407
2408	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2409	if (!ca) {
2410		ti->error = "Error allocating memory for cache";
2411		return -ENOMEM;
2412	}
2413	ca->ti = ti;
2414
2415	r = parse_cache_args(ca, argc, argv, &ti->error);
2416	if (r)
2417		goto out;
2418
2419	r = cache_create(ca, &cache);
2420	if (r)
2421		goto out;
2422
2423	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2424	if (r) {
2425		destroy(cache);
2426		goto out;
2427	}
2428
2429	ti->private = cache;
2430
2431out:
2432	destroy_cache_args(ca);
2433	return r;
2434}
2435
2436static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
2437{
2438	int r;
2439	dm_oblock_t block = get_bio_block(cache, bio);
2440	size_t pb_data_size = get_per_bio_data_size(cache);
2441	bool can_migrate = false;
2442	bool discarded_block;
2443	struct policy_result lookup_result;
2444	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2445
2446	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2447		/*
2448		 * This can only occur if the io goes to a partial block at
2449		 * the end of the origin device.  We don't cache these.
2450		 * Just remap to the origin and carry on.
2451		 */
2452		remap_to_origin(cache, bio);
2453		return DM_MAPIO_REMAPPED;
2454	}
2455
2456	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2457		defer_bio(cache, bio);
2458		return DM_MAPIO_SUBMITTED;
2459	}
2460
2461	/*
2462	 * Check to see if that block is currently migrating.
2463	 */
2464	*cell = alloc_prison_cell(cache);
2465	if (!*cell) {
2466		defer_bio(cache, bio);
2467		return DM_MAPIO_SUBMITTED;
2468	}
2469
2470	r = bio_detain(cache, block, bio, *cell,
2471		       (cell_free_fn) free_prison_cell,
2472		       cache, cell);
2473	if (r) {
2474		if (r < 0)
2475			defer_bio(cache, bio);
2476
2477		return DM_MAPIO_SUBMITTED;
2478	}
2479
2480	discarded_block = is_discarded_oblock(cache, block);
2481
2482	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2483		       bio, &lookup_result);
2484	if (r == -EWOULDBLOCK) {
2485		cell_defer(cache, *cell, true);
2486		return DM_MAPIO_SUBMITTED;
2487
2488	} else if (r) {
2489		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2490		cell_defer(cache, *cell, false);
2491		bio_io_error(bio);
2492		return DM_MAPIO_SUBMITTED;
2493	}
2494
2495	r = DM_MAPIO_REMAPPED;
2496	switch (lookup_result.op) {
2497	case POLICY_HIT:
2498		if (passthrough_mode(&cache->features)) {
2499			if (bio_data_dir(bio) == WRITE) {
2500				/*
2501				 * We need to invalidate this block, so
2502				 * defer for the worker thread.
2503				 */
2504				cell_defer(cache, *cell, true);
2505				r = DM_MAPIO_SUBMITTED;
2506
2507			} else {
2508				inc_miss_counter(cache, bio);
2509				remap_to_origin_clear_discard(cache, bio, block);
2510			}
2511
2512		} else {
2513			inc_hit_counter(cache, bio);
2514			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2515			    !is_dirty(cache, lookup_result.cblock))
2516				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2517			else
2518				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2519		}
2520		break;
2521
2522	case POLICY_MISS:
2523		inc_miss_counter(cache, bio);
2524		if (pb->req_nr != 0) {
2525			/*
2526			 * This is a duplicate writethrough io that is no
2527			 * longer needed because the block has been demoted.
2528			 */
2529			bio_endio(bio, 0);
2530			cell_defer(cache, *cell, false);
2531			r = DM_MAPIO_SUBMITTED;
2532
2533		} else
2534			remap_to_origin_clear_discard(cache, bio, block);
2535
2536		break;
2537
2538	default:
2539		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2540			    (unsigned) lookup_result.op);
2541		cell_defer(cache, *cell, false);
2542		bio_io_error(bio);
2543		r = DM_MAPIO_SUBMITTED;
2544	}
2545
2546	return r;
2547}
2548
2549static int cache_map(struct dm_target *ti, struct bio *bio)
2550{
2551	int r;
	struct dm_bio_prison_cell *cell = NULL;
2553	struct cache *cache = ti->private;
2554
2555	r = __cache_map(cache, bio, &cell);
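	/*
	 * cell remains NULL if __cache_map() remapped the bio without
	 * detaining it (e.g. a partial block at the end of the origin).
	 */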
	if (r == DM_MAPIO_REMAPPED && cell) {
2557		inc_ds(cache, bio, cell);
2558		cell_defer(cache, cell, false);
2559	}
2560
2561	return r;
2562}
2563
2564static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2565{
2566	struct cache *cache = ti->private;
2567	unsigned long flags;
2568	size_t pb_data_size = get_per_bio_data_size(cache);
2569	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2570
2571	if (pb->tick) {
2572		policy_tick(cache->policy);
2573
2574		spin_lock_irqsave(&cache->lock, flags);
2575		cache->need_tick_bio = true;
2576		spin_unlock_irqrestore(&cache->lock, flags);
2577	}
2578
2579	check_for_quiesced_migrations(cache, pb);
2580
2581	return 0;
2582}
2583
2584static int write_dirty_bitset(struct cache *cache)
2585{
	int r;
	unsigned i;
2587
2588	for (i = 0; i < from_cblock(cache->cache_size); i++) {
2589		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2590				       is_dirty(cache, to_cblock(i)));
2591		if (r)
2592			return r;
2593	}
2594
2595	return 0;
2596}
2597
2598static int write_discard_bitset(struct cache *cache)
2599{
	int r;
	unsigned i;
2601
2602	r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
2603					   cache->origin_blocks);
2604	if (r) {
2605		DMERR("could not resize on-disk discard bitset");
2606		return r;
2607	}
2608
2609	for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
2610		r = dm_cache_set_discard(cache->cmd, to_oblock(i),
2611					 is_discarded(cache, to_oblock(i)));
2612		if (r)
2613			return r;
2614	}
2615
2616	return 0;
2617}
2618
2619/*
2620 * returns true on success
2621 */
2622static bool sync_metadata(struct cache *cache)
2623{
2624	int r1, r2, r3, r4;
2625
2626	r1 = write_dirty_bitset(cache);
2627	if (r1)
2628		DMERR("could not write dirty bitset");
2629
2630	r2 = write_discard_bitset(cache);
2631	if (r2)
2632		DMERR("could not write discard bitset");
2633
2634	save_stats(cache);
2635
2636	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
2637	if (r3)
2638		DMERR("could not write hints");
2639
2640	/*
2641	 * If writing the above metadata failed, we still commit, but don't
2642	 * set the clean shutdown flag.  This will effectively force every
2643	 * dirty bit to be set on reload.
2644	 */
2645	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2646	if (r4)
2647		DMERR("could not write cache metadata.  Data loss may occur.");
2648
2649	return !r1 && !r2 && !r3 && !r4;
2650}
2651
2652static void cache_postsuspend(struct dm_target *ti)
2653{
2654	struct cache *cache = ti->private;
2655
2656	start_quiescing(cache);
2657	wait_for_migrations(cache);
2658	stop_worker(cache);
2659	requeue_deferred_io(cache);
2660	stop_quiescing(cache);
2661
2662	(void) sync_metadata(cache);
2663}
2664
2665static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2666			bool dirty, uint32_t hint, bool hint_valid)
2667{
2668	int r;
2669	struct cache *cache = context;
2670
2671	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2672	if (r)
2673		return r;
2674
2675	if (dirty)
2676		set_dirty(cache, oblock, cblock);
2677	else
2678		clear_dirty(cache, oblock, cblock);
2679
2680	return 0;
2681}
2682
2683static int load_discard(void *context, sector_t discard_block_size,
2684			dm_oblock_t oblock, bool discard)
2685{
2686	struct cache *cache = context;
2687
2688	if (discard)
2689		set_discard(cache, oblock);
2690	else
2691		clear_discard(cache, oblock);
2692
2693	return 0;
2694}
2695
2696static dm_cblock_t get_cache_dev_size(struct cache *cache)
2697{
2698	sector_t size = get_dev_size(cache->cache_dev);
2699	(void) sector_div(size, cache->sectors_per_block);
2700	return to_cblock(size);
2701}
2702
2703static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2704{
2705	if (from_cblock(new_size) > from_cblock(cache->cache_size))
2706		return true;
2707
2708	/*
2709	 * We can't drop a dirty block when shrinking the cache.
2710	 */
2711	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2712		new_size = to_cblock(from_cblock(new_size) + 1);
2713		if (is_dirty(cache, new_size)) {
2714			DMERR("unable to shrink cache; cache block %llu is dirty",
2715			      (unsigned long long) from_cblock(new_size));
2716			return false;
2717		}
2718	}
2719
2720	return true;
2721}
2722
2723static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2724{
2725	int r;
2726
2727	r = dm_cache_resize(cache->cmd, new_size);
2728	if (r) {
2729		DMERR("could not resize cache metadata");
2730		return r;
2731	}
2732
2733	cache->cache_size = new_size;
2734
2735	return 0;
2736}
2737
2738static int cache_preresume(struct dm_target *ti)
2739{
2740	int r = 0;
2741	struct cache *cache = ti->private;
2742	dm_cblock_t csize = get_cache_dev_size(cache);
2743
2744	/*
2745	 * Check to see if the cache has resized.
2746	 */
2747	if (!cache->sized) {
2748		r = resize_cache_dev(cache, csize);
2749		if (r)
2750			return r;
2751
2752		cache->sized = true;
2753
2754	} else if (csize != cache->cache_size) {
2755		if (!can_resize(cache, csize))
2756			return -EINVAL;
2757
2758		r = resize_cache_dev(cache, csize);
2759		if (r)
2760			return r;
2761	}
2762
2763	if (!cache->loaded_mappings) {
2764		r = dm_cache_load_mappings(cache->cmd, cache->policy,
2765					   load_mapping, cache);
2766		if (r) {
2767			DMERR("could not load cache mappings");
2768			return r;
2769		}
2770
2771		cache->loaded_mappings = true;
2772	}
2773
2774	if (!cache->loaded_discards) {
2775		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2776		if (r) {
2777			DMERR("could not load origin discards");
2778			return r;
2779		}
2780
2781		cache->loaded_discards = true;
2782	}
2783
2784	return r;
2785}
2786
2787static void cache_resume(struct dm_target *ti)
2788{
2789	struct cache *cache = ti->private;
2790
2791	cache->need_tick_bio = true;
2792	do_waker(&cache->waker.work);
2793}
2794
2795/*
2796 * Status format:
2797 *
2798 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
2799 * <cache block size> <#used cache blocks>/<#total cache blocks>
2800 * <#read hits> <#read misses> <#write hits> <#write misses>
2801 * <#demotions> <#promotions> <#dirty>
2802 * <#features> <features>*
2803 * <#core args> <core args>
2804 * <policy name> <#policy args> <policy args>*
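 *
 * For example (all figures are illustrative only):
 *
 *   8 27/24576 512 516/131072 33 604 412 121 0 516 0 1 writeback 2
 *   migration_threshold 2048 mq 10 random_threshold 4 sequential_threshold
 *   512 discard_promote_adjustment 1 read_promote_adjustment 4
 *   write_promote_adjustment 8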
2805 */
2806static void cache_status(struct dm_target *ti, status_type_t type,
2807			 unsigned status_flags, char *result, unsigned maxlen)
2808{
2809	int r = 0;
2810	unsigned i;
2811	ssize_t sz = 0;
2812	dm_block_t nr_free_blocks_metadata = 0;
2813	dm_block_t nr_blocks_metadata = 0;
2814	char buf[BDEVNAME_SIZE];
2815	struct cache *cache = ti->private;
2816	dm_cblock_t residency;
2817
2818	switch (type) {
2819	case STATUSTYPE_INFO:
2820		/* Commit to ensure statistics aren't out-of-date */
2821		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2822			r = dm_cache_commit(cache->cmd, false);
2823			if (r)
2824				DMERR("could not commit metadata for accurate status");
2825		}
2826
2827		r = dm_cache_get_free_metadata_block_count(cache->cmd,
2828							   &nr_free_blocks_metadata);
2829		if (r) {
2830			DMERR("could not get metadata free block count");
2831			goto err;
2832		}
2833
2834		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2835		if (r) {
2836			DMERR("could not get metadata device size");
2837			goto err;
2838		}
2839
2840		residency = policy_residency(cache->policy);
2841
2842		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
2843		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
2844		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2845		       (unsigned long long)nr_blocks_metadata,
2846		       cache->sectors_per_block,
2847		       (unsigned long long) from_cblock(residency),
2848		       (unsigned long long) from_cblock(cache->cache_size),
2849		       (unsigned) atomic_read(&cache->stats.read_hit),
2850		       (unsigned) atomic_read(&cache->stats.read_miss),
2851		       (unsigned) atomic_read(&cache->stats.write_hit),
2852		       (unsigned) atomic_read(&cache->stats.write_miss),
2853		       (unsigned) atomic_read(&cache->stats.demotion),
2854		       (unsigned) atomic_read(&cache->stats.promotion),
2855		       (unsigned long) atomic_read(&cache->nr_dirty));
2856
2857		if (writethrough_mode(&cache->features))
2858			DMEMIT("1 writethrough ");
2859
2860		else if (passthrough_mode(&cache->features))
2861			DMEMIT("1 passthrough ");
2862
2863		else if (writeback_mode(&cache->features))
2864			DMEMIT("1 writeback ");
2865
2866		else {
2867			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
2868			goto err;
2869		}
2870
2871		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2872
2873		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
2874		if (sz < maxlen) {
2875			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2876			if (r)
2877				DMERR("policy_emit_config_values returned %d", r);
2878		}
2879
2880		break;
2881
2882	case STATUSTYPE_TABLE:
2883		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2884		DMEMIT("%s ", buf);
2885		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2886		DMEMIT("%s ", buf);
2887		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2888		DMEMIT("%s", buf);
2889
2890		for (i = 0; i < cache->nr_ctr_args - 1; i++)
2891			DMEMIT(" %s", cache->ctr_args[i]);
2892		if (cache->nr_ctr_args)
2893			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2894	}
2895
2896	return;
2897
2898err:
2899	DMEMIT("Error");
2900}
2901
2902/*
2903 * A cache block range can take two forms:
2904 *
2905 * i) A single cblock, eg. '3456'
 * ii) A begin and end cblock with a hyphen between, eg. 123-234
2907 */
2908static int parse_cblock_range(struct cache *cache, const char *str,
2909			      struct cblock_range *result)
2910{
2911	char dummy;
2912	uint64_t b, e;
2913	int r;
2914
2915	/*
2916	 * Try and parse form (ii) first.
2917	 */
2918	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
2919	if (r < 0)
2920		return r;
2921
2922	if (r == 2) {
2923		result->begin = to_cblock(b);
2924		result->end = to_cblock(e);
2925		return 0;
2926	}
2927
2928	/*
2929	 * That didn't work, try form (i).
2930	 */
2931	r = sscanf(str, "%llu%c", &b, &dummy);
2932	if (r < 0)
2933		return r;
2934
2935	if (r == 1) {
2936		result->begin = to_cblock(b);
2937		result->end = to_cblock(from_cblock(result->begin) + 1u);
2938		return 0;
2939	}
2940
2941	DMERR("invalid cblock range '%s'", str);
2942	return -EINVAL;
2943}
2944
2945static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
2946{
2947	uint64_t b = from_cblock(range->begin);
2948	uint64_t e = from_cblock(range->end);
2949	uint64_t n = from_cblock(cache->cache_size);
2950
2951	if (b >= n) {
2952		DMERR("begin cblock out of range: %llu >= %llu", b, n);
2953		return -EINVAL;
2954	}
2955
2956	if (e > n) {
2957		DMERR("end cblock out of range: %llu > %llu", e, n);
2958		return -EINVAL;
2959	}
2960
2961	if (b >= e) {
2962		DMERR("invalid cblock range: %llu >= %llu", b, e);
2963		return -EINVAL;
2964	}
2965
2966	return 0;
2967}
2968
2969static int request_invalidation(struct cache *cache, struct cblock_range *range)
2970{
2971	struct invalidation_request req;
2972
2973	INIT_LIST_HEAD(&req.list);
2974	req.cblocks = range;
2975	atomic_set(&req.complete, 0);
2976	req.err = 0;
2977	init_waitqueue_head(&req.result_wait);
2978
2979	spin_lock(&cache->invalidation_lock);
2980	list_add(&req.list, &cache->invalidation_requests);
2981	spin_unlock(&cache->invalidation_lock);
2982	wake_worker(cache);
2983
2984	wait_event(req.result_wait, atomic_read(&req.complete));
2985	return req.err;
2986}
2987
2988static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
2989					      const char **cblock_ranges)
2990{
2991	int r = 0;
2992	unsigned i;
2993	struct cblock_range range;
2994
2995	if (!passthrough_mode(&cache->features)) {
2996		DMERR("cache has to be in passthrough mode for invalidation");
2997		return -EPERM;
2998	}
2999
3000	for (i = 0; i < count; i++) {
3001		r = parse_cblock_range(cache, cblock_ranges[i], &range);
3002		if (r)
3003			break;
3004
3005		r = validate_cblock_range(cache, &range);
3006		if (r)
3007			break;
3008
3009		/*
		 * Pass the begin and end cache blocks to the worker and wake it.
3011		 */
3012		r = request_invalidation(cache, &range);
3013		if (r)
3014			break;
3015	}
3016
3017	return r;
3018}
3019
3020/*
3021 * Supports
3022 *	"<key> <value>"
3023 * and
 *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3025 *
3026 * The key migration_threshold is supported by the cache target core.
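 *
 * Example usage (the device name is illustrative; invalidate_cblocks is
 * only permitted while the cache is in passthrough mode):
 *
 *   dmsetup message my-cache 0 migration_threshold 4096
 *   dmsetup message my-cache 0 invalidate_cblocks 2345 3300-3400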
3027 */
3028static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3029{
3030	struct cache *cache = ti->private;
3031
3032	if (!argc)
3033		return -EINVAL;
3034
3035	if (!strcasecmp(argv[0], "invalidate_cblocks"))
3036		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3037
3038	if (argc != 2)
3039		return -EINVAL;
3040
3041	return set_config_value(cache, argv[0], argv[1]);
3042}
3043
3044static int cache_iterate_devices(struct dm_target *ti,
3045				 iterate_devices_callout_fn fn, void *data)
3046{
3047	int r = 0;
3048	struct cache *cache = ti->private;
3049
3050	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3051	if (!r)
3052		r = fn(ti, cache->origin_dev, 0, ti->len, data);
3053
3054	return r;
3055}
3056
3057/*
3058 * We assume I/O is going to the origin (which is the volume
3059 * more likely to have restrictions e.g. by being striped).
3060 * (Looking up the exact location of the data would be expensive
3061 * and could always be out of date by the time the bio is submitted.)
3062 */
3063static int cache_bvec_merge(struct dm_target *ti,
3064			    struct bvec_merge_data *bvm,
3065			    struct bio_vec *biovec, int max_size)
3066{
3067	struct cache *cache = ti->private;
3068	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
3069
3070	if (!q->merge_bvec_fn)
3071		return max_size;
3072
3073	bvm->bi_bdev = cache->origin_dev->bdev;
3074	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3075}
3076
3077static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3078{
3079	/*
3080	 * FIXME: these limits may be incompatible with the cache device
3081	 */
3082	limits->max_discard_sectors = cache->sectors_per_block;
3083	limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
3084}
3085
3086static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3087{
3088	struct cache *cache = ti->private;
3089	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3090
3091	/*
3092	 * If the system-determined stacked limits are compatible with the
3093	 * cache's blocksize (io_opt is a factor) do not override them.
3094	 */
3095	if (io_opt_sectors < cache->sectors_per_block ||
3096	    do_div(io_opt_sectors, cache->sectors_per_block)) {
3097		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3098		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3099	}
3100	set_discard_limits(cache, limits);
3101}
3102
3103/*----------------------------------------------------------------*/
3104
3105static struct target_type cache_target = {
3106	.name = "cache",
3107	.version = {1, 5, 0},
3108	.module = THIS_MODULE,
3109	.ctr = cache_ctr,
3110	.dtr = cache_dtr,
3111	.map = cache_map,
3112	.end_io = cache_end_io,
3113	.postsuspend = cache_postsuspend,
3114	.preresume = cache_preresume,
3115	.resume = cache_resume,
3116	.status = cache_status,
3117	.message = cache_message,
3118	.iterate_devices = cache_iterate_devices,
3119	.merge = cache_bvec_merge,
3120	.io_hints = cache_io_hints,
3121};
3122
3123static int __init dm_cache_init(void)
3124{
3125	int r;
3126
3127	r = dm_register_target(&cache_target);
3128	if (r) {
3129		DMERR("cache target registration failed: %d", r);
3130		return r;
3131	}
3132
3133	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3134	if (!migration_cache) {
3135		dm_unregister_target(&cache_target);
3136		return -ENOMEM;
3137	}
3138
3139	return 0;
3140}
3141
3142static void __exit dm_cache_exit(void)
3143{
3144	dm_unregister_target(&cache_target);
3145	kmem_cache_destroy(migration_cache);
3146}
3147
3148module_init(dm_cache_init);
3149module_exit(dm_cache_exit);
3150
3151MODULE_DESCRIPTION(DM_NAME " cache target");
3152MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3153MODULE_LICENSE("GPL");
3154