scrub.c revision 00d01bc17cc2807292303961519d9c005794eb1d
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - To enhance the performance, better read-ahead strategies for the
 *    extent-tree can be employed.
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 *  - make the prefetch cancellable
 */

struct scrub_bio;
struct scrub_page;
struct scrub_dev;
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);

#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */

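/*
 * per-page state attached to a scrub_bio: the extent flags and generation
 * of the extent the page belongs to, the mirror it was read from and, for
 * data extents, the expected checksum (if one was found in the csum tree)
 */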
struct scrub_page {
	u64			flags;  /* extent flags */
	u64			generation;
	u64			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

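/*
 * one read request of up to SCRUB_PAGES_PER_BIO pages: the bio itself, the
 * logical/physical start of the range it covers, per-page metadata, the
 * link into the free list (next_free) and the work item used to defer the
 * checksum verification to the scrub worker
 */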
struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

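/*
 * per-device scrub context: the fixed pool of scrub_bios, the free list
 * (first_free/next_free), the bio currently being filled (curr), the number
 * of bios in flight, the list of data checksums for the current stripe,
 * the cancel request flag and the statistics reported to user space
 */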
struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;
	int j;
	struct page *last_page;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];
		struct bio *bio;

		if (!sbio)
			break;

		bio = sbio->bio;
		if (bio) {
			last_page = NULL;
			for (j = 0; j < bio->bi_vcnt; ++j) {
				if (bio->bi_io_vec[j].bv_page == last_page)
					continue;
				last_page = bio->bi_io_vec[j].bv_page;
				__free_page(last_page);
			}
			bio_put(bio);
		}
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}

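/*
 * allocate the per-device scrub context and pre-allocate SCRUB_BIOS_PER_DEV
 * bios, each already populated with SCRUB_PAGES_PER_BIO pages, and link them
 * into the free list
 */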
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int		i;
	int		j;
	int		ret;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct bio *bio;
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
		if (!bio)
			goto nomem;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->bio = bio;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;
		bio->bi_private = sdev->bios[i];
		bio->bi_end_io = scrub_bio_end_io;
		bio->bi_sector = 0;
		bio->bi_bdev = dev->bdev;
		bio->bi_size = 0;

		for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
			struct page *page;
			page = alloc_page(GFP_NOFS);
			if (!page)
				goto nomem;

			ret = bio_add_page(bio, page, PAGE_SIZE, 0);
			if (!ret)
				goto nomem;
		}
		WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}

/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * one may be bad.
 */
static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return;
		}
	}

	scrub_fixup(sbio, ix);
}

static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	} else {
		WARN_ON(1);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}

static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

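/*
 * try to repair a bad page: find a good copy on another mirror, verify it
 * and, unless the scrub runs readonly, write it back over the bad copy on
 * the device being scrubbed. Updates the corrected/uncorrectable counters.
 */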
static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		/* don't leak the mapping on this error path */
		kfree(multi);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: fixed up at %llu\n",
		       (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
			 (unsigned long long)logical);
}

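/*
 * synchronous helper for the repair path: read or write a single page at
 * the given sector and wait for it to complete. Returns 0 on success,
 * nonzero on I/O error.
 */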
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			 struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	/* we are going to wait on this IO */
	rw |= REQ_SYNC;

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}

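/*
 * bio completion callback; may run in interrupt context, so just record the
 * error and defer the actual verification to the scrub worker
 */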
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}

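/*
 * worker function: if the bio failed, re-read and try to repair every page
 * individually; otherwise verify the checksum of each page and attempt a
 * fixup for the ones that fail. Finally put the sbio back on the free list
 * and wake up anyone waiting for a free slot.
 */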
static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		for (i = 0; i < sbio->count; ++i)
			scrub_recheck_error(sbio, i);

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}

		spin_lock(&sdev->stat_lock);
		++sdev->stat.read_errors;
		spin_unlock(&sdev->stat_lock);
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			BUG_ON(i);
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret)
			scrub_recheck_error(sbio, i);
	}

out:
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}

static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}

static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}

static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * Super blocks get rewritten with every transaction commit
		 * anyway, so no repair is attempted here.
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}

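/*
 * send the sbio that is currently being filled to disk and mark it as
 * in flight; a no-op if there is no partially filled sbio
 */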
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	sbio->bio->bi_sector = sbio->physical >> 9;
	sbio->bio->bi_size = sbio->count * PAGE_SIZE;
	sbio->bio->bi_next = NULL;
	sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
	sbio->bio->bi_comp_cpu = -1;
	sbio->bio->bi_bdev = sdev->dev->bdev;
	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(0, sbio->bio);

	return 0;
}

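/*
 * queue one page for scrubbing: grab a free sbio if none is being filled
 * (waiting if necessary), submit the current sbio when the new page is not
 * contiguous with it, and record the per-page metadata. The sbio is sent
 * to disk once it is full or when 'force' is set.
 */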
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		scrub_submit(sdev);
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force)
		scrub_submit(sdev);

	return 0;
}

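/*
 * look up the expected checksum for a data block at 'logical' in the list
 * collected by btrfs_lookup_csums_range; entries that end before 'logical'
 * are dropped. Returns 1 and copies the checksum into 'csum' on success,
 * 0 if no checksum is known for this block.
 */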
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}

/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}

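/*
 * scrub one stripe of a chunk on this device: prefetch the extent tree
 * leaves and the data checksums for the stripe, then walk all extent items
 * that intersect it and scrub them, honoring cancel and pause requests
 * along the way
 */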
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	int start_stripe;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * find all extents for each stripe and just read them to get
	 * them into the page cache
	 * FIXME: we can do better, build more intelligent prefetching
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid >= logical + map->stripe_len)
				break;

			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		cond_resched();
	}

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB
	 */
	start_stripe = 0;
again:
	logical = base + offset + start_stripe * increment;
	for (i = start_stripe; i < nstripes; ++i) {
		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		logical += increment;
		cond_resched();
	}
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset + start_stripe * increment;
	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
	ret = 0;
	for (i = start_stripe; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
			scrub_free_csums(sdev);
			start_stripe = i;
			goto again;
		}

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

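/*
 * look up the chunk mapping for this block group and scrub every stripe
 * of it that is stored on the device being scrubbed
 */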
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}

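/*
 * walk the DEV_EXTENT items of this device in the range [start, end) and
 * scrub the corresponding chunk for each of them, holding a block group
 * reference while a chunk is being scrubbed
 */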
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		ret = 0;

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			goto out;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

out:
	btrfs_free_path(path);
	return ret;
}

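/*
 * scrub all super block copies that fit on the device; the expected
 * generation is taken from the last committed transaction
 */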
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int	ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}

/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0)
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}

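/*
 * entry point for a scrub of a single device: set up the scrub context,
 * scrub the super blocks and then all chunks in [start, end), and copy the
 * final statistics to 'progress'
 */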
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (root->fs_info->closing)
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}

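/*
 * ask all running scrubs to pause and wait until every one of them has
 * reached the pause point; resumed again via btrfs_scrub_continue()
 */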
int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}

int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}

int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}
1370