scrub.c revision 1bc8779349d6278e2713a1ff94418c2a6746a791
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies their checksums. In case a bad
 * checksum is found or the extent cannot be read, good data will be written
 * back if any can be found.
 *
 * Future enhancements:
 *  - To enhance the performance, better read-ahead strategies for the
 *    extent-tree can be employed.
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them.
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy.
 *  - Track and record media errors, throw out bad devices.
 *  - Add a mode to also read unallocated space.
 *  - Make the prefetch cancellable.
 */

struct scrub_bio;
struct scrub_page;
struct scrub_dev;
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);

#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */

struct scrub_page {
	u64			flags;  /* extent flags */
	u64			generation;
	u64			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

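/*
 * Free the pages attached to a scrub bio and the bio itself. Consecutive
 * bio_vecs may reference the same page, so a page is only freed the first
 * time it is seen.
 */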
static void scrub_free_bio(struct bio *bio)
{
	int i;
	struct page *last_page = NULL;

	if (!bio)
		return;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		if (bio->bi_io_vec[i].bv_page == last_page)
			continue;
		last_page = bio->bi_io_vec[i].bv_page;
		__free_page(last_page);
	}
	bio_put(bio);
}

static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];

		if (!sbio)
			break;

		scrub_free_bio(sbio->bio);
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}

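/*
 * Allocate a scrub_dev for one device and pre-allocate SCRUB_BIOS_PER_DEV
 * scrub_bios, linked into a free list via next_free. Returns
 * ERR_PTR(-ENOMEM) on allocation failure.
 */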
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int		i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}


/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * scrub_recheck_error gets called for every page in the bio, even though
 * only one may be bad.
 */
static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return;
		}
	}

	scrub_fixup(sbio, ix);
}

static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	} else {
		WARN_ON(1);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}

static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

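/*
 * Try to repair a bad page: read the same logical block from another mirror
 * and, if a copy passes the checks, write it back to the scrubbed device
 * (unless the scrub is read-only). Updates the corrected/uncorrectable
 * error counters accordingly.
 */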
static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		kfree(multi);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: fixed up at %llu\n",
		       (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
			 (unsigned long long)logical);
}

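/*
 * Synchronously read or write a single page at the given sector. Returns 0
 * on success and 1 if the bio did not complete successfully.
 */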
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			 struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	/* we are going to wait on this IO */
	rw |= REQ_SYNC;

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}

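/*
 * Completion callback for scrub read bios: remember the error state and
 * hand the bio over to the scrub worker for checksum verification.
 */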
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}

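/*
 * Worker function: verify the checksums of all pages in a completed scrub
 * bio. Read errors and checksum failures are handed to scrub_recheck_error
 * for repair. Finally the bio is freed and the scrub_bio is put back on the
 * free list.
 */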
static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		for (i = 0; i < sbio->count; ++i)
			scrub_recheck_error(sbio, i);

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}

		spin_lock(&sdev->stat_lock);
		++sdev->stat.read_errors;
		spin_unlock(&sdev->stat_lock);
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			BUG_ON(i);
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret)
			scrub_recheck_error(sbio, i);
	}

out:
	scrub_free_bio(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}

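/*
 * Verify the checksum of one data page against the csum recorded for it.
 * Returns 0 if the page is good or has no csum, 1 on a mismatch; also
 * updates the data scrub statistics.
 */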
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}

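/*
 * Verify a tree block page: check bytenr, generation, fsid and chunk tree
 * uuid in the header as well as the block checksum. Returns nonzero if any
 * of the checks fail and updates the tree scrub statistics.
 */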
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}

static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * If we find an error in a super block, we just report it.
		 * Super blocks get rewritten with the next transaction
		 * commit anyway.
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}

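/*
 * Allocate a read bio for the currently filled scrub_bio, attach one freshly
 * allocated page per queued scrub_page and submit it. Completion is handled
 * in scrub_bio_end_io/scrub_checksum.
 */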
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;
	struct bio *bio;
	int i;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	bio = bio_alloc(GFP_NOFS, sbio->count);
	if (!bio)
		goto nomem;

	bio->bi_private = sbio;
	bio->bi_end_io = scrub_bio_end_io;
	bio->bi_bdev = sdev->dev->bdev;
	bio->bi_sector = sbio->physical >> 9;

	for (i = 0; i < sbio->count; ++i) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_NOFS);
		if (!page)
			goto nomem;

		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
		if (!ret) {
			__free_page(page);
			goto nomem;
		}
	}

	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(READ, bio);

	return 0;

nomem:
	scrub_free_bio(bio);

	return -ENOMEM;
}

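/*
 * Queue one page worth of scrub work into the current scrub_bio. A new
 * scrub_bio is taken from the free list if necessary. The bio is submitted
 * when it is full, when the new page is not physically/logically contiguous
 * with it, or when 'force' is set.
 */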
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
	}

	return 0;
}

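/*
 * Look up the data checksum for 'logical' in the list collected by
 * btrfs_lookup_csums_range, discarding entries that end before 'logical'
 * along the way. Returns 1 and copies the csum if one was found, 0
 * otherwise.
 */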
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}

/* scrub_extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}

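/*
 * Scrub one stripe of a chunk on this device: prefetch the extent tree
 * leaves, collect the data checksums for the stripe and then walk all
 * extent items that intersect it, scrubbing them page by page. Cancel and
 * pause requests are honored between stripes.
 */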
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	int start_stripe;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * find all extents for each stripe and just read them to get
	 * them into the page cache
	 * FIXME: we can do better, build more intelligent prefetching
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid >= logical + map->stripe_len)
				break;

			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		cond_resched();
	}

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (with crc32) end up being about 1MB
	 */
	start_stripe = 0;
again:
	logical = base + offset + start_stripe * increment;
	for (i = start_stripe; i < nstripes; ++i) {
		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		logical += increment;
		cond_resched();
	}
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset + start_stripe * increment;
	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
	ret = 0;
	for (i = start_stripe; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
			scrub_free_csums(sdev);
			start_stripe = i;
			goto again;
		}

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

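/*
 * Scrub the portion of a chunk that resides on this device: look up the
 * chunk mapping and run scrub_stripe for every stripe backed by sdev->dev.
 */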
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}

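/*
 * Walk the dev extents of the scrubbed device in the given range and scrub
 * the corresponding chunks one by one. A reference on the block group is
 * held while a chunk is being scrubbed so it cannot go away.
 */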
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		ret = 0;

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			goto out;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

out:
	btrfs_free_path(path);
	return ret;
}

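/*
 * Scrub all super block mirrors of the device that fit inside its size,
 * using the generation of the last committed transaction, and wait for the
 * outstanding I/O to finish.
 */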
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int	ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}

/*
 * Get a reference count on fs_info->scrub_workers; start the worker thread
 * if necessary.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0)
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}

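/*
 * Scrub one device of the filesystem in the given device offset range:
 * scrub the super blocks first, then enumerate and scrub all chunks on the
 * device. 'progress' is filled with the accumulated statistics, and
 * 'readonly' suppresses the writeback of repaired data.
 */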
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (root->fs_info->closing)
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}

int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}

int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}

int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}