scrub.c revision cf93dccea67ad8f5e0d9163c6a0a584550bbd7cd
1/*
2 * Copyright (C) 2011 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
21#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
28#include "check-integrity.h"
29#include "rcu-string.h"
30
31/*
32 * This is only the first step towards a full-featured scrub. It reads all
33 * extents and super blocks and verifies the checksums. In case a bad checksum
34 * is found or the extent cannot be read, good data will be written back if
35 * any can be found.
36 *
37 * Future enhancements:
38 *  - In case an unrepairable extent is encountered, track which files are
39 *    affected and report them
40 *  - track and record media errors, throw out bad devices
41 *  - add a mode to also read unallocated space
42 */
43
44struct scrub_block;
45struct scrub_dev;
46
47#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
48#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
49#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
50
51struct scrub_page {
52	struct scrub_block	*sblock;
53	struct page		*page;
54	struct btrfs_device	*dev;
55	u64			flags;  /* extent flags */
56	u64			generation;
57	u64			logical;
58	u64			physical;
59	struct {
60		unsigned int	mirror_num:8;
61		unsigned int	have_csum:1;
62		unsigned int	io_error:1;
63	};
64	u8			csum[BTRFS_CSUM_SIZE];
65};
66
67struct scrub_bio {
68	int			index;
69	struct scrub_dev	*sdev;
70	struct bio		*bio;
71	int			err;
72	u64			logical;
73	u64			physical;
74	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
75	int			page_count;
76	int			next_free;
77	struct btrfs_work	work;
78};
79
80struct scrub_block {
81	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82	int			page_count;
83	atomic_t		outstanding_pages;
84	atomic_t		ref_count; /* free mem on transition to zero */
85	struct scrub_dev	*sdev;
86	struct {
87		unsigned int	header_error:1;
88		unsigned int	checksum_error:1;
89		unsigned int	no_io_error_seen:1;
90		unsigned int	generation_error:1; /* also sets header_error */
91	};
92};
93
94struct scrub_dev {
95	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
96	struct btrfs_device	*dev;
97	int			first_free;
98	int			curr;
99	atomic_t		in_flight;
100	atomic_t		fixup_cnt;
101	spinlock_t		list_lock;
102	wait_queue_head_t	list_wait;
103	u16			csum_size;
104	struct list_head	csum_list;
105	atomic_t		cancel_req;
106	int			readonly;
107	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
108	u32			sectorsize;
109	u32			nodesize;
110	u32			leafsize;
111	/*
112	 * statistics
113	 */
114	struct btrfs_scrub_progress stat;
115	spinlock_t		stat_lock;
116};
117
118struct scrub_fixup_nodatasum {
119	struct scrub_dev	*sdev;
120	u64			logical;
121	struct btrfs_root	*root;
122	struct btrfs_work	work;
123	int			mirror_num;
124};
125
126struct scrub_warning {
127	struct btrfs_path	*path;
128	u64			extent_item_size;
129	char			*scratch_buf;
130	char			*msg_buf;
131	const char		*errstr;
132	sector_t		sector;
133	u64			logical;
134	struct btrfs_device	*dev;
135	int			msg_bufsize;
136	int			scratch_bufsize;
137};
138
139
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev,
142				     struct btrfs_mapping_tree *map_tree,
143				     u64 length, u64 logical,
144				     struct scrub_block *sblock);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
146			       struct scrub_block *sblock, int is_metadata,
147			       int have_csum, u8 *csum, u64 generation,
148			       u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150					 struct scrub_block *sblock,
151					 int is_metadata, int have_csum,
152					 const u8 *csum, u64 generation,
153					 u16 csum_size);
154static void scrub_complete_bio_end_io(struct bio *bio, int err);
155static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
156					     struct scrub_block *sblock_good,
157					     int force_write);
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159					    struct scrub_block *sblock_good,
160					    int page_num, int force_write);
161static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev,
167				 struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
169		       u64 physical, u64 flags, u64 gen, int mirror_num,
170		       u8 *csum, int force);
171static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock);
174
175
176static void scrub_free_csums(struct scrub_dev *sdev)
177{
178	while (!list_empty(&sdev->csum_list)) {
179		struct btrfs_ordered_sum *sum;
180		sum = list_first_entry(&sdev->csum_list,
181				       struct btrfs_ordered_sum, list);
182		list_del(&sum->list);
183		kfree(sum);
184	}
185}
186
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
188{
189	int i;
190
191	if (!sdev)
192		return;
193
194	/* this can happen when scrub is cancelled */
195	if (sdev->curr != -1) {
196		struct scrub_bio *sbio = sdev->bios[sdev->curr];
197
198		for (i = 0; i < sbio->page_count; i++) {
199			BUG_ON(!sbio->pagev[i]);
200			BUG_ON(!sbio->pagev[i]->page);
201			scrub_block_put(sbio->pagev[i]->sblock);
202		}
203		bio_put(sbio->bio);
204	}
205
206	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
207		struct scrub_bio *sbio = sdev->bios[i];
208
209		if (!sbio)
210			break;
211		kfree(sbio);
212	}
213
214	scrub_free_csums(sdev);
215	kfree(sdev);
216}
217
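/*
 * Allocate and initialize the per-device scrub context: the fixed pool of
 * scrub_bios is chained together through next_free, and the block sizes and
 * checksum size are taken from the device's root and the superblock copy.
 */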
218static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
220{
221	struct scrub_dev *sdev;
222	int		i;
223	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224	int pages_per_bio;
225
226	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
227			      bio_get_nr_vecs(dev->bdev));
228	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
229	if (!sdev)
230		goto nomem;
231	sdev->dev = dev;
232	sdev->pages_per_bio = pages_per_bio;
233	sdev->curr = -1;
234	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
235		struct scrub_bio *sbio;
236
237		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238		if (!sbio)
239			goto nomem;
240		sdev->bios[i] = sbio;
241
242		sbio->index = i;
243		sbio->sdev = sdev;
244		sbio->page_count = 0;
245		sbio->work.func = scrub_bio_end_io_worker;
246
247		if (i != SCRUB_BIOS_PER_DEV-1)
248			sdev->bios[i]->next_free = i + 1;
249		else
250			sdev->bios[i]->next_free = -1;
251	}
252	sdev->first_free = 0;
253	sdev->nodesize = dev->dev_root->nodesize;
254	sdev->leafsize = dev->dev_root->leafsize;
255	sdev->sectorsize = dev->dev_root->sectorsize;
256	atomic_set(&sdev->in_flight, 0);
257	atomic_set(&sdev->fixup_cnt, 0);
258	atomic_set(&sdev->cancel_req, 0);
259	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260	INIT_LIST_HEAD(&sdev->csum_list);
261
262	spin_lock_init(&sdev->list_lock);
263	spin_lock_init(&sdev->stat_lock);
264	init_waitqueue_head(&sdev->list_wait);
265	return sdev;
266
267nomem:
268	scrub_free_dev(sdev);
269	return ERR_PTR(-ENOMEM);
270}
271
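/*
 * Callback for iterate_extent_inodes(): resolve one inode that references the
 * errored extent to its filesystem path(s) and print a warning line for each
 * path that could be resolved.
 */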
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
273{
274	u64 isize;
275	u32 nlink;
276	int ret;
277	int i;
278	struct extent_buffer *eb;
279	struct btrfs_inode_item *inode_item;
280	struct scrub_warning *swarn = ctx;
281	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282	struct inode_fs_paths *ipath = NULL;
283	struct btrfs_root *local_root;
284	struct btrfs_key root_key;
285
286	root_key.objectid = root;
287	root_key.type = BTRFS_ROOT_ITEM_KEY;
288	root_key.offset = (u64)-1;
289	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
290	if (IS_ERR(local_root)) {
291		ret = PTR_ERR(local_root);
292		goto err;
293	}
294
295	ret = inode_item_info(inum, 0, local_root, swarn->path);
296	if (ret) {
297		btrfs_release_path(swarn->path);
298		goto err;
299	}
300
301	eb = swarn->path->nodes[0];
302	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
303					struct btrfs_inode_item);
304	isize = btrfs_inode_size(eb, inode_item);
305	nlink = btrfs_inode_nlink(eb, inode_item);
306	btrfs_release_path(swarn->path);
307
308	ipath = init_ipath(4096, local_root, swarn->path);
309	if (IS_ERR(ipath)) {
310		ret = PTR_ERR(ipath);
311		ipath = NULL;
312		goto err;
313	}
314	ret = paths_from_inode(inum, ipath);
315
316	if (ret < 0)
317		goto err;
318
319	/*
320	 * we deliberately ignore the fact that ipath might have been too small
321	 * to hold all of the paths here
322	 */
323	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
324		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
325			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
326			"length %llu, links %u (path: %s)\n", swarn->errstr,
327			swarn->logical, rcu_str_deref(swarn->dev->name),
328			(unsigned long long)swarn->sector, root, inum, offset,
329			min(isize - offset, (u64)PAGE_SIZE), nlink,
330			(char *)(unsigned long)ipath->fspath->val[i]);
331
332	free_ipath(ipath);
333	return 0;
334
335err:
336	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
337		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
338		"resolving failed with ret=%d\n", swarn->errstr,
339		swarn->logical, rcu_str_deref(swarn->dev->name),
340		(unsigned long long)swarn->sector, root, inum, offset, ret);
341
342	free_ipath(ipath);
343	return 0;
344}
345
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{
348	struct btrfs_device *dev = sblock->sdev->dev;
349	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
350	struct btrfs_path *path;
351	struct btrfs_key found_key;
352	struct extent_buffer *eb;
353	struct btrfs_extent_item *ei;
354	struct scrub_warning swarn;
355	u32 item_size;
356	int ret;
357	u64 ref_root;
358	u8 ref_level;
359	unsigned long ptr = 0;
360	const int bufsize = 4096;
361	u64 extent_item_pos;
362
363	path = btrfs_alloc_path();
364
365	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
366	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
367	BUG_ON(sblock->page_count < 1);
368	swarn.sector = (sblock->pagev[0].physical) >> 9;
369	swarn.logical = sblock->pagev[0].logical;
370	swarn.errstr = errstr;
371	swarn.dev = dev;
372	swarn.msg_bufsize = bufsize;
373	swarn.scratch_bufsize = bufsize;
374
375	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
376		goto out;
377
378	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
379	if (ret < 0)
380		goto out;
381
382	extent_item_pos = swarn.logical - found_key.objectid;
383	swarn.extent_item_size = found_key.offset;
384
385	eb = path->nodes[0];
386	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
387	item_size = btrfs_item_size_nr(eb, path->slots[0]);
388	btrfs_release_path(path);
389
390	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
391		do {
392			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
393							&ref_root, &ref_level);
394			printk_in_rcu(KERN_WARNING
395				"btrfs: %s at logical %llu on dev %s, "
396				"sector %llu: metadata %s (level %d) in tree "
397				"%llu\n", errstr, swarn.logical,
398				rcu_str_deref(dev->name),
399				(unsigned long long)swarn.sector,
400				ref_level ? "node" : "leaf",
401				ret < 0 ? -1 : ref_level,
402				ret < 0 ? -1 : ref_root);
403		} while (ret != 1);
404	} else {
405		swarn.path = path;
406		iterate_extent_inodes(fs_info, found_key.objectid,
407					extent_item_pos, 1,
408					scrub_print_warning_inode, &swarn);
409	}
410
411out:
412	btrfs_free_path(path);
413	kfree(swarn.scratch_buf);
414	kfree(swarn.msg_buf);
415}
416
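/*
 * Callback for iterate_inodes_from_logical(): force a read of the affected
 * page from the failed mirror (or call repair_io_failure() directly if the
 * page is already uptodate and clean) so that the bad on-disk copy gets
 * rewritten. Returns 1 on success to stop the inode iteration early.
 */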
417static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
418{
419	struct page *page = NULL;
420	unsigned long index;
421	struct scrub_fixup_nodatasum *fixup = ctx;
422	int ret;
423	int corrected = 0;
424	struct btrfs_key key;
425	struct inode *inode = NULL;
426	u64 end = offset + PAGE_SIZE - 1;
427	struct btrfs_root *local_root;
428
429	key.objectid = root;
430	key.type = BTRFS_ROOT_ITEM_KEY;
431	key.offset = (u64)-1;
432	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
433	if (IS_ERR(local_root))
434		return PTR_ERR(local_root);
435
436	key.type = BTRFS_INODE_ITEM_KEY;
437	key.objectid = inum;
438	key.offset = 0;
439	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
440	if (IS_ERR(inode))
441		return PTR_ERR(inode);
442
443	index = offset >> PAGE_CACHE_SHIFT;
444
445	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
446	if (!page) {
447		ret = -ENOMEM;
448		goto out;
449	}
450
451	if (PageUptodate(page)) {
452		struct btrfs_mapping_tree *map_tree;
453		if (PageDirty(page)) {
454			/*
455			 * we need to write the data to the defective sector. the
456			 * data that was in that sector is not in memory,
457			 * because the page was modified. we must not write the
458			 * modified page to that sector.
459			 *
460			 * TODO: what could be done here: wait for the delalloc
461			 *       runner to write out that page (might involve
462			 *       COW) and see whether the sector is still
463			 *       referenced afterwards.
464			 *
465			 * For now, we treat this error as uncorrectable,
466			 * although there is a chance that a later scrub will
467			 * find the bad sector again at a time when there is
468			 * no dirty page in memory, and can then repair it.
469			 */
470			ret = -EIO;
471			goto out;
472		}
473		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
474		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
475					fixup->logical, page,
476					fixup->mirror_num);
477		unlock_page(page);
478		corrected = !ret;
479	} else {
480		/*
481		 * we need to get good data first. the general readpage path
482		 * will call repair_io_failure for us, we just have to make
483		 * sure we read the bad mirror.
484		 */
485		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
486					EXTENT_DAMAGED, GFP_NOFS);
487		if (ret) {
488			/* set_extent_bits should give proper error */
489			WARN_ON(ret > 0);
490			if (ret > 0)
491				ret = -EFAULT;
492			goto out;
493		}
494
495		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
496						btrfs_get_extent,
497						fixup->mirror_num);
498		wait_on_page_locked(page);
499
500		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
501						end, EXTENT_DAMAGED, 0, NULL);
502		if (!corrected)
503			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
504						EXTENT_DAMAGED, GFP_NOFS);
505	}
506
507out:
508	if (page)
509		put_page(page);
510	if (inode)
511		iput(inode);
512
513	if (ret < 0)
514		return ret;
515
516	if (ret == 0 && corrected) {
517		/*
518		 * we only need to call readpage for one of the inodes belonging
519		 * to this extent. so make iterate_extent_inodes stop
520		 */
521		return 1;
522	}
523
524	return -EIO;
525}
526
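/*
 * Worker for the nodatasum fixup path: join a transaction and iterate over
 * all inodes that reference the bad logical address, triggering a re-read of
 * the affected pages so the regular read-repair machinery rewrites the
 * sector if a good copy exists.
 */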
527static void scrub_fixup_nodatasum(struct btrfs_work *work)
528{
529	int ret;
530	struct scrub_fixup_nodatasum *fixup;
531	struct scrub_dev *sdev;
532	struct btrfs_trans_handle *trans = NULL;
533	struct btrfs_fs_info *fs_info;
534	struct btrfs_path *path;
535	int uncorrectable = 0;
536
537	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
538	sdev = fixup->sdev;
539	fs_info = fixup->root->fs_info;
540
541	path = btrfs_alloc_path();
542	if (!path) {
543		spin_lock(&sdev->stat_lock);
544		++sdev->stat.malloc_errors;
545		spin_unlock(&sdev->stat_lock);
546		uncorrectable = 1;
547		goto out;
548	}
549
550	trans = btrfs_join_transaction(fixup->root);
551	if (IS_ERR(trans)) {
552		uncorrectable = 1;
553		goto out;
554	}
555
556	/*
557	 * the idea is to trigger a regular read through the standard path. we
558	 * read a page from the (failed) logical address by specifying the
559	 * corresponding copynum of the failed sector. thus, that readpage is
560	 * expected to fail.
561	 * that is the point where on-the-fly error correction will kick in
562	 * (once it's finished) and rewrite the failed sector if a good copy
563	 * can be found.
564	 */
565	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
566						path, scrub_fixup_readpage,
567						fixup);
568	if (ret < 0) {
569		uncorrectable = 1;
570		goto out;
571	}
572	WARN_ON(ret != 1);
573
574	spin_lock(&sdev->stat_lock);
575	++sdev->stat.corrected_errors;
576	spin_unlock(&sdev->stat_lock);
577
578out:
579	if (trans && !IS_ERR(trans))
580		btrfs_end_transaction(trans, fixup->root);
581	if (uncorrectable) {
582		spin_lock(&sdev->stat_lock);
583		++sdev->stat.uncorrectable_errors;
584		spin_unlock(&sdev->stat_lock);
585
586		printk_ratelimited_in_rcu(KERN_ERR
587			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
588			(unsigned long long)fixup->logical,
589			rcu_str_deref(sdev->dev->name));
590	}
591
592	btrfs_free_path(path);
593	kfree(fixup);
594
595	/* see the caller for why we pretend to be paused in the scrub counters */
596	mutex_lock(&fs_info->scrub_lock);
597	atomic_dec(&fs_info->scrubs_running);
598	atomic_dec(&fs_info->scrubs_paused);
599	mutex_unlock(&fs_info->scrub_lock);
600	atomic_dec(&sdev->fixup_cnt);
601	wake_up(&fs_info->scrub_pause_wait);
602	wake_up(&sdev->list_wait);
603}
604
605/*
606 * scrub_handle_errored_block gets called when either verification of the
607 * pages failed or the bio failed to read, e.g. with EIO. In the latter
608 * case, this function handles all pages in the bio, even though only one
609 * may be bad.
610 * The goal of this function is to repair the errored block by using the
611 * contents of one of the mirrors.
612 */
613static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
614{
615	struct scrub_dev *sdev = sblock_to_check->sdev;
616	struct btrfs_fs_info *fs_info;
617	u64 length;
618	u64 logical;
619	u64 generation;
620	unsigned int failed_mirror_index;
621	unsigned int is_metadata;
622	unsigned int have_csum;
623	u8 *csum;
624	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
625	struct scrub_block *sblock_bad;
626	int ret;
627	int mirror_index;
628	int page_num;
629	int success;
630	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
631				      DEFAULT_RATELIMIT_BURST);
632
633	BUG_ON(sblock_to_check->page_count < 1);
634	fs_info = sdev->dev->dev_root->fs_info;
635	length = sblock_to_check->page_count * PAGE_SIZE;
636	logical = sblock_to_check->pagev[0].logical;
637	generation = sblock_to_check->pagev[0].generation;
638	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
639	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
640	is_metadata = !(sblock_to_check->pagev[0].flags &
641			BTRFS_EXTENT_FLAG_DATA);
642	have_csum = sblock_to_check->pagev[0].have_csum;
643	csum = sblock_to_check->pagev[0].csum;
644
645	/*
646	 * read all mirrors one after the other. This includes
647	 * re-reading the extent or metadata block that failed (which
648	 * is the reason this fixup code is called) another time,
649	 * page by page this time in order to know which pages
650	 * caused I/O errors and which ones are good (for all mirrors).
651	 * The goal is to handle the situation when more than one
652	 * mirror contains I/O errors, but the errors do not
653	 * overlap, i.e. the data can be repaired by selecting the
654	 * pages from those mirrors without I/O error on the
655	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
656	 * would be that mirror #1 has an I/O error on the first page,
657	 * the second page is good, and mirror #2 has an I/O error on
658	 * the second page, but the first page is good.
659	 * Then the first page of the first mirror can be repaired by
660	 * taking the first page of the second mirror, and the
661	 * second page of the second mirror can be repaired by
662	 * copying the contents of the 2nd page of the 1st mirror.
663	 * One more note: if the pages of one mirror contain I/O
664	 * errors, the checksum cannot be verified. In order to get
665	 * the best data for repairing, the first attempt is to find
666	 * a mirror without I/O errors and with a validated checksum.
667	 * Only if this is not possible are the pages picked from
668	 * mirrors with I/O errors without considering the checksum.
669	 * If the latter is the case, at the end, the checksum of the
670	 * repaired area is verified in order to correctly maintain
671	 * the statistics.
672	 */
673
674	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
675				     sizeof(*sblocks_for_recheck),
676				     GFP_NOFS);
677	if (!sblocks_for_recheck) {
678		spin_lock(&sdev->stat_lock);
679		sdev->stat.malloc_errors++;
680		sdev->stat.read_errors++;
681		sdev->stat.uncorrectable_errors++;
682		spin_unlock(&sdev->stat_lock);
683		btrfs_dev_stat_inc_and_print(sdev->dev,
684					     BTRFS_DEV_STAT_READ_ERRS);
685		goto out;
686	}
687
688	/* setup the context, map the logical blocks and alloc the pages */
689	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
690					logical, sblocks_for_recheck);
691	if (ret) {
692		spin_lock(&sdev->stat_lock);
693		sdev->stat.read_errors++;
694		sdev->stat.uncorrectable_errors++;
695		spin_unlock(&sdev->stat_lock);
696		btrfs_dev_stat_inc_and_print(sdev->dev,
697					     BTRFS_DEV_STAT_READ_ERRS);
698		goto out;
699	}
700	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
701	sblock_bad = sblocks_for_recheck + failed_mirror_index;
702
703	/* build and submit the bios for the failed mirror, check checksums */
704	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
705				  csum, generation, sdev->csum_size);
706	if (ret) {
707		spin_lock(&sdev->stat_lock);
708		sdev->stat.read_errors++;
709		sdev->stat.uncorrectable_errors++;
710		spin_unlock(&sdev->stat_lock);
711		btrfs_dev_stat_inc_and_print(sdev->dev,
712					     BTRFS_DEV_STAT_READ_ERRS);
713		goto out;
714	}
715
716	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
717	    sblock_bad->no_io_error_seen) {
718		/*
719		 * the error disappeared after reading page by page, or
720		 * the area was part of a huge bio and other parts of the
721		 * bio caused I/O errors, or the block layer merged several
722		 * read requests into one and the error is caused by a
723		 * different bio (usually one of the two latter cases is
724		 * different bio (usually one of the latter two cases is
725		 */
726		spin_lock(&sdev->stat_lock);
727		sdev->stat.unverified_errors++;
728		spin_unlock(&sdev->stat_lock);
729
730		goto out;
731	}
732
733	if (!sblock_bad->no_io_error_seen) {
734		spin_lock(&sdev->stat_lock);
735		sdev->stat.read_errors++;
736		spin_unlock(&sdev->stat_lock);
737		if (__ratelimit(&_rs))
738			scrub_print_warning("i/o error", sblock_to_check);
739		btrfs_dev_stat_inc_and_print(sdev->dev,
740					     BTRFS_DEV_STAT_READ_ERRS);
741	} else if (sblock_bad->checksum_error) {
742		spin_lock(&sdev->stat_lock);
743		sdev->stat.csum_errors++;
744		spin_unlock(&sdev->stat_lock);
745		if (__ratelimit(&_rs))
746			scrub_print_warning("checksum error", sblock_to_check);
747		btrfs_dev_stat_inc_and_print(sdev->dev,
748					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
749	} else if (sblock_bad->header_error) {
750		spin_lock(&sdev->stat_lock);
751		sdev->stat.verify_errors++;
752		spin_unlock(&sdev->stat_lock);
753		if (__ratelimit(&_rs))
754			scrub_print_warning("checksum/header error",
755					    sblock_to_check);
756		if (sblock_bad->generation_error)
757			btrfs_dev_stat_inc_and_print(sdev->dev,
758				BTRFS_DEV_STAT_GENERATION_ERRS);
759		else
760			btrfs_dev_stat_inc_and_print(sdev->dev,
761				BTRFS_DEV_STAT_CORRUPTION_ERRS);
762	}
763
764	if (sdev->readonly)
765		goto did_not_correct_error;
766
767	if (!is_metadata && !have_csum) {
768		struct scrub_fixup_nodatasum *fixup_nodatasum;
769
770		/*
771		 * !is_metadata and !have_csum, this means that the data
772		 * might not be COW'ed and might be modified
773		 * concurrently. The general strategy of working on the
774		 * commit root does not help in the case where COW is
775		 * not used.
776		 */
777		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
778		if (!fixup_nodatasum)
779			goto did_not_correct_error;
780		fixup_nodatasum->sdev = sdev;
781		fixup_nodatasum->logical = logical;
782		fixup_nodatasum->root = fs_info->extent_root;
783		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
784		/*
785		 * increment scrubs_running to prevent cancel requests from
786		 * completing as long as a fixup worker is running. we must also
787		 * increment scrubs_paused to prevent deadlocking on pause
788		 * requests used for transaction commits (as the worker uses a
789		 * transaction context). it is safe to regard the fixup worker
790		 * as paused for all practical matters. effectively, we only
791		 * prevent cancellation requests from completing.
792		 */
793		mutex_lock(&fs_info->scrub_lock);
794		atomic_inc(&fs_info->scrubs_running);
795		atomic_inc(&fs_info->scrubs_paused);
796		mutex_unlock(&fs_info->scrub_lock);
797		atomic_inc(&sdev->fixup_cnt);
798		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
799		btrfs_queue_worker(&fs_info->scrub_workers,
800				   &fixup_nodatasum->work);
801		goto out;
802	}
803
804	/*
805	 * now build and submit the bios for the other mirrors, check
806	 * checksums
807	 */
808	for (mirror_index = 0;
809	     mirror_index < BTRFS_MAX_MIRRORS &&
810	     sblocks_for_recheck[mirror_index].page_count > 0;
811	     mirror_index++) {
812		if (mirror_index == failed_mirror_index)
813			continue;
814
815		/* build and submit the bios, check checksums */
816		ret = scrub_recheck_block(fs_info,
817					  sblocks_for_recheck + mirror_index,
818					  is_metadata, have_csum, csum,
819					  generation, sdev->csum_size);
820		if (ret)
821			goto did_not_correct_error;
822	}
823
824	/*
825	 * first try to pick the mirror which is completely without I/O
826	 * errors and also does not have a checksum error.
827	 * If one is found, and if a checksum is present, the full block
828	 * that is known to contain an error is rewritten. Afterwards
829	 * the block is known to be corrected.
830	 * If a mirror is found which is completely correct, and no
831	 * checksum is present, only those pages are rewritten that had
832	 * an I/O error in the block to be repaired, since it cannot be
833	 * determined which copy of the other pages is better (and it
834	 * could happen otherwise that a correct page would be
835	 * overwritten by a bad one).
836	 */
837	for (mirror_index = 0;
838	     mirror_index < BTRFS_MAX_MIRRORS &&
839	     sblocks_for_recheck[mirror_index].page_count > 0;
840	     mirror_index++) {
841		struct scrub_block *sblock_other = sblocks_for_recheck +
842						   mirror_index;
843
844		if (!sblock_other->header_error &&
845		    !sblock_other->checksum_error &&
846		    sblock_other->no_io_error_seen) {
847			int force_write = is_metadata || have_csum;
848
849			ret = scrub_repair_block_from_good_copy(sblock_bad,
850								sblock_other,
851								force_write);
852			if (0 == ret)
853				goto corrected_error;
854		}
855	}
856
857	/*
858	 * in case of I/O errors in the area that is supposed to be
859	 * repaired, continue by picking good copies of those pages.
860	 * Select the good pages from mirrors to rewrite bad pages from
861	 * the area to fix. Afterwards verify the checksum of the block
862	 * that is supposed to be repaired. This verification step is
863	 * only done for the purpose of statistics counting and for the
864	 * final scrub report on whether errors remain.
865	 * A perfect algorithm could make use of the checksum and try
866	 * all possible combinations of pages from the different mirrors
867	 * until the checksum verification succeeds. For example, when
868	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
869	 * of mirror #2 is readable but the final checksum test fails,
870	 * then the 2nd page of mirror #3 could be tried to see whether
871	 * the final checksum now succeeds. But this would be a rare
872	 * exception and is therefore not implemented. At least we
873	 * avoid overwriting the good copy.
874	 * A more useful improvement would be to pick the sectors
875	 * without I/O error based on sector sizes (512 bytes on legacy
876	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
877	 * mirror could be repaired by taking 512 bytes of a different
878	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
879	 * area are unreadable.
880	 */
881
882	/* can only fix I/O errors from here on */
883	if (sblock_bad->no_io_error_seen)
884		goto did_not_correct_error;
885
886	success = 1;
887	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
888		struct scrub_page *page_bad = sblock_bad->pagev + page_num;
889
890		if (!page_bad->io_error)
891			continue;
892
893		for (mirror_index = 0;
894		     mirror_index < BTRFS_MAX_MIRRORS &&
895		     sblocks_for_recheck[mirror_index].page_count > 0;
896		     mirror_index++) {
897			struct scrub_block *sblock_other = sblocks_for_recheck +
898							   mirror_index;
899			struct scrub_page *page_other = sblock_other->pagev +
900							page_num;
901
902			if (!page_other->io_error) {
903				ret = scrub_repair_page_from_good_copy(
904					sblock_bad, sblock_other, page_num, 0);
905				if (0 == ret) {
906					page_bad->io_error = 0;
907					break; /* succeeded for this page */
908				}
909			}
910		}
911
912		if (page_bad->io_error) {
913			/* did not find a mirror to copy the page from */
914			success = 0;
915		}
916	}
917
918	if (success) {
919		if (is_metadata || have_csum) {
920			/*
921			 * need to verify the checksum now that all
922			 * sectors on disk are repaired (the write
923			 * request for data to be repaired is on its way).
924			 * Just be lazy and use scrub_recheck_block()
925			 * which re-reads the data before the checksum
926			 * is verified, but most likely the data comes out
927			 * of the page cache.
928			 */
929			ret = scrub_recheck_block(fs_info, sblock_bad,
930						  is_metadata, have_csum, csum,
931						  generation, sdev->csum_size);
932			if (!ret && !sblock_bad->header_error &&
933			    !sblock_bad->checksum_error &&
934			    sblock_bad->no_io_error_seen)
935				goto corrected_error;
936			else
937				goto did_not_correct_error;
938		} else {
939corrected_error:
940			spin_lock(&sdev->stat_lock);
941			sdev->stat.corrected_errors++;
942			spin_unlock(&sdev->stat_lock);
943			printk_ratelimited_in_rcu(KERN_ERR
944				"btrfs: fixed up error at logical %llu on dev %s\n",
945				(unsigned long long)logical,
946				rcu_str_deref(sdev->dev->name));
947		}
948	} else {
949did_not_correct_error:
950		spin_lock(&sdev->stat_lock);
951		sdev->stat.uncorrectable_errors++;
952		spin_unlock(&sdev->stat_lock);
953		printk_ratelimited_in_rcu(KERN_ERR
954			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
955			(unsigned long long)logical,
956			rcu_str_deref(sdev->dev->name));
957	}
958
959out:
960	if (sblocks_for_recheck) {
961		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
962		     mirror_index++) {
963			struct scrub_block *sblock = sblocks_for_recheck +
964						     mirror_index;
965			int page_index;
966
967			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
968			     page_index++)
969				if (sblock->pagev[page_index].page)
970					__free_page(
971						sblock->pagev[page_index].page);
972		}
973		kfree(sblocks_for_recheck);
974	}
975
976	return 0;
977}
978
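/*
 * Map the logical range of the errored block page by page and, for every
 * mirror, allocate one page per PAGE_SIZE chunk in the corresponding entry
 * of sblocks_for_recheck, recording the physical location and device of
 * each page.
 */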
979static int scrub_setup_recheck_block(struct scrub_dev *sdev,
980				     struct btrfs_mapping_tree *map_tree,
981				     u64 length, u64 logical,
982				     struct scrub_block *sblocks_for_recheck)
983{
984	int page_index;
985	int mirror_index;
986	int ret;
987
988	/*
989	 * note: the three members sdev, ref_count and outstanding_pages
990	 * are not used (and not set) in the blocks that are used for
991	 * the recheck procedure
992	 */
993
994	page_index = 0;
995	while (length > 0) {
996		u64 sublen = min_t(u64, length, PAGE_SIZE);
997		u64 mapped_length = sublen;
998		struct btrfs_bio *bbio = NULL;
999
1000		/*
1001		 * with a length of PAGE_SIZE, each returned stripe
1002		 * represents one mirror
1003		 */
1004		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
1005				      &bbio, 0);
1006		if (ret || !bbio || mapped_length < sublen) {
1007			kfree(bbio);
1008			return -EIO;
1009		}
1010
1011		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
1012		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1013		     mirror_index++) {
1014			struct scrub_block *sblock;
1015			struct scrub_page *page;
1016
1017			if (mirror_index >= BTRFS_MAX_MIRRORS)
1018				continue;
1019
1020			sblock = sblocks_for_recheck + mirror_index;
1021			page = sblock->pagev + page_index;
1022			page->logical = logical;
1023			page->physical = bbio->stripes[mirror_index].physical;
1024			/* for missing devices, dev->bdev is NULL */
1025			page->dev = bbio->stripes[mirror_index].dev;
1026			page->mirror_num = mirror_index + 1;
1027			page->page = alloc_page(GFP_NOFS);
1028			if (!page->page) {
1029				spin_lock(&sdev->stat_lock);
1030				sdev->stat.malloc_errors++;
1031				spin_unlock(&sdev->stat_lock);
1032				kfree(bbio);
1033				return -ENOMEM;
1034			}
1035			sblock->page_count++;
1036		}
1037		kfree(bbio);
1038		length -= sublen;
1039		logical += sublen;
1040		page_index++;
1041	}
1042
1043	return 0;
1044}
1045
1046/*
1047 * this function will check the on-disk data for checksum errors, header
1048 * errors and read I/O errors. If any I/O errors happen, the exact pages
1049 * which are errored are marked as being bad. The goal is to enable scrub
1050 * to take those pages that are not errored from all the mirrors so that
1051 * the pages that are errored in the just handled mirror can be repaired.
1052 */
1053static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1054			       struct scrub_block *sblock, int is_metadata,
1055			       int have_csum, u8 *csum, u64 generation,
1056			       u16 csum_size)
1057{
1058	int page_num;
1059
1060	sblock->no_io_error_seen = 1;
1061	sblock->header_error = 0;
1062	sblock->checksum_error = 0;
1063
1064	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1065		struct bio *bio;
1066		int ret;
1067		struct scrub_page *page = sblock->pagev + page_num;
1068		DECLARE_COMPLETION_ONSTACK(complete);
1069
1070		if (page->dev->bdev == NULL) {
1071			page->io_error = 1;
1072			sblock->no_io_error_seen = 0;
1073			continue;
1074		}
1075
1076		BUG_ON(!page->page);
1077		bio = bio_alloc(GFP_NOFS, 1);
1078		if (!bio)
1079			return -EIO;
1080		bio->bi_bdev = page->dev->bdev;
1081		bio->bi_sector = page->physical >> 9;
1082		bio->bi_end_io = scrub_complete_bio_end_io;
1083		bio->bi_private = &complete;
1084
1085		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
1086		if (PAGE_SIZE != ret) {
1087			bio_put(bio);
1088			return -EIO;
1089		}
1090		btrfsic_submit_bio(READ, bio);
1091
1092		/* this will also unplug the queue */
1093		wait_for_completion(&complete);
1094
1095		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1096		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1097			sblock->no_io_error_seen = 0;
1098		bio_put(bio);
1099	}
1100
1101	if (sblock->no_io_error_seen)
1102		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1103					     have_csum, csum, generation,
1104					     csum_size);
1105
1106	return 0;
1107}
1108
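/*
 * Recompute the checksum over all pages of the block. For metadata, the
 * header fields (bytenr, fsid, chunk tree uuid, generation) are validated
 * as well, and the on-disk checksum is taken from the header itself.
 */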
1109static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1110					 struct scrub_block *sblock,
1111					 int is_metadata, int have_csum,
1112					 const u8 *csum, u64 generation,
1113					 u16 csum_size)
1114{
1115	int page_num;
1116	u8 calculated_csum[BTRFS_CSUM_SIZE];
1117	u32 crc = ~(u32)0;
1118	struct btrfs_root *root = fs_info->extent_root;
1119	void *mapped_buffer;
1120
1121	BUG_ON(!sblock->pagev[0].page);
1122	if (is_metadata) {
1123		struct btrfs_header *h;
1124
1125		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
1126		h = (struct btrfs_header *)mapped_buffer;
1127
1128		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
1129		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1130		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1131			   BTRFS_UUID_SIZE)) {
1132			sblock->header_error = 1;
1133		} else if (generation != le64_to_cpu(h->generation)) {
1134			sblock->header_error = 1;
1135			sblock->generation_error = 1;
1136		}
1137		csum = h->csum;
1138	} else {
1139		if (!have_csum)
1140			return;
1141
1142		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
1143	}
1144
1145	for (page_num = 0;;) {
1146		if (page_num == 0 && is_metadata)
1147			crc = btrfs_csum_data(root,
1148				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1149				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1150		else
1151			crc = btrfs_csum_data(root, mapped_buffer, crc,
1152					      PAGE_SIZE);
1153
1154		kunmap_atomic(mapped_buffer);
1155		page_num++;
1156		if (page_num >= sblock->page_count)
1157			break;
1158		BUG_ON(!sblock->pagev[page_num].page);
1159
1160		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
1161	}
1162
1163	btrfs_csum_final(crc, calculated_csum);
1164	if (memcmp(calculated_csum, csum, csum_size))
1165		sblock->checksum_error = 1;
1166}
1167
1168static void scrub_complete_bio_end_io(struct bio *bio, int err)
1169{
1170	complete((struct completion *)bio->bi_private);
1171}
1172
1173static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1174					     struct scrub_block *sblock_good,
1175					     int force_write)
1176{
1177	int page_num;
1178	int ret = 0;
1179
1180	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1181		int ret_sub;
1182
1183		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1184							   sblock_good,
1185							   page_num,
1186							   force_write);
1187		if (ret_sub)
1188			ret = ret_sub;
1189	}
1190
1191	return ret;
1192}
1193
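/*
 * Overwrite one page of the bad mirror in place with the corresponding page
 * from a good mirror, using a synchronous write to the bad page's physical
 * location. With force_write unset, only pages with I/O errors (or blocks
 * with header/checksum errors) are rewritten.
 */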
1194static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1195					    struct scrub_block *sblock_good,
1196					    int page_num, int force_write)
1197{
1198	struct scrub_page *page_bad = sblock_bad->pagev + page_num;
1199	struct scrub_page *page_good = sblock_good->pagev + page_num;
1200
1201	BUG_ON(sblock_bad->pagev[page_num].page == NULL);
1202	BUG_ON(sblock_good->pagev[page_num].page == NULL);
1203	if (force_write || sblock_bad->header_error ||
1204	    sblock_bad->checksum_error || page_bad->io_error) {
1205		struct bio *bio;
1206		int ret;
1207		DECLARE_COMPLETION_ONSTACK(complete);
1208
1209		bio = bio_alloc(GFP_NOFS, 1);
1210		if (!bio)
1211			return -EIO;
1212		bio->bi_bdev = page_bad->dev->bdev;
1213		bio->bi_sector = page_bad->physical >> 9;
1214		bio->bi_end_io = scrub_complete_bio_end_io;
1215		bio->bi_private = &complete;
1216
1217		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1218		if (PAGE_SIZE != ret) {
1219			bio_put(bio);
1220			return -EIO;
1221		}
1222		btrfsic_submit_bio(WRITE, bio);
1223
1224		/* this will also unplug the queue */
1225		wait_for_completion(&complete);
1226		if (!bio_flagged(bio, BIO_UPTODATE)) {
1227			btrfs_dev_stat_inc_and_print(page_bad->dev,
1228				BTRFS_DEV_STAT_WRITE_ERRS);
1229			bio_put(bio);
1230			return -EIO;
1231		}
1232		bio_put(bio);
1233	}
1234
1235	return 0;
1236}
1237
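/*
 * Verify the checksum of a completed block: dispatch to the data, tree block
 * or super block verifier based on the extent flags of the first page, and
 * hand blocks that fail verification to scrub_handle_errored_block().
 */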
1238static void scrub_checksum(struct scrub_block *sblock)
1239{
1240	u64 flags;
1241	int ret;
1242
1243	BUG_ON(sblock->page_count < 1);
1244	flags = sblock->pagev[0].flags;
1245	ret = 0;
1246	if (flags & BTRFS_EXTENT_FLAG_DATA)
1247		ret = scrub_checksum_data(sblock);
1248	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1249		ret = scrub_checksum_tree_block(sblock);
1250	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1251		(void)scrub_checksum_super(sblock);
1252	else
1253		WARN_ON(1);
1254	if (ret)
1255		scrub_handle_errored_block(sblock);
1256}
1257
1258static int scrub_checksum_data(struct scrub_block *sblock)
1259{
1260	struct scrub_dev *sdev = sblock->sdev;
1261	u8 csum[BTRFS_CSUM_SIZE];
1262	u8 *on_disk_csum;
1263	struct page *page;
1264	void *buffer;
1265	u32 crc = ~(u32)0;
1266	int fail = 0;
1267	struct btrfs_root *root = sdev->dev->dev_root;
1268	u64 len;
1269	int index;
1270
1271	BUG_ON(sblock->page_count < 1);
1272	if (!sblock->pagev[0].have_csum)
1273		return 0;
1274
1275	on_disk_csum = sblock->pagev[0].csum;
1276	page = sblock->pagev[0].page;
1277	buffer = kmap_atomic(page);
1278
1279	len = sdev->sectorsize;
1280	index = 0;
1281	for (;;) {
1282		u64 l = min_t(u64, len, PAGE_SIZE);
1283
1284		crc = btrfs_csum_data(root, buffer, crc, l);
1285		kunmap_atomic(buffer);
1286		len -= l;
1287		if (len == 0)
1288			break;
1289		index++;
1290		BUG_ON(index >= sblock->page_count);
1291		BUG_ON(!sblock->pagev[index].page);
1292		page = sblock->pagev[index].page;
1293		buffer = kmap_atomic(page);
1294	}
1295
1296	btrfs_csum_final(crc, csum);
1297	if (memcmp(csum, on_disk_csum, sdev->csum_size))
1298		fail = 1;
1299
1300	return fail;
1301}
1302
1303static int scrub_checksum_tree_block(struct scrub_block *sblock)
1304{
1305	struct scrub_dev *sdev = sblock->sdev;
1306	struct btrfs_header *h;
1307	struct btrfs_root *root = sdev->dev->dev_root;
1308	struct btrfs_fs_info *fs_info = root->fs_info;
1309	u8 calculated_csum[BTRFS_CSUM_SIZE];
1310	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1311	struct page *page;
1312	void *mapped_buffer;
1313	u64 mapped_size;
1314	void *p;
1315	u32 crc = ~(u32)0;
1316	int fail = 0;
1317	int crc_fail = 0;
1318	u64 len;
1319	int index;
1320
1321	BUG_ON(sblock->page_count < 1);
1322	page = sblock->pagev[0].page;
1323	mapped_buffer = kmap_atomic(page);
1324	h = (struct btrfs_header *)mapped_buffer;
1325	memcpy(on_disk_csum, h->csum, sdev->csum_size);
1326
1327	/*
1328	 * we don't use the getter functions here because
1329	 * a) we don't have an extent buffer and
1330	 * b) the page is already kmapped
1331	 */
1332
1333	if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
1334		++fail;
1335
1336	if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
1337		++fail;
1338
1339	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1340		++fail;
1341
1342	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1343		   BTRFS_UUID_SIZE))
1344		++fail;
1345
1346	BUG_ON(sdev->nodesize != sdev->leafsize);
1347	len = sdev->nodesize - BTRFS_CSUM_SIZE;
1348	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1349	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1350	index = 0;
1351	for (;;) {
1352		u64 l = min_t(u64, len, mapped_size);
1353
1354		crc = btrfs_csum_data(root, p, crc, l);
1355		kunmap_atomic(mapped_buffer);
1356		len -= l;
1357		if (len == 0)
1358			break;
1359		index++;
1360		BUG_ON(index >= sblock->page_count);
1361		BUG_ON(!sblock->pagev[index].page);
1362		page = sblock->pagev[index].page;
1363		mapped_buffer = kmap_atomic(page);
1364		mapped_size = PAGE_SIZE;
1365		p = mapped_buffer;
1366	}
1367
1368	btrfs_csum_final(crc, calculated_csum);
1369	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
1370		++crc_fail;
1371
1372	return fail || crc_fail;
1373}
1374
1375static int scrub_checksum_super(struct scrub_block *sblock)
1376{
1377	struct btrfs_super_block *s;
1378	struct scrub_dev *sdev = sblock->sdev;
1379	struct btrfs_root *root = sdev->dev->dev_root;
1380	struct btrfs_fs_info *fs_info = root->fs_info;
1381	u8 calculated_csum[BTRFS_CSUM_SIZE];
1382	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1383	struct page *page;
1384	void *mapped_buffer;
1385	u64 mapped_size;
1386	void *p;
1387	u32 crc = ~(u32)0;
1388	int fail_gen = 0;
1389	int fail_cor = 0;
1390	u64 len;
1391	int index;
1392
1393	BUG_ON(sblock->page_count < 1);
1394	page = sblock->pagev[0].page;
1395	mapped_buffer = kmap_atomic(page);
1396	s = (struct btrfs_super_block *)mapped_buffer;
1397	memcpy(on_disk_csum, s->csum, sdev->csum_size);
1398
1399	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
1400		++fail_cor;
1401
1402	if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
1403		++fail_gen;
1404
1405	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1406		++fail_cor;
1407
1408	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1409	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1410	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1411	index = 0;
1412	for (;;) {
1413		u64 l = min_t(u64, len, mapped_size);
1414
1415		crc = btrfs_csum_data(root, p, crc, l);
1416		kunmap_atomic(mapped_buffer);
1417		len -= l;
1418		if (len == 0)
1419			break;
1420		index++;
1421		BUG_ON(index >= sblock->page_count);
1422		BUG_ON(!sblock->pagev[index].page);
1423		page = sblock->pagev[index].page;
1424		mapped_buffer = kmap_atomic(page);
1425		mapped_size = PAGE_SIZE;
1426		p = mapped_buffer;
1427	}
1428
1429	btrfs_csum_final(crc, calculated_csum);
1430	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
1431		++fail_cor;
1432
1433	if (fail_cor + fail_gen) {
1434		/*
1435		 * if we find an error in a super block, we just report it.
1436		 * The super blocks get rewritten with the next transaction
1437		 * commit anyway.
1438		 */
1439		spin_lock(&sdev->stat_lock);
1440		++sdev->stat.super_errors;
1441		spin_unlock(&sdev->stat_lock);
1442		if (fail_cor)
1443			btrfs_dev_stat_inc_and_print(sdev->dev,
1444				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1445		else
1446			btrfs_dev_stat_inc_and_print(sdev->dev,
1447				BTRFS_DEV_STAT_GENERATION_ERRS);
1448	}
1449
1450	return fail_cor + fail_gen;
1451}
1452
1453static void scrub_block_get(struct scrub_block *sblock)
1454{
1455	atomic_inc(&sblock->ref_count);
1456}
1457
1458static void scrub_block_put(struct scrub_block *sblock)
1459{
1460	if (atomic_dec_and_test(&sblock->ref_count)) {
1461		int i;
1462
1463		for (i = 0; i < sblock->page_count; i++)
1464			if (sblock->pagev[i].page)
1465				__free_page(sblock->pagev[i].page);
1466		kfree(sblock);
1467	}
1468}
1469
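/*
 * Submit the bio that is currently being filled for this device (if any),
 * bump the in-flight counter and reset sdev->curr so that the next page
 * starts a fresh scrub_bio.
 */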
1470static void scrub_submit(struct scrub_dev *sdev)
1471{
1472	struct scrub_bio *sbio;
1473
1474	if (sdev->curr == -1)
1475		return;
1476
1477	sbio = sdev->bios[sdev->curr];
1478	sdev->curr = -1;
1479	atomic_inc(&sdev->in_flight);
1480
1481	btrfsic_submit_bio(READ, sbio->bio);
1482}
1483
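/*
 * Queue one page into the currently open scrub_bio. A free scrub_bio is
 * taken from the per-device list (waiting if necessary), and the bio is
 * submitted early whenever the page is not physically/logically contiguous
 * with the pages already queued or the bio reaches pages_per_bio.
 */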
1484static int scrub_add_page_to_bio(struct scrub_dev *sdev,
1485				 struct scrub_page *spage)
1486{
1487	struct scrub_block *sblock = spage->sblock;
1488	struct scrub_bio *sbio;
1489	int ret;
1490
1491again:
1492	/*
1493	 * grab a fresh bio or wait for one to become available
1494	 */
1495	while (sdev->curr == -1) {
1496		spin_lock(&sdev->list_lock);
1497		sdev->curr = sdev->first_free;
1498		if (sdev->curr != -1) {
1499			sdev->first_free = sdev->bios[sdev->curr]->next_free;
1500			sdev->bios[sdev->curr]->next_free = -1;
1501			sdev->bios[sdev->curr]->page_count = 0;
1502			spin_unlock(&sdev->list_lock);
1503		} else {
1504			spin_unlock(&sdev->list_lock);
1505			wait_event(sdev->list_wait, sdev->first_free != -1);
1506		}
1507	}
1508	sbio = sdev->bios[sdev->curr];
1509	if (sbio->page_count == 0) {
1510		struct bio *bio;
1511
1512		sbio->physical = spage->physical;
1513		sbio->logical = spage->logical;
1514		bio = sbio->bio;
1515		if (!bio) {
1516			bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
1517			if (!bio)
1518				return -ENOMEM;
1519			sbio->bio = bio;
1520		}
1521
1522		bio->bi_private = sbio;
1523		bio->bi_end_io = scrub_bio_end_io;
1524		bio->bi_bdev = sdev->dev->bdev;
1525		bio->bi_sector = spage->physical >> 9;
1526		sbio->err = 0;
1527	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1528		   spage->physical ||
1529		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1530		   spage->logical) {
1531		scrub_submit(sdev);
1532		goto again;
1533	}
1534
1535	sbio->pagev[sbio->page_count] = spage;
1536	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1537	if (ret != PAGE_SIZE) {
1538		if (sbio->page_count < 1) {
1539			bio_put(sbio->bio);
1540			sbio->bio = NULL;
1541			return -EIO;
1542		}
1543		scrub_submit(sdev);
1544		goto again;
1545	}
1546
1547	scrub_block_get(sblock); /* one for the added page */
1548	atomic_inc(&sblock->outstanding_pages);
1549	sbio->page_count++;
1550	if (sbio->page_count == sdev->pages_per_bio)
1551		scrub_submit(sdev);
1552
1553	return 0;
1554}
1555
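/*
 * Create a scrub_block for the range [logical, logical + len), allocate one
 * page per PAGE_SIZE, copy the checksum if one is available, and queue every
 * page into the device's current bio. With 'force' set, the bio is submitted
 * immediately instead of waiting until it is full.
 */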
1556static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1557		       u64 physical, u64 flags, u64 gen, int mirror_num,
1558		       u8 *csum, int force)
1559{
1560	struct scrub_block *sblock;
1561	int index;
1562
1563	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1564	if (!sblock) {
1565		spin_lock(&sdev->stat_lock);
1566		sdev->stat.malloc_errors++;
1567		spin_unlock(&sdev->stat_lock);
1568		return -ENOMEM;
1569	}
1570
1571	/* one ref inside this function, plus one for each page later on */
1572	atomic_set(&sblock->ref_count, 1);
1573	sblock->sdev = sdev;
1574	sblock->no_io_error_seen = 1;
1575
1576	for (index = 0; len > 0; index++) {
1577		struct scrub_page *spage = sblock->pagev + index;
1578		u64 l = min_t(u64, len, PAGE_SIZE);
1579
1580		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1581		spage->page = alloc_page(GFP_NOFS);
1582		if (!spage->page) {
1583			spin_lock(&sdev->stat_lock);
1584			sdev->stat.malloc_errors++;
1585			spin_unlock(&sdev->stat_lock);
1586			while (index > 0) {
1587				index--;
1588				__free_page(sblock->pagev[index].page);
1589			}
1590			kfree(sblock);
1591			return -ENOMEM;
1592		}
1593		spage->sblock = sblock;
1594		spage->dev = sdev->dev;
1595		spage->flags = flags;
1596		spage->generation = gen;
1597		spage->logical = logical;
1598		spage->physical = physical;
1599		spage->mirror_num = mirror_num;
1600		if (csum) {
1601			spage->have_csum = 1;
1602			memcpy(spage->csum, csum, sdev->csum_size);
1603		} else {
1604			spage->have_csum = 0;
1605		}
1606		sblock->page_count++;
1607		len -= l;
1608		logical += l;
1609		physical += l;
1610	}
1611
1612	BUG_ON(sblock->page_count == 0);
1613	for (index = 0; index < sblock->page_count; index++) {
1614		struct scrub_page *spage = sblock->pagev + index;
1615		int ret;
1616
1617		ret = scrub_add_page_to_bio(sdev, spage);
1618		if (ret) {
1619			scrub_block_put(sblock);
1620			return ret;
1621		}
1622	}
1623
1624	if (force)
1625		scrub_submit(sdev);
1626
1627	/* last one frees, either here or in bio completion for last page */
1628	scrub_block_put(sblock);
1629	return 0;
1630}
1631
1632static void scrub_bio_end_io(struct bio *bio, int err)
1633{
1634	struct scrub_bio *sbio = bio->bi_private;
1635	struct scrub_dev *sdev = sbio->sdev;
1636	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1637
1638	sbio->err = err;
1639	sbio->bio = bio;
1640
1641	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
1642}
1643
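/*
 * Worker run after bio completion: mark all pages of a failed bio as errored,
 * complete every scrub_block whose outstanding page count drops to zero, and
 * return the scrub_bio to the device's free list.
 */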
1644static void scrub_bio_end_io_worker(struct btrfs_work *work)
1645{
1646	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1647	struct scrub_dev *sdev = sbio->sdev;
1648	int i;
1649
1650	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
1651	if (sbio->err) {
1652		for (i = 0; i < sbio->page_count; i++) {
1653			struct scrub_page *spage = sbio->pagev[i];
1654
1655			spage->io_error = 1;
1656			spage->sblock->no_io_error_seen = 0;
1657		}
1658	}
1659
1660	/* now complete the scrub_block items that have all pages completed */
1661	for (i = 0; i < sbio->page_count; i++) {
1662		struct scrub_page *spage = sbio->pagev[i];
1663		struct scrub_block *sblock = spage->sblock;
1664
1665		if (atomic_dec_and_test(&sblock->outstanding_pages))
1666			scrub_block_complete(sblock);
1667		scrub_block_put(sblock);
1668	}
1669
1670	if (sbio->err) {
1671		/* what is this good for??? */
1672		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1673		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
1674		sbio->bio->bi_phys_segments = 0;
1675		sbio->bio->bi_idx = 0;
1676
1677		for (i = 0; i < sbio->page_count; i++) {
1678			struct bio_vec *bi;
1679			bi = &sbio->bio->bi_io_vec[i];
1680			bi->bv_offset = 0;
1681			bi->bv_len = PAGE_SIZE;
1682		}
1683	}
1684
1685	bio_put(sbio->bio);
1686	sbio->bio = NULL;
1687	spin_lock(&sdev->list_lock);
1688	sbio->next_free = sdev->first_free;
1689	sdev->first_free = sbio->index;
1690	spin_unlock(&sdev->list_lock);
1691	atomic_dec(&sdev->in_flight);
1692	wake_up(&sdev->list_wait);
1693}
1694
1695static void scrub_block_complete(struct scrub_block *sblock)
1696{
1697	if (!sblock->no_io_error_seen)
1698		scrub_handle_errored_block(sblock);
1699	else
1700		scrub_checksum(sblock);
1701}
1702
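/*
 * Look up the on-disk checksum for 'logical' in the list of ordered sums
 * collected from the csum tree. Sums that end before 'logical' are dropped
 * from the list; returns 1 and copies the checksum into 'csum' on a match.
 */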
1703static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1704			   u8 *csum)
1705{
1706	struct btrfs_ordered_sum *sum = NULL;
1707	int ret = 0;
1708	unsigned long i;
1709	unsigned long num_sectors;
1710
1711	while (!list_empty(&sdev->csum_list)) {
1712		sum = list_first_entry(&sdev->csum_list,
1713				       struct btrfs_ordered_sum, list);
1714		if (sum->bytenr > logical)
1715			return 0;
1716		if (sum->bytenr + sum->len > logical)
1717			break;
1718
1719		++sdev->stat.csum_discards;
1720		list_del(&sum->list);
1721		kfree(sum);
1722		sum = NULL;
1723	}
1724	if (!sum)
1725		return 0;
1726
1727	num_sectors = sum->len / sdev->sectorsize;
1728	for (i = 0; i < num_sectors; ++i) {
1729		if (sum->sums[i].bytenr == logical) {
1730			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
1731			ret = 1;
1732			break;
1733		}
1734	}
1735	if (ret && i == num_sectors - 1) {
1736		list_del(&sum->list);
1737		kfree(sum);
1738	}
1739	return ret;
1740}
1741
1742/* scrub extent tries to collect up to 64 kB for each bio */
1743static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1744			u64 physical, u64 flags, u64 gen, int mirror_num)
1745{
1746	int ret;
1747	u8 csum[BTRFS_CSUM_SIZE];
1748	u32 blocksize;
1749
1750	if (flags & BTRFS_EXTENT_FLAG_DATA) {
1751		blocksize = sdev->sectorsize;
1752		spin_lock(&sdev->stat_lock);
1753		sdev->stat.data_extents_scrubbed++;
1754		sdev->stat.data_bytes_scrubbed += len;
1755		spin_unlock(&sdev->stat_lock);
1756	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1757		BUG_ON(sdev->nodesize != sdev->leafsize);
1758		blocksize = sdev->nodesize;
1759		spin_lock(&sdev->stat_lock);
1760		sdev->stat.tree_extents_scrubbed++;
1761		sdev->stat.tree_bytes_scrubbed += len;
1762		spin_unlock(&sdev->stat_lock);
1763	} else {
1764		blocksize = sdev->sectorsize;
1765		BUG_ON(1);
1766	}
1767
1768	while (len) {
1769		u64 l = min_t(u64, len, blocksize);
1770		int have_csum = 0;
1771
1772		if (flags & BTRFS_EXTENT_FLAG_DATA) {
1773			/* push csums to sbio */
1774			have_csum = scrub_find_csum(sdev, logical, l, csum);
1775			if (have_csum == 0)
1776				++sdev->stat.no_csum;
1777		}
1778		ret = scrub_pages(sdev, logical, l, physical, flags, gen,
1779				  mirror_num, have_csum ? csum : NULL, 0);
1780		if (ret)
1781			return ret;
1782		len -= l;
1783		logical += l;
1784		physical += l;
1785	}
1786	return 0;
1787}
1788
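/*
 * Scrub a single stripe of a chunk on this device: derive the device offset,
 * stripe increment and mirror number from the RAID profile, start readahead
 * on the extent and csum trees, and then walk the extent tree to scrub every
 * extent that falls into the stripe.
 */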
1789static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1790	struct map_lookup *map, int num, u64 base, u64 length)
1791{
1792	struct btrfs_path *path;
1793	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1794	struct btrfs_root *root = fs_info->extent_root;
1795	struct btrfs_root *csum_root = fs_info->csum_root;
1796	struct btrfs_extent_item *extent;
1797	struct blk_plug plug;
1798	u64 flags;
1799	int ret;
1800	int slot;
1801	int i;
1802	u64 nstripes;
1803	struct extent_buffer *l;
1804	struct btrfs_key key;
1805	u64 physical;
1806	u64 logical;
1807	u64 generation;
1808	int mirror_num;
1809	struct reada_control *reada1;
1810	struct reada_control *reada2;
1811	struct btrfs_key key_start;
1812	struct btrfs_key key_end;
1813
1814	u64 increment = map->stripe_len;
1815	u64 offset;
1816
1817	nstripes = length;
1818	offset = 0;
1819	do_div(nstripes, map->stripe_len);
1820	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1821		offset = map->stripe_len * num;
1822		increment = map->stripe_len * map->num_stripes;
1823		mirror_num = 1;
1824	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1825		int factor = map->num_stripes / map->sub_stripes;
1826		offset = map->stripe_len * (num / map->sub_stripes);
1827		increment = map->stripe_len * factor;
1828		mirror_num = num % map->sub_stripes + 1;
1829	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1830		increment = map->stripe_len;
1831		mirror_num = num % map->num_stripes + 1;
1832	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
1833		increment = map->stripe_len;
1834		mirror_num = num % map->num_stripes + 1;
1835	} else {
1836		increment = map->stripe_len;
1837		mirror_num = 1;
1838	}
1839
1840	path = btrfs_alloc_path();
1841	if (!path)
1842		return -ENOMEM;
1843
1844	/*
1845	 * work on the commit root. The related disk blocks are static as
1846	 * long as COW is applied. This means it is safe to rewrite them
1847	 * to repair disk errors without any race conditions.
1848	 */
1849	path->search_commit_root = 1;
1850	path->skip_locking = 1;
1851
1852	/*
1853	 * trigger the readahead for the extent tree and the csum tree and
1854	 * wait for completion. During readahead, the scrub is officially
1855	 * paused so that it does not hold off transaction commits.
1856	 */
1857	logical = base + offset;
1858
1859	wait_event(sdev->list_wait,
1860		   atomic_read(&sdev->in_flight) == 0);
1861	atomic_inc(&fs_info->scrubs_paused);
1862	wake_up(&fs_info->scrub_pause_wait);
1863
1864	/* FIXME it might be better to start readahead at commit root */
1865	key_start.objectid = logical;
1866	key_start.type = BTRFS_EXTENT_ITEM_KEY;
1867	key_start.offset = (u64)0;
1868	key_end.objectid = base + offset + nstripes * increment;
1869	key_end.type = BTRFS_EXTENT_ITEM_KEY;
1870	key_end.offset = (u64)0;
1871	reada1 = btrfs_reada_add(root, &key_start, &key_end);
1872
1873	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1874	key_start.type = BTRFS_EXTENT_CSUM_KEY;
1875	key_start.offset = logical;
1876	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1877	key_end.type = BTRFS_EXTENT_CSUM_KEY;
1878	key_end.offset = base + offset + nstripes * increment;
1879	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1880
1881	if (!IS_ERR(reada1))
1882		btrfs_reada_wait(reada1);
1883	if (!IS_ERR(reada2))
1884		btrfs_reada_wait(reada2);
1885
1886	mutex_lock(&fs_info->scrub_lock);
1887	while (atomic_read(&fs_info->scrub_pause_req)) {
1888		mutex_unlock(&fs_info->scrub_lock);
1889		wait_event(fs_info->scrub_pause_wait,
1890		   atomic_read(&fs_info->scrub_pause_req) == 0);
1891		mutex_lock(&fs_info->scrub_lock);
1892	}
1893	atomic_dec(&fs_info->scrubs_paused);
1894	mutex_unlock(&fs_info->scrub_lock);
1895	wake_up(&fs_info->scrub_pause_wait);
1896
1897	/*
1898	 * collect all data csums for the stripe to avoid seeking during
1899	 * the scrub. This might currently (crc32) end up being about 1MB
1900	 */
1901	blk_start_plug(&plug);
1902
1903	/*
1904	 * now find all extents for each stripe and scrub them
1905	 */
1906	logical = base + offset;
1907	physical = map->stripes[num].physical;
1908	ret = 0;
1909	for (i = 0; i < nstripes; ++i) {
1910		/*
1911		 * canceled?
1912		 */
1913		if (atomic_read(&fs_info->scrub_cancel_req) ||
1914		    atomic_read(&sdev->cancel_req)) {
1915			ret = -ECANCELED;
1916			goto out;
1917		}
1918		/*
1919		 * check to see if we have to pause
1920		 */
1921		if (atomic_read(&fs_info->scrub_pause_req)) {
1922			/* push queued extents */
1923			scrub_submit(sdev);
1924			wait_event(sdev->list_wait,
1925				   atomic_read(&sdev->in_flight) == 0);
1926			atomic_inc(&fs_info->scrubs_paused);
1927			wake_up(&fs_info->scrub_pause_wait);
1928			mutex_lock(&fs_info->scrub_lock);
1929			while (atomic_read(&fs_info->scrub_pause_req)) {
1930				mutex_unlock(&fs_info->scrub_lock);
1931				wait_event(fs_info->scrub_pause_wait,
1932				   atomic_read(&fs_info->scrub_pause_req) == 0);
1933				mutex_lock(&fs_info->scrub_lock);
1934			}
1935			atomic_dec(&fs_info->scrubs_paused);
1936			mutex_unlock(&fs_info->scrub_lock);
1937			wake_up(&fs_info->scrub_pause_wait);
1938		}
1939
1940		ret = btrfs_lookup_csums_range(csum_root, logical,
1941					       logical + map->stripe_len - 1,
1942					       &sdev->csum_list, 1);
1943		if (ret)
1944			goto out;
1945
1946		key.objectid = logical;
1947		key.type = BTRFS_EXTENT_ITEM_KEY;
1948		key.offset = (u64)0;
1949
1950		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1951		if (ret < 0)
1952			goto out;
1953		if (ret > 0) {
1954			ret = btrfs_previous_item(root, path, 0,
1955						  BTRFS_EXTENT_ITEM_KEY);
1956			if (ret < 0)
1957				goto out;
1958			if (ret > 0) {
1959				/* there's no smaller item, so stick with the
1960				 * larger one */
1961				btrfs_release_path(path);
1962				ret = btrfs_search_slot(NULL, root, &key,
1963							path, 0, 0);
1964				if (ret < 0)
1965					goto out;
1966			}
1967		}
1968
1969		while (1) {
1970			l = path->nodes[0];
1971			slot = path->slots[0];
1972			if (slot >= btrfs_header_nritems(l)) {
1973				ret = btrfs_next_leaf(root, path);
1974				if (ret == 0)
1975					continue;
1976				if (ret < 0)
1977					goto out;
1978
1979				break;
1980			}
1981			btrfs_item_key_to_cpu(l, &key, slot);
1982
1983			if (key.objectid + key.offset <= logical)
1984				goto next;
1985
1986			if (key.objectid >= logical + map->stripe_len)
1987				break;
1988
1989			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
1990				goto next;
1991
1992			extent = btrfs_item_ptr(l, slot,
1993						struct btrfs_extent_item);
1994			flags = btrfs_extent_flags(l, extent);
1995			generation = btrfs_extent_generation(l, extent);
1996
1997			if (key.objectid < logical &&
1998			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
1999				printk(KERN_ERR
2000				       "btrfs scrub: tree block %llu spanning "
2001				       "stripes, ignored. logical=%llu\n",
2002				       (unsigned long long)key.objectid,
2003				       (unsigned long long)logical);
2004				goto next;
2005			}
2006
2007			/*
2008			 * trim extent to this stripe
2009			 */
2010			if (key.objectid < logical) {
2011				key.offset -= logical - key.objectid;
2012				key.objectid = logical;
2013			}
2014			if (key.objectid + key.offset >
2015			    logical + map->stripe_len) {
2016				key.offset = logical + map->stripe_len -
2017					     key.objectid;
2018			}
2019
2020			ret = scrub_extent(sdev, key.objectid, key.offset,
2021					   key.objectid - logical + physical,
2022					   flags, generation, mirror_num);
2023			if (ret)
2024				goto out;
2025
2026next:
2027			path->slots[0]++;
2028		}
2029		btrfs_release_path(path);
2030		logical += increment;
2031		physical += map->stripe_len;
2032		spin_lock(&sdev->stat_lock);
2033		sdev->stat.last_physical = physical;
2034		spin_unlock(&sdev->stat_lock);
2035	}
2036	/* push queued extents */
2037	scrub_submit(sdev);
2038
2039out:
2040	blk_finish_plug(&plug);
2041	btrfs_free_path(path);
2042	return ret < 0 ? ret : 0;
2043}
2044
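/*
 * scrub_chunk() looks up the raid mapping of the chunk at 'chunk_offset'
 * and scrubs every stripe of that chunk which is stored on sdev->dev at
 * the given 'dev_offset'.
 */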
2045static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2046	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
2047	u64 dev_offset)
2048{
2049	struct btrfs_mapping_tree *map_tree =
2050		&sdev->dev->dev_root->fs_info->mapping_tree;
2051	struct map_lookup *map;
2052	struct extent_map *em;
2053	int i;
2054	int ret = -EINVAL;
2055
2056	read_lock(&map_tree->map_tree.lock);
2057	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2058	read_unlock(&map_tree->map_tree.lock);
2059
2060	if (!em)
2061		return -EINVAL;
2062
2063	map = (struct map_lookup *)em->bdev;
2064	if (em->start != chunk_offset)
2065		goto out;
2066
2067	if (em->len < length)
2068		goto out;
2069
2070	for (i = 0; i < map->num_stripes; ++i) {
2071		if (map->stripes[i].dev == sdev->dev &&
2072		    map->stripes[i].physical == dev_offset) {
2073			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
2074			if (ret)
2075				goto out;
2076		}
2077	}
2078out:
2079	free_extent_map(em);
2080
2081	return ret;
2082}
2083
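/*
 * Walk all dev extents of the scrubbed device that intersect [start, end),
 * look up the block group owning each one and scrub the corresponding
 * chunk. A reference on the block group keeps the chunk from going away
 * while it is scrubbed.
 */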
2084static noinline_for_stack
2085int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2086{
2087	struct btrfs_dev_extent *dev_extent = NULL;
2088	struct btrfs_path *path;
2089	struct btrfs_root *root = sdev->dev->dev_root;
2090	struct btrfs_fs_info *fs_info = root->fs_info;
2091	u64 length;
2092	u64 chunk_tree;
2093	u64 chunk_objectid;
2094	u64 chunk_offset;
2095	int ret;
2096	int slot;
2097	struct extent_buffer *l;
2098	struct btrfs_key key;
2099	struct btrfs_key found_key;
2100	struct btrfs_block_group_cache *cache;
2101
2102	path = btrfs_alloc_path();
2103	if (!path)
2104		return -ENOMEM;
2105
2106	path->reada = 2;
2107	path->search_commit_root = 1;
2108	path->skip_locking = 1;
2109
2110	key.objectid = sdev->dev->devid;
2111	key.offset = 0ull;
2112	key.type = BTRFS_DEV_EXTENT_KEY;
2113
2114
2115	while (1) {
2116		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2117		if (ret < 0)
2118			break;
2119		if (ret > 0) {
2120			if (path->slots[0] >=
2121			    btrfs_header_nritems(path->nodes[0])) {
2122				ret = btrfs_next_leaf(root, path);
2123				if (ret)
2124					break;
2125			}
2126		}
2127
2128		l = path->nodes[0];
2129		slot = path->slots[0];
2130
2131		btrfs_item_key_to_cpu(l, &found_key, slot);
2132
2133		if (found_key.objectid != sdev->dev->devid)
2134			break;
2135
2136		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2137			break;
2138
2139		if (found_key.offset >= end)
2140			break;
2141
2142		if (found_key.offset < key.offset)
2143			break;
2144
2145		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2146		length = btrfs_dev_extent_length(l, dev_extent);
2147
2148		if (found_key.offset + length <= start) {
2149			key.offset = found_key.offset + length;
2150			btrfs_release_path(path);
2151			continue;
2152		}
2153
2154		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2155		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2156		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2157
2158		/*
2159		 * get a reference on the corresponding block group to prevent
2160		 * the chunk from going away while we scrub it
2161		 */
2162		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2163		if (!cache) {
2164			ret = -ENOENT;
2165			break;
2166		}
2167		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
2168				  chunk_offset, length, found_key.offset);
2169		btrfs_put_block_group(cache);
2170		if (ret)
2171			break;
2172
2173		key.offset = found_key.offset + length;
2174		btrfs_release_path(path);
2175	}
2176
2177	btrfs_free_path(path);
2178
2179	/*
2180	 * ret can still be 1 from btrfs_search_slot() or btrfs_next_leaf();
2181	 * that's not an error
2182	 */
2183	return ret < 0 ? ret : 0;
2184}
2185
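/*
 * Read all copies of the super block that fit on the device and verify
 * them against the generation of the last committed transaction.
 */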
2186static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2187{
2188	int	i;
2189	u64	bytenr;
2190	u64	gen;
2191	int	ret;
2192	struct btrfs_device *device = sdev->dev;
2193	struct btrfs_root *root = device->dev_root;
2194
2195	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2196		return -EIO;
2197
2198	gen = root->fs_info->last_trans_committed;
2199
2200	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2201		bytenr = btrfs_sb_offset(i);
2202		if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
2203			break;
2204
2205		ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2206				     BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
2207		if (ret)
2208			return ret;
2209	}
2210	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
2211
2212	return 0;
2213}
2214
2215/*
2216 * get a reference count on fs_info->scrub_workers. Start the workers if necessary.
2217 */
2218static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
2219{
2220	struct btrfs_fs_info *fs_info = root->fs_info;
2221	int ret = 0;
2222
2223	mutex_lock(&fs_info->scrub_lock);
2224	if (fs_info->scrub_workers_refcnt == 0) {
2225		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2226			   fs_info->thread_pool_size, &fs_info->generic_worker);
2227		fs_info->scrub_workers.idle_thresh = 4;
2228		ret = btrfs_start_workers(&fs_info->scrub_workers);
2229		if (ret)
2230			goto out;
2231	}
2232	++fs_info->scrub_workers_refcnt;
2233out:
2234	mutex_unlock(&fs_info->scrub_lock);
2235
2236	return ret;
2237}
2238
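/*
 * drop a reference on fs_info->scrub_workers. stop the workers when the
 * last reference is dropped
 */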
2239static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
2240{
2241	struct btrfs_fs_info *fs_info = root->fs_info;
2242
2243	mutex_lock(&fs_info->scrub_lock);
2244	if (--fs_info->scrub_workers_refcnt == 0)
2245		btrfs_stop_workers(&fs_info->scrub_workers);
2246	WARN_ON(fs_info->scrub_workers_refcnt < 0);
2247	mutex_unlock(&fs_info->scrub_lock);
2248}
2249
2250
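/*
 * btrfs_scrub_dev() is the entry point for scrubbing one device. It checks
 * the size assumptions scrub relies on, sets up the per-device scrub state
 * and the worker threads, scrubs the super blocks and then all chunks in
 * [start, end) on the device, waits for outstanding IO and fixup work, and
 * finally reports the accumulated statistics in 'progress'.
 */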
2251int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2252		    struct btrfs_scrub_progress *progress, int readonly)
2253{
2254	struct scrub_dev *sdev;
2255	struct btrfs_fs_info *fs_info = root->fs_info;
2256	int ret;
2257	struct btrfs_device *dev;
2258
2259	if (btrfs_fs_closing(root->fs_info))
2260		return -EINVAL;
2261
2262	/*
2263	 * check some assumptions
2264	 */
2265	if (root->nodesize != root->leafsize) {
2266		printk(KERN_ERR
2267		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2268		       root->nodesize, root->leafsize);
2269		return -EINVAL;
2270	}
2271
2272	if (root->nodesize > BTRFS_STRIPE_LEN) {
2273		/*
2274		 * The way scrub is implemented, it cannot calculate the
2275		 * checksum of a node larger than BTRFS_STRIPE_LEN. Do not
2276		 * handle this situation at all because it won't ever happen.
2277		 */
2278		printk(KERN_ERR
2279		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2280		       root->nodesize, BTRFS_STRIPE_LEN);
2281		return -EINVAL;
2282	}
2283
2284	if (root->sectorsize != PAGE_SIZE) {
2285		/* not supported for data w/o checksums */
2286		printk(KERN_ERR
2287		       "btrfs_scrub: size assumption sectorsize == PAGE_SIZE (%d == %lld) fails\n",
2288		       root->sectorsize, (unsigned long long)PAGE_SIZE);
2289		return -EINVAL;
2290	}
2291
2292	ret = scrub_workers_get(root);
2293	if (ret)
2294		return ret;
2295
2296	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2297	dev = btrfs_find_device(root, devid, NULL, NULL);
2298	if (!dev || dev->missing) {
2299		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2300		scrub_workers_put(root);
2301		return -ENODEV;
2302	}
2303	mutex_lock(&fs_info->scrub_lock);
2304
2305	if (!dev->in_fs_metadata) {
2306		mutex_unlock(&fs_info->scrub_lock);
2307		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2308		scrub_workers_put(root);
2309		return -ENODEV;
2310	}
2311
2312	if (dev->scrub_device) {
2313		mutex_unlock(&fs_info->scrub_lock);
2314		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2315		scrub_workers_put(root);
2316		return -EINPROGRESS;
2317	}
2318	sdev = scrub_setup_dev(dev);
2319	if (IS_ERR(sdev)) {
2320		mutex_unlock(&fs_info->scrub_lock);
2321		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2322		scrub_workers_put(root);
2323		return PTR_ERR(sdev);
2324	}
2325	sdev->readonly = readonly;
2326	dev->scrub_device = sdev;
2327
2328	atomic_inc(&fs_info->scrubs_running);
2329	mutex_unlock(&fs_info->scrub_lock);
2330	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2331
2332	down_read(&fs_info->scrub_super_lock);
2333	ret = scrub_supers(sdev);
2334	up_read(&fs_info->scrub_super_lock);
2335
2336	if (!ret)
2337		ret = scrub_enumerate_chunks(sdev, start, end);
2338
2339	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
2340	atomic_dec(&fs_info->scrubs_running);
2341	wake_up(&fs_info->scrub_pause_wait);
2342
2343	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
2344
2345	if (progress)
2346		memcpy(progress, &sdev->stat, sizeof(*progress));
2347
2348	mutex_lock(&fs_info->scrub_lock);
2349	dev->scrub_device = NULL;
2350	mutex_unlock(&fs_info->scrub_lock);
2351
2352	scrub_free_dev(sdev);
2353	scrub_workers_put(root);
2354
2355	return ret;
2356}
2357
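/*
 * Raise scrub_pause_req and wait until all running scrubs have reacted to
 * it and entered the paused state. btrfs_scrub_continue() lets them resume.
 */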
2358void btrfs_scrub_pause(struct btrfs_root *root)
2359{
2360	struct btrfs_fs_info *fs_info = root->fs_info;
2361
2362	mutex_lock(&fs_info->scrub_lock);
2363	atomic_inc(&fs_info->scrub_pause_req);
2364	while (atomic_read(&fs_info->scrubs_paused) !=
2365	       atomic_read(&fs_info->scrubs_running)) {
2366		mutex_unlock(&fs_info->scrub_lock);
2367		wait_event(fs_info->scrub_pause_wait,
2368			   atomic_read(&fs_info->scrubs_paused) ==
2369			   atomic_read(&fs_info->scrubs_running));
2370		mutex_lock(&fs_info->scrub_lock);
2371	}
2372	mutex_unlock(&fs_info->scrub_lock);
2373}
2374
2375void btrfs_scrub_continue(struct btrfs_root *root)
2376{
2377	struct btrfs_fs_info *fs_info = root->fs_info;
2378
2379	atomic_dec(&fs_info->scrub_pause_req);
2380	wake_up(&fs_info->scrub_pause_wait);
2381}
2382
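/*
 * Take scrub_super_lock exclusively. The super blocks are scrubbed under
 * the read side of this lock, so holding it for writing keeps scrub away
 * from them until btrfs_scrub_continue_super() is called.
 */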
2383void btrfs_scrub_pause_super(struct btrfs_root *root)
2384{
2385	down_write(&root->fs_info->scrub_super_lock);
2386}
2387
2388void btrfs_scrub_continue_super(struct btrfs_root *root)
2389{
2390	up_write(&root->fs_info->scrub_super_lock);
2391}
2392
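/*
 * Cancel all scrubs running on this filesystem and wait until they have
 * finished. Returns -ENOTCONN if no scrub was running.
 */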
2393int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2394{
2395
2396	mutex_lock(&fs_info->scrub_lock);
2397	if (!atomic_read(&fs_info->scrubs_running)) {
2398		mutex_unlock(&fs_info->scrub_lock);
2399		return -ENOTCONN;
2400	}
2401
2402	atomic_inc(&fs_info->scrub_cancel_req);
2403	while (atomic_read(&fs_info->scrubs_running)) {
2404		mutex_unlock(&fs_info->scrub_lock);
2405		wait_event(fs_info->scrub_pause_wait,
2406			   atomic_read(&fs_info->scrubs_running) == 0);
2407		mutex_lock(&fs_info->scrub_lock);
2408	}
2409	atomic_dec(&fs_info->scrub_cancel_req);
2410	mutex_unlock(&fs_info->scrub_lock);
2411
2412	return 0;
2413}
2414
2415int btrfs_scrub_cancel(struct btrfs_root *root)
2416{
2417	return __btrfs_scrub_cancel(root->fs_info);
2418}
2419
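/*
 * Cancel the scrub running on 'dev', if any, and wait until the device's
 * scrub_device has been torn down. Returns -ENOTCONN if no scrub was
 * running on it.
 */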
2420int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2421{
2422	struct btrfs_fs_info *fs_info = root->fs_info;
2423	struct scrub_dev *sdev;
2424
2425	mutex_lock(&fs_info->scrub_lock);
2426	sdev = dev->scrub_device;
2427	if (!sdev) {
2428		mutex_unlock(&fs_info->scrub_lock);
2429		return -ENOTCONN;
2430	}
2431	atomic_inc(&sdev->cancel_req);
2432	while (dev->scrub_device) {
2433		mutex_unlock(&fs_info->scrub_lock);
2434		wait_event(fs_info->scrub_pause_wait,
2435			   dev->scrub_device == NULL);
2436		mutex_lock(&fs_info->scrub_lock);
2437	}
2438	mutex_unlock(&fs_info->scrub_lock);
2439
2440	return 0;
2441}
2442
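/*
 * Like btrfs_scrub_cancel_dev(), but looks the device up by 'devid' first.
 */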
2443int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2444{
2445	struct btrfs_fs_info *fs_info = root->fs_info;
2446	struct btrfs_device *dev;
2447	int ret;
2448
2449	/*
2450	 * we have to hold the device_list_mutex here so the device does not
2451	 * go away in btrfs_scrub_cancel_dev(). FIXME: find a better solution
2452	 */
2453	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2454	dev = btrfs_find_device(root, devid, NULL, NULL);
2455	if (!dev) {
2456		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2457		return -ENODEV;
2458	}
2459	ret = btrfs_scrub_cancel_dev(root, dev);
2460	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2461
2462	return ret;
2463}
2464
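/*
 * Copy the current scrub statistics of the device with the given 'devid'
 * into 'progress'. Returns -ENODEV if the device does not exist and
 * -ENOTCONN if no scrub is running on it.
 */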
2465int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2466			 struct btrfs_scrub_progress *progress)
2467{
2468	struct btrfs_device *dev;
2469	struct scrub_dev *sdev = NULL;
2470
2471	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2472	dev = btrfs_find_device(root, devid, NULL, NULL);
2473	if (dev)
2474		sdev = dev->scrub_device;
2475	if (sdev)
2476		memcpy(progress, &sdev->stat, sizeof(*progress));
2477	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2478
2479	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
2480}
2481