scrub.c revision a36cf8b8933e4a7a7f2f2cbc3c70b097e97f7fd1
1/*
2 * Copyright (C) 2011 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
21#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
28#include "check-integrity.h"
29#include "rcu-string.h"
30
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
43
44struct scrub_block;
45struct scrub_ctx;
46
47#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
48#define SCRUB_BIOS_PER_CTX	16	/* 1 MB per device in flight */
49#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
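/*
 * Sizing note (assuming 4k pages, as the comments above do):
 * SCRUB_PAGES_PER_BIO * PAGE_SIZE = 64k of data per bio, and
 * SCRUB_BIOS_PER_CTX * 64k = 1MB of read I/O in flight per scrubbed device.
 * A scrub_block covers at most SCRUB_MAX_PAGES_PER_BLOCK pages, i.e. one
 * node, leaf or sector of up to 64k.
 */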
50
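/*
 * A scrub_block describes one unit that is checksummed as a whole (a data
 * sector, a tree block or a super block) and owns up to
 * SCRUB_MAX_PAGES_PER_BLOCK pages. Each scrub_page records where one page of
 * that unit lives on disk and whether reading it succeeded.
 */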
51struct scrub_page {
52	struct scrub_block	*sblock;
53	struct page		*page;
54	struct btrfs_device	*dev;
55	u64			flags;  /* extent flags */
56	u64			generation;
57	u64			logical;
58	u64			physical;
59	struct {
60		unsigned int	mirror_num:8;
61		unsigned int	have_csum:1;
62		unsigned int	io_error:1;
63	};
64	u8			csum[BTRFS_CSUM_SIZE];
65};
66
67struct scrub_bio {
68	int			index;
69	struct scrub_ctx	*sctx;
70	struct btrfs_device	*dev;
71	struct bio		*bio;
72	int			err;
73	u64			logical;
74	u64			physical;
75	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
76	int			page_count;
77	int			next_free;
78	struct btrfs_work	work;
79};
80
81struct scrub_block {
82	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK];
83	int			page_count;
84	atomic_t		outstanding_pages;
85	atomic_t		ref_count; /* free mem on transition to zero */
86	struct scrub_ctx	*sctx;
87	struct {
88		unsigned int	header_error:1;
89		unsigned int	checksum_error:1;
90		unsigned int	no_io_error_seen:1;
91		unsigned int	generation_error:1; /* also sets header_error */
92	};
93};
94
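/*
 * The scrub_ctx owns a fixed pool of scrub_bios. Free bios are linked into a
 * list through next_free/first_free, curr is the index of the bio that is
 * currently being filled (-1 if none), and in_flight counts submitted bios.
 * Waiters sleep on list_wait until a bio is returned to the free list.
 */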
95struct scrub_ctx {
96	struct scrub_bio	*bios[SCRUB_BIOS_PER_CTX];
97	struct btrfs_root	*dev_root;
98	int			first_free;
99	int			curr;
100	atomic_t		in_flight;
101	atomic_t		fixup_cnt;
102	spinlock_t		list_lock;
103	wait_queue_head_t	list_wait;
104	u16			csum_size;
105	struct list_head	csum_list;
106	atomic_t		cancel_req;
107	int			readonly;
108	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
109	u32			sectorsize;
110	u32			nodesize;
111	u32			leafsize;
112	/*
113	 * statistics
114	 */
115	struct btrfs_scrub_progress stat;
116	spinlock_t		stat_lock;
117};
118
119struct scrub_fixup_nodatasum {
120	struct scrub_ctx	*sctx;
121	struct btrfs_device	*dev;
122	u64			logical;
123	struct btrfs_root	*root;
124	struct btrfs_work	work;
125	int			mirror_num;
126};
127
128struct scrub_warning {
129	struct btrfs_path	*path;
130	u64			extent_item_size;
131	char			*scratch_buf;
132	char			*msg_buf;
133	const char		*errstr;
134	sector_t		sector;
135	u64			logical;
136	struct btrfs_device	*dev;
137	int			msg_bufsize;
138	int			scratch_bufsize;
139};
140
141
142static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
143static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
144				     struct btrfs_mapping_tree *map_tree,
145				     u64 length, u64 logical,
146				     struct scrub_block *sblock);
147static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
148			       struct scrub_block *sblock, int is_metadata,
149			       int have_csum, u8 *csum, u64 generation,
150			       u16 csum_size);
151static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
152					 struct scrub_block *sblock,
153					 int is_metadata, int have_csum,
154					 const u8 *csum, u64 generation,
155					 u16 csum_size);
156static void scrub_complete_bio_end_io(struct bio *bio, int err);
157static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158					     struct scrub_block *sblock_good,
159					     int force_write);
160static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
161					    struct scrub_block *sblock_good,
162					    int page_num, int force_write);
163static int scrub_checksum_data(struct scrub_block *sblock);
164static int scrub_checksum_tree_block(struct scrub_block *sblock);
165static int scrub_checksum_super(struct scrub_block *sblock);
166static void scrub_block_get(struct scrub_block *sblock);
167static void scrub_block_put(struct scrub_block *sblock);
168static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
169				 struct scrub_page *spage);
170static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
171		       u64 physical, struct btrfs_device *dev, u64 flags,
172		       u64 gen, int mirror_num, u8 *csum, int force);
173static void scrub_bio_end_io(struct bio *bio, int err);
174static void scrub_bio_end_io_worker(struct btrfs_work *work);
175static void scrub_block_complete(struct scrub_block *sblock);
176
177
178static void scrub_free_csums(struct scrub_ctx *sctx)
179{
180	while (!list_empty(&sctx->csum_list)) {
181		struct btrfs_ordered_sum *sum;
182		sum = list_first_entry(&sctx->csum_list,
183				       struct btrfs_ordered_sum, list);
184		list_del(&sum->list);
185		kfree(sum);
186	}
187}
188
189static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
190{
191	int i;
192
193	if (!sctx)
194		return;
195
196	/* this can happen when scrub is cancelled */
197	if (sctx->curr != -1) {
198		struct scrub_bio *sbio = sctx->bios[sctx->curr];
199
200		for (i = 0; i < sbio->page_count; i++) {
201			BUG_ON(!sbio->pagev[i]);
202			BUG_ON(!sbio->pagev[i]->page);
203			scrub_block_put(sbio->pagev[i]->sblock);
204		}
205		bio_put(sbio->bio);
206	}
207
208	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
209		struct scrub_bio *sbio = sctx->bios[i];
210
211		if (!sbio)
212			break;
213		kfree(sbio);
214	}
215
216	scrub_free_csums(sctx);
217	kfree(sctx);
218}
219
220static noinline_for_stack
221struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev)
222{
223	struct scrub_ctx *sctx;
224	int		i;
225	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
226	int pages_per_bio;
227
228	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
229			      bio_get_nr_vecs(dev->bdev));
230	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
231	if (!sctx)
232		goto nomem;
233	sctx->pages_per_bio = pages_per_bio;
234	sctx->curr = -1;
235	sctx->dev_root = dev->dev_root;
236	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
237		struct scrub_bio *sbio;
238
239		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
240		if (!sbio)
241			goto nomem;
242		sctx->bios[i] = sbio;
243
244		sbio->index = i;
245		sbio->sctx = sctx;
246		sbio->page_count = 0;
247		sbio->work.func = scrub_bio_end_io_worker;
248
249		if (i != SCRUB_BIOS_PER_CTX - 1)
250			sctx->bios[i]->next_free = i + 1;
251		else
252			sctx->bios[i]->next_free = -1;
253	}
254	sctx->first_free = 0;
255	sctx->nodesize = dev->dev_root->nodesize;
256	sctx->leafsize = dev->dev_root->leafsize;
257	sctx->sectorsize = dev->dev_root->sectorsize;
258	atomic_set(&sctx->in_flight, 0);
259	atomic_set(&sctx->fixup_cnt, 0);
260	atomic_set(&sctx->cancel_req, 0);
261	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
262	INIT_LIST_HEAD(&sctx->csum_list);
263
264	spin_lock_init(&sctx->list_lock);
265	spin_lock_init(&sctx->stat_lock);
266	init_waitqueue_head(&sctx->list_wait);
267	return sctx;
268
269nomem:
270	scrub_free_ctx(sctx);
271	return ERR_PTR(-ENOMEM);
272}
273
274static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
275{
276	u64 isize;
277	u32 nlink;
278	int ret;
279	int i;
280	struct extent_buffer *eb;
281	struct btrfs_inode_item *inode_item;
282	struct scrub_warning *swarn = ctx;
283	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
284	struct inode_fs_paths *ipath = NULL;
285	struct btrfs_root *local_root;
286	struct btrfs_key root_key;
287
288	root_key.objectid = root;
289	root_key.type = BTRFS_ROOT_ITEM_KEY;
290	root_key.offset = (u64)-1;
291	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
292	if (IS_ERR(local_root)) {
293		ret = PTR_ERR(local_root);
294		goto err;
295	}
296
297	ret = inode_item_info(inum, 0, local_root, swarn->path);
298	if (ret) {
299		btrfs_release_path(swarn->path);
300		goto err;
301	}
302
303	eb = swarn->path->nodes[0];
304	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
305					struct btrfs_inode_item);
306	isize = btrfs_inode_size(eb, inode_item);
307	nlink = btrfs_inode_nlink(eb, inode_item);
308	btrfs_release_path(swarn->path);
309
310	ipath = init_ipath(4096, local_root, swarn->path);
311	if (IS_ERR(ipath)) {
312		ret = PTR_ERR(ipath);
313		ipath = NULL;
314		goto err;
315	}
316	ret = paths_from_inode(inum, ipath);
317
318	if (ret < 0)
319		goto err;
320
	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
325	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
326		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
327			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
328			"length %llu, links %u (path: %s)\n", swarn->errstr,
329			swarn->logical, rcu_str_deref(swarn->dev->name),
330			(unsigned long long)swarn->sector, root, inum, offset,
331			min(isize - offset, (u64)PAGE_SIZE), nlink,
332			(char *)(unsigned long)ipath->fspath->val[i]);
333
334	free_ipath(ipath);
335	return 0;
336
337err:
338	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
339		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
340		"resolving failed with ret=%d\n", swarn->errstr,
341		swarn->logical, rcu_str_deref(swarn->dev->name),
342		(unsigned long long)swarn->sector, root, inum, offset, ret);
343
344	free_ipath(ipath);
345	return 0;
346}
347
348static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
349{
350	struct btrfs_device *dev;
351	struct btrfs_fs_info *fs_info;
352	struct btrfs_path *path;
353	struct btrfs_key found_key;
354	struct extent_buffer *eb;
355	struct btrfs_extent_item *ei;
356	struct scrub_warning swarn;
357	unsigned long ptr = 0;
358	u64 extent_item_pos;
359	u64 flags = 0;
360	u64 ref_root;
361	u32 item_size;
362	u8 ref_level;
363	const int bufsize = 4096;
364	int ret;
365
366	WARN_ON(sblock->page_count < 1);
367	dev = sblock->pagev[0].dev;
368	fs_info = sblock->sctx->dev_root->fs_info;
369
370	path = btrfs_alloc_path();
371
372	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
373	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
374	swarn.sector = (sblock->pagev[0].physical) >> 9;
375	swarn.logical = sblock->pagev[0].logical;
376	swarn.errstr = errstr;
377	swarn.dev = NULL;
378	swarn.msg_bufsize = bufsize;
379	swarn.scratch_bufsize = bufsize;
380
381	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
382		goto out;
383
384	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
385				  &flags);
386	if (ret < 0)
387		goto out;
388
389	extent_item_pos = swarn.logical - found_key.objectid;
390	swarn.extent_item_size = found_key.offset;
391
392	eb = path->nodes[0];
393	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
394	item_size = btrfs_item_size_nr(eb, path->slots[0]);
395	btrfs_release_path(path);
396
397	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
398		do {
399			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
400							&ref_root, &ref_level);
401			printk_in_rcu(KERN_WARNING
402				"btrfs: %s at logical %llu on dev %s, "
403				"sector %llu: metadata %s (level %d) in tree "
404				"%llu\n", errstr, swarn.logical,
405				rcu_str_deref(dev->name),
406				(unsigned long long)swarn.sector,
407				ref_level ? "node" : "leaf",
408				ret < 0 ? -1 : ref_level,
409				ret < 0 ? -1 : ref_root);
410		} while (ret != 1);
411	} else {
412		swarn.path = path;
413		swarn.dev = dev;
414		iterate_extent_inodes(fs_info, found_key.objectid,
415					extent_item_pos, 1,
416					scrub_print_warning_inode, &swarn);
417	}
418
419out:
420	btrfs_free_path(path);
421	kfree(swarn.scratch_buf);
422	kfree(swarn.msg_buf);
423}
424
425static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
426{
427	struct page *page = NULL;
428	unsigned long index;
429	struct scrub_fixup_nodatasum *fixup = ctx;
430	int ret;
431	int corrected = 0;
432	struct btrfs_key key;
433	struct inode *inode = NULL;
434	u64 end = offset + PAGE_SIZE - 1;
435	struct btrfs_root *local_root;
436
437	key.objectid = root;
438	key.type = BTRFS_ROOT_ITEM_KEY;
439	key.offset = (u64)-1;
440	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
441	if (IS_ERR(local_root))
442		return PTR_ERR(local_root);
443
444	key.type = BTRFS_INODE_ITEM_KEY;
445	key.objectid = inum;
446	key.offset = 0;
447	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
448	if (IS_ERR(inode))
449		return PTR_ERR(inode);
450
451	index = offset >> PAGE_CACHE_SHIFT;
452
453	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
454	if (!page) {
455		ret = -ENOMEM;
456		goto out;
457	}
458
459	if (PageUptodate(page)) {
460		struct btrfs_mapping_tree *map_tree;
461		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defective sector.
			 * the data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the time being, we treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * no dirty page is in memory by then.
			 */
478			ret = -EIO;
479			goto out;
480		}
481		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
482		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
483					fixup->logical, page,
484					fixup->mirror_num);
485		unlock_page(page);
486		corrected = !ret;
487	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us; we just have to make
		 * sure we read the bad mirror.
		 */
493		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
494					EXTENT_DAMAGED, GFP_NOFS);
495		if (ret) {
496			/* set_extent_bits should give proper error */
497			WARN_ON(ret > 0);
498			if (ret > 0)
499				ret = -EFAULT;
500			goto out;
501		}
502
503		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
504						btrfs_get_extent,
505						fixup->mirror_num);
506		wait_on_page_locked(page);
507
508		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
509						end, EXTENT_DAMAGED, 0, NULL);
510		if (!corrected)
511			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
512						EXTENT_DAMAGED, GFP_NOFS);
513	}
514
515out:
516	if (page)
517		put_page(page);
518	if (inode)
519		iput(inode);
520
521	if (ret < 0)
522		return ret;
523
524	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent, so make iterate_extent_inodes stop
		 */
529		return 1;
530	}
531
532	return -EIO;
533}
534
535static void scrub_fixup_nodatasum(struct btrfs_work *work)
536{
537	int ret;
538	struct scrub_fixup_nodatasum *fixup;
539	struct scrub_ctx *sctx;
540	struct btrfs_trans_handle *trans = NULL;
541	struct btrfs_fs_info *fs_info;
542	struct btrfs_path *path;
543	int uncorrectable = 0;
544
545	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
546	sctx = fixup->sctx;
547	fs_info = fixup->root->fs_info;
548
549	path = btrfs_alloc_path();
550	if (!path) {
551		spin_lock(&sctx->stat_lock);
552		++sctx->stat.malloc_errors;
553		spin_unlock(&sctx->stat_lock);
554		uncorrectable = 1;
555		goto out;
556	}
557
558	trans = btrfs_join_transaction(fixup->root);
559	if (IS_ERR(trans)) {
560		uncorrectable = 1;
561		goto out;
562	}
563
	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * copy number (mirror) of the failed sector, so that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once the read is finished) and rewrite the failed sector if a good
	 * copy can be found.
	 */
573	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
574						path, scrub_fixup_readpage,
575						fixup);
576	if (ret < 0) {
577		uncorrectable = 1;
578		goto out;
579	}
580	WARN_ON(ret != 1);
581
582	spin_lock(&sctx->stat_lock);
583	++sctx->stat.corrected_errors;
584	spin_unlock(&sctx->stat_lock);
585
586out:
587	if (trans && !IS_ERR(trans))
588		btrfs_end_transaction(trans, fixup->root);
589	if (uncorrectable) {
590		spin_lock(&sctx->stat_lock);
591		++sctx->stat.uncorrectable_errors;
592		spin_unlock(&sctx->stat_lock);
593
594		printk_ratelimited_in_rcu(KERN_ERR
595			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
596			(unsigned long long)fixup->logical,
597			rcu_str_deref(fixup->dev->name));
598	}
599
600	btrfs_free_path(path);
601	kfree(fixup);
602
	/* see the caller for why we pretend to be paused in the scrub counters */
604	mutex_lock(&fs_info->scrub_lock);
605	atomic_dec(&fs_info->scrubs_running);
606	atomic_dec(&fs_info->scrubs_paused);
607	mutex_unlock(&fs_info->scrub_lock);
608	atomic_dec(&sctx->fixup_cnt);
609	wake_up(&fs_info->scrub_pause_wait);
610	wake_up(&sctx->list_wait);
611}
612
613/*
614 * scrub_handle_errored_block gets called when either verification of the
615 * pages failed or the bio failed to read, e.g. with EIO. In the latter
616 * case, this function handles all pages in the bio, even though only one
617 * may be bad.
618 * The goal of this function is to repair the errored block by using the
619 * contents of one of the mirrors.
620 */
621static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
622{
623	struct scrub_ctx *sctx = sblock_to_check->sctx;
624	struct btrfs_device *dev;
625	struct btrfs_fs_info *fs_info;
626	u64 length;
627	u64 logical;
628	u64 generation;
629	unsigned int failed_mirror_index;
630	unsigned int is_metadata;
631	unsigned int have_csum;
632	u8 *csum;
633	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
634	struct scrub_block *sblock_bad;
635	int ret;
636	int mirror_index;
637	int page_num;
638	int success;
639	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
640				      DEFAULT_RATELIMIT_BURST);
641
642	BUG_ON(sblock_to_check->page_count < 1);
643	fs_info = sctx->dev_root->fs_info;
644	length = sblock_to_check->page_count * PAGE_SIZE;
645	logical = sblock_to_check->pagev[0].logical;
646	generation = sblock_to_check->pagev[0].generation;
647	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
648	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
649	is_metadata = !(sblock_to_check->pagev[0].flags &
650			BTRFS_EXTENT_FLAG_DATA);
651	have_csum = sblock_to_check->pagev[0].have_csum;
652	csum = sblock_to_check->pagev[0].csum;
653	dev = sblock_to_check->pagev[0].dev;
654
	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (which is what caused
	 * this fixup code to be called), but this time page by page, in
	 * order to know which pages caused I/O errors and which ones are
	 * good (for all mirrors).
	 * The goal is to handle the situation when more than one mirror
	 * contains I/O errors, but the errors do not overlap, i.e. the data
	 * can be repaired by selecting the pages from those mirrors without
	 * I/O error on the particular pages. One example (with blocks >=
	 * 2 * PAGE_SIZE) would be that mirror #1 has an I/O error on the
	 * first page, the second page is good, and mirror #2 has an I/O
	 * error on the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by taking
	 * the first page of the second mirror, and the second page of the
	 * second mirror can be repaired by copying the contents of the 2nd
	 * page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O errors, the
	 * checksum cannot be verified. In order to get the best data for
	 * repairing, the first attempt is to find a mirror without I/O
	 * errors and with a validated checksum. Only if this is not
	 * possible are pages picked from mirrors with I/O errors, without
	 * considering the checksum.
	 * If the latter is the case, at the end the checksum of the
	 * repaired area is verified in order to correctly maintain the
	 * statistics.
	 */
683
684	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
685				     sizeof(*sblocks_for_recheck),
686				     GFP_NOFS);
687	if (!sblocks_for_recheck) {
688		spin_lock(&sctx->stat_lock);
689		sctx->stat.malloc_errors++;
690		sctx->stat.read_errors++;
691		sctx->stat.uncorrectable_errors++;
692		spin_unlock(&sctx->stat_lock);
693		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
694		goto out;
695	}
696
697	/* setup the context, map the logical blocks and alloc the pages */
698	ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length,
699					logical, sblocks_for_recheck);
700	if (ret) {
701		spin_lock(&sctx->stat_lock);
702		sctx->stat.read_errors++;
703		sctx->stat.uncorrectable_errors++;
704		spin_unlock(&sctx->stat_lock);
705		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
706		goto out;
707	}
708	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
709	sblock_bad = sblocks_for_recheck + failed_mirror_index;
710
711	/* build and submit the bios for the failed mirror, check checksums */
712	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
713				  csum, generation, sctx->csum_size);
714	if (ret) {
715		spin_lock(&sctx->stat_lock);
716		sctx->stat.read_errors++;
717		sctx->stat.uncorrectable_errors++;
718		spin_unlock(&sctx->stat_lock);
719		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
720		goto out;
721	}
722
723	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
724	    sblock_bad->no_io_error_seen) {
725		/*
726		 * the error disappeared after reading page by page, or
727		 * the area was part of a huge bio and other parts of the
728		 * bio caused I/O errors, or the block layer merged several
729		 * read requests into one and the error is caused by a
730		 * different bio (usually one of the two latter cases is
731		 * the cause)
732		 */
733		spin_lock(&sctx->stat_lock);
734		sctx->stat.unverified_errors++;
735		spin_unlock(&sctx->stat_lock);
736
737		goto out;
738	}
739
740	if (!sblock_bad->no_io_error_seen) {
741		spin_lock(&sctx->stat_lock);
742		sctx->stat.read_errors++;
743		spin_unlock(&sctx->stat_lock);
744		if (__ratelimit(&_rs))
745			scrub_print_warning("i/o error", sblock_to_check);
746		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
747	} else if (sblock_bad->checksum_error) {
748		spin_lock(&sctx->stat_lock);
749		sctx->stat.csum_errors++;
750		spin_unlock(&sctx->stat_lock);
751		if (__ratelimit(&_rs))
752			scrub_print_warning("checksum error", sblock_to_check);
753		btrfs_dev_stat_inc_and_print(dev,
754					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
755	} else if (sblock_bad->header_error) {
756		spin_lock(&sctx->stat_lock);
757		sctx->stat.verify_errors++;
758		spin_unlock(&sctx->stat_lock);
759		if (__ratelimit(&_rs))
760			scrub_print_warning("checksum/header error",
761					    sblock_to_check);
762		if (sblock_bad->generation_error)
763			btrfs_dev_stat_inc_and_print(dev,
764				BTRFS_DEV_STAT_GENERATION_ERRS);
765		else
766			btrfs_dev_stat_inc_and_print(dev,
767				BTRFS_DEV_STAT_CORRUPTION_ERRS);
768	}
769
770	if (sctx->readonly)
771		goto did_not_correct_error;
772
773	if (!is_metadata && !have_csum) {
774		struct scrub_fixup_nodatasum *fixup_nodatasum;
775
		/*
		 * !is_metadata and !have_csum means that the data might
		 * not be COWed and might be modified concurrently. The
		 * general strategy of working on the commit root does
		 * not help in the case when COW is not used.
		 */
783		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
784		if (!fixup_nodatasum)
785			goto did_not_correct_error;
786		fixup_nodatasum->sctx = sctx;
787		fixup_nodatasum->dev = dev;
788		fixup_nodatasum->logical = logical;
789		fixup_nodatasum->root = fs_info->extent_root;
790		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		/*
		 * increment scrubs_running to prevent cancel requests from
		 * completing as long as a fixup worker is running. we must also
		 * increment scrubs_paused to prevent deadlocking on pause
		 * requests used for transaction commits (as the worker uses a
		 * transaction context). it is safe to regard the fixup worker
		 * as paused for all practical matters. effectively, we only
		 * prevent cancellation requests from completing.
		 */
800		mutex_lock(&fs_info->scrub_lock);
801		atomic_inc(&fs_info->scrubs_running);
802		atomic_inc(&fs_info->scrubs_paused);
803		mutex_unlock(&fs_info->scrub_lock);
804		atomic_inc(&sctx->fixup_cnt);
805		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
806		btrfs_queue_worker(&fs_info->scrub_workers,
807				   &fixup_nodatasum->work);
808		goto out;
809	}
810
811	/*
812	 * now build and submit the bios for the other mirrors, check
813	 * checksums
814	 */
815	for (mirror_index = 0;
816	     mirror_index < BTRFS_MAX_MIRRORS &&
817	     sblocks_for_recheck[mirror_index].page_count > 0;
818	     mirror_index++) {
819		if (mirror_index == failed_mirror_index)
820			continue;
821
822		/* build and submit the bios, check checksums */
823		ret = scrub_recheck_block(fs_info,
824					  sblocks_for_recheck + mirror_index,
825					  is_metadata, have_csum, csum,
826					  generation, sctx->csum_size);
827		if (ret)
828			goto did_not_correct_error;
829	}
830
	/*
	 * first try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
844	for (mirror_index = 0;
845	     mirror_index < BTRFS_MAX_MIRRORS &&
846	     sblocks_for_recheck[mirror_index].page_count > 0;
847	     mirror_index++) {
848		struct scrub_block *sblock_other = sblocks_for_recheck +
849						   mirror_index;
850
851		if (!sblock_other->header_error &&
852		    !sblock_other->checksum_error &&
853		    sblock_other->no_io_error_seen) {
854			int force_write = is_metadata || have_csum;
855
856			ret = scrub_repair_block_from_good_copy(sblock_bad,
857								sblock_other,
858								force_write);
859			if (0 == ret)
860				goto corrected_error;
861		}
862	}
863
	/*
	 * in case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistics counting and for the
	 * final scrub report, to state whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, to see whether
	 * the final checksum then succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector size (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
	 * mirror could be repaired by taking 512 bytes of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */
888
889	/* can only fix I/O errors from here on */
890	if (sblock_bad->no_io_error_seen)
891		goto did_not_correct_error;
892
893	success = 1;
894	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
895		struct scrub_page *page_bad = sblock_bad->pagev + page_num;
896
897		if (!page_bad->io_error)
898			continue;
899
900		for (mirror_index = 0;
901		     mirror_index < BTRFS_MAX_MIRRORS &&
902		     sblocks_for_recheck[mirror_index].page_count > 0;
903		     mirror_index++) {
904			struct scrub_block *sblock_other = sblocks_for_recheck +
905							   mirror_index;
906			struct scrub_page *page_other = sblock_other->pagev +
907							page_num;
908
909			if (!page_other->io_error) {
910				ret = scrub_repair_page_from_good_copy(
911					sblock_bad, sblock_other, page_num, 0);
912				if (0 == ret) {
913					page_bad->io_error = 0;
914					break; /* succeeded for this page */
915				}
916			}
917		}
918
919		if (page_bad->io_error) {
920			/* did not find a mirror to copy the page from */
921			success = 0;
922		}
923	}
924
925	if (success) {
926		if (is_metadata || have_csum) {
927			/*
928			 * need to verify the checksum now that all
929			 * sectors on disk are repaired (the write
930			 * request for data to be repaired is on its way).
931			 * Just be lazy and use scrub_recheck_block()
932			 * which re-reads the data before the checksum
933			 * is verified, but most likely the data comes out
934			 * of the page cache.
935			 */
936			ret = scrub_recheck_block(fs_info, sblock_bad,
937						  is_metadata, have_csum, csum,
938						  generation, sctx->csum_size);
939			if (!ret && !sblock_bad->header_error &&
940			    !sblock_bad->checksum_error &&
941			    sblock_bad->no_io_error_seen)
942				goto corrected_error;
943			else
944				goto did_not_correct_error;
945		} else {
946corrected_error:
947			spin_lock(&sctx->stat_lock);
948			sctx->stat.corrected_errors++;
949			spin_unlock(&sctx->stat_lock);
950			printk_ratelimited_in_rcu(KERN_ERR
951				"btrfs: fixed up error at logical %llu on dev %s\n",
952				(unsigned long long)logical,
953				rcu_str_deref(dev->name));
954		}
955	} else {
956did_not_correct_error:
957		spin_lock(&sctx->stat_lock);
958		sctx->stat.uncorrectable_errors++;
959		spin_unlock(&sctx->stat_lock);
960		printk_ratelimited_in_rcu(KERN_ERR
961			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
962			(unsigned long long)logical,
963			rcu_str_deref(dev->name));
964	}
965
966out:
967	if (sblocks_for_recheck) {
968		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
969		     mirror_index++) {
970			struct scrub_block *sblock = sblocks_for_recheck +
971						     mirror_index;
972			int page_index;
973
974			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
975			     page_index++)
976				if (sblock->pagev[page_index].page)
977					__free_page(
978						sblock->pagev[page_index].page);
979		}
980		kfree(sblocks_for_recheck);
981	}
982
983	return 0;
984}
985
986static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
987				     struct btrfs_mapping_tree *map_tree,
988				     u64 length, u64 logical,
989				     struct scrub_block *sblocks_for_recheck)
990{
991	int page_index;
992	int mirror_index;
993	int ret;
994
995	/*
996	 * note: the three members sctx, ref_count and outstanding_pages
997	 * are not used (and not set) in the blocks that are used for
998	 * the recheck procedure
999	 */
1000
1001	page_index = 0;
1002	while (length > 0) {
1003		u64 sublen = min_t(u64, length, PAGE_SIZE);
1004		u64 mapped_length = sublen;
1005		struct btrfs_bio *bbio = NULL;
1006
1007		/*
1008		 * with a length of PAGE_SIZE, each returned stripe
1009		 * represents one mirror
1010		 */
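		/*
		 * note: mapping with WRITE is what makes btrfs_map_block
		 * return one stripe per mirror here; a plain READ mapping
		 * would pick just a single copy
		 */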
1011		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
1012				      &bbio, 0);
1013		if (ret || !bbio || mapped_length < sublen) {
1014			kfree(bbio);
1015			return -EIO;
1016		}
1017
1018		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
1019		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1020		     mirror_index++) {
1021			struct scrub_block *sblock;
1022			struct scrub_page *page;
1023
1024			if (mirror_index >= BTRFS_MAX_MIRRORS)
1025				continue;
1026
1027			sblock = sblocks_for_recheck + mirror_index;
1028			page = sblock->pagev + page_index;
1029			page->logical = logical;
1030			page->physical = bbio->stripes[mirror_index].physical;
1031			/* for missing devices, dev->bdev is NULL */
1032			page->dev = bbio->stripes[mirror_index].dev;
1033			page->mirror_num = mirror_index + 1;
1034			page->page = alloc_page(GFP_NOFS);
1035			if (!page->page) {
1036				spin_lock(&sctx->stat_lock);
1037				sctx->stat.malloc_errors++;
1038				spin_unlock(&sctx->stat_lock);
1039				kfree(bbio);
1040				return -ENOMEM;
1041			}
1042			sblock->page_count++;
1043		}
1044		kfree(bbio);
1045		length -= sublen;
1046		logical += sublen;
1047		page_index++;
1048	}
1049
1050	return 0;
1051}
1052
1053/*
1054 * this function will check the on disk data for checksum errors, header
1055 * errors and read I/O errors. If any I/O errors happen, the exact pages
1056 * which are errored are marked as being bad. The goal is to enable scrub
1057 * to take those pages that are not errored from all the mirrors so that
1058 * the pages that are errored in the just handled mirror can be repaired.
1059 */
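/*
 * The block is re-read with one single-page bio per page, so an I/O error can
 * be attributed to the exact page that caused it.
 */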
1060static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1061			       struct scrub_block *sblock, int is_metadata,
1062			       int have_csum, u8 *csum, u64 generation,
1063			       u16 csum_size)
1064{
1065	int page_num;
1066
1067	sblock->no_io_error_seen = 1;
1068	sblock->header_error = 0;
1069	sblock->checksum_error = 0;
1070
1071	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1072		struct bio *bio;
1073		int ret;
1074		struct scrub_page *page = sblock->pagev + page_num;
1075		DECLARE_COMPLETION_ONSTACK(complete);
1076
1077		if (page->dev->bdev == NULL) {
1078			page->io_error = 1;
1079			sblock->no_io_error_seen = 0;
1080			continue;
1081		}
1082
1083		BUG_ON(!page->page);
1084		bio = bio_alloc(GFP_NOFS, 1);
1085		if (!bio)
1086			return -EIO;
1087		bio->bi_bdev = page->dev->bdev;
1088		bio->bi_sector = page->physical >> 9;
1089		bio->bi_end_io = scrub_complete_bio_end_io;
1090		bio->bi_private = &complete;
1091
1092		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
1093		if (PAGE_SIZE != ret) {
1094			bio_put(bio);
1095			return -EIO;
1096		}
1097		btrfsic_submit_bio(READ, bio);
1098
1099		/* this will also unplug the queue */
1100		wait_for_completion(&complete);
1101
1102		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1103		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1104			sblock->no_io_error_seen = 0;
1105		bio_put(bio);
1106	}
1107
1108	if (sblock->no_io_error_seen)
1109		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1110					     have_csum, csum, generation,
1111					     csum_size);
1112
1113	return 0;
1114}
1115
1116static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1117					 struct scrub_block *sblock,
1118					 int is_metadata, int have_csum,
1119					 const u8 *csum, u64 generation,
1120					 u16 csum_size)
1121{
1122	int page_num;
1123	u8 calculated_csum[BTRFS_CSUM_SIZE];
1124	u32 crc = ~(u32)0;
1125	struct btrfs_root *root = fs_info->extent_root;
1126	void *mapped_buffer;
1127
1128	BUG_ON(!sblock->pagev[0].page);
1129	if (is_metadata) {
1130		struct btrfs_header *h;
1131
1132		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
1133		h = (struct btrfs_header *)mapped_buffer;
1134
1135		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
1136		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1137		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1138			   BTRFS_UUID_SIZE)) {
1139			sblock->header_error = 1;
1140		} else if (generation != le64_to_cpu(h->generation)) {
1141			sblock->header_error = 1;
1142			sblock->generation_error = 1;
1143		}
1144		csum = h->csum;
1145	} else {
1146		if (!have_csum)
1147			return;
1148
1149		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
1150	}
1151
1152	for (page_num = 0;;) {
1153		if (page_num == 0 && is_metadata)
1154			crc = btrfs_csum_data(root,
1155				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1156				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1157		else
1158			crc = btrfs_csum_data(root, mapped_buffer, crc,
1159					      PAGE_SIZE);
1160
1161		kunmap_atomic(mapped_buffer);
1162		page_num++;
1163		if (page_num >= sblock->page_count)
1164			break;
1165		BUG_ON(!sblock->pagev[page_num].page);
1166
1167		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
1168	}
1169
1170	btrfs_csum_final(crc, calculated_csum);
1171	if (memcmp(calculated_csum, csum, csum_size))
1172		sblock->checksum_error = 1;
1173}
1174
1175static void scrub_complete_bio_end_io(struct bio *bio, int err)
1176{
1177	complete((struct completion *)bio->bi_private);
1178}
1179
1180static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1181					     struct scrub_block *sblock_good,
1182					     int force_write)
1183{
1184	int page_num;
1185	int ret = 0;
1186
1187	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1188		int ret_sub;
1189
1190		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1191							   sblock_good,
1192							   page_num,
1193							   force_write);
1194		if (ret_sub)
1195			ret = ret_sub;
1196	}
1197
1198	return ret;
1199}
1200
1201static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1202					    struct scrub_block *sblock_good,
1203					    int page_num, int force_write)
1204{
1205	struct scrub_page *page_bad = sblock_bad->pagev + page_num;
1206	struct scrub_page *page_good = sblock_good->pagev + page_num;
1207
1208	BUG_ON(sblock_bad->pagev[page_num].page == NULL);
1209	BUG_ON(sblock_good->pagev[page_num].page == NULL);
1210	if (force_write || sblock_bad->header_error ||
1211	    sblock_bad->checksum_error || page_bad->io_error) {
1212		struct bio *bio;
1213		int ret;
1214		DECLARE_COMPLETION_ONSTACK(complete);
1215
1216		bio = bio_alloc(GFP_NOFS, 1);
1217		if (!bio)
1218			return -EIO;
1219		bio->bi_bdev = page_bad->dev->bdev;
1220		bio->bi_sector = page_bad->physical >> 9;
1221		bio->bi_end_io = scrub_complete_bio_end_io;
1222		bio->bi_private = &complete;
1223
1224		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1225		if (PAGE_SIZE != ret) {
1226			bio_put(bio);
1227			return -EIO;
1228		}
1229		btrfsic_submit_bio(WRITE, bio);
1230
1231		/* this will also unplug the queue */
1232		wait_for_completion(&complete);
1233		if (!bio_flagged(bio, BIO_UPTODATE)) {
1234			btrfs_dev_stat_inc_and_print(page_bad->dev,
1235				BTRFS_DEV_STAT_WRITE_ERRS);
1236			bio_put(bio);
1237			return -EIO;
1238		}
1239		bio_put(bio);
1240	}
1241
1242	return 0;
1243}
1244
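/*
 * dispatch the checksum verification based on the extent flags recorded for
 * this block. Super block errors are only reported (see scrub_checksum_super);
 * data and metadata errors are handed to scrub_handle_errored_block for
 * repair.
 */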
1245static void scrub_checksum(struct scrub_block *sblock)
1246{
1247	u64 flags;
1248	int ret;
1249
1250	BUG_ON(sblock->page_count < 1);
1251	flags = sblock->pagev[0].flags;
1252	ret = 0;
1253	if (flags & BTRFS_EXTENT_FLAG_DATA)
1254		ret = scrub_checksum_data(sblock);
1255	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1256		ret = scrub_checksum_tree_block(sblock);
1257	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1258		(void)scrub_checksum_super(sblock);
1259	else
1260		WARN_ON(1);
1261	if (ret)
1262		scrub_handle_errored_block(sblock);
1263}
1264
1265static int scrub_checksum_data(struct scrub_block *sblock)
1266{
1267	struct scrub_ctx *sctx = sblock->sctx;
1268	u8 csum[BTRFS_CSUM_SIZE];
1269	u8 *on_disk_csum;
1270	struct page *page;
1271	void *buffer;
1272	u32 crc = ~(u32)0;
1273	int fail = 0;
1274	struct btrfs_root *root = sctx->dev_root;
1275	u64 len;
1276	int index;
1277
1278	BUG_ON(sblock->page_count < 1);
1279	if (!sblock->pagev[0].have_csum)
1280		return 0;
1281
1282	on_disk_csum = sblock->pagev[0].csum;
1283	page = sblock->pagev[0].page;
1284	buffer = kmap_atomic(page);
1285
1286	len = sctx->sectorsize;
1287	index = 0;
1288	for (;;) {
1289		u64 l = min_t(u64, len, PAGE_SIZE);
1290
1291		crc = btrfs_csum_data(root, buffer, crc, l);
1292		kunmap_atomic(buffer);
1293		len -= l;
1294		if (len == 0)
1295			break;
1296		index++;
1297		BUG_ON(index >= sblock->page_count);
1298		BUG_ON(!sblock->pagev[index].page);
1299		page = sblock->pagev[index].page;
1300		buffer = kmap_atomic(page);
1301	}
1302
1303	btrfs_csum_final(crc, csum);
1304	if (memcmp(csum, on_disk_csum, sctx->csum_size))
1305		fail = 1;
1306
1307	return fail;
1308}
1309
1310static int scrub_checksum_tree_block(struct scrub_block *sblock)
1311{
1312	struct scrub_ctx *sctx = sblock->sctx;
1313	struct btrfs_header *h;
1314	struct btrfs_root *root = sctx->dev_root;
1315	struct btrfs_fs_info *fs_info = root->fs_info;
1316	u8 calculated_csum[BTRFS_CSUM_SIZE];
1317	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1318	struct page *page;
1319	void *mapped_buffer;
1320	u64 mapped_size;
1321	void *p;
1322	u32 crc = ~(u32)0;
1323	int fail = 0;
1324	int crc_fail = 0;
1325	u64 len;
1326	int index;
1327
1328	BUG_ON(sblock->page_count < 1);
1329	page = sblock->pagev[0].page;
1330	mapped_buffer = kmap_atomic(page);
1331	h = (struct btrfs_header *)mapped_buffer;
1332	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1333
1334	/*
1335	 * we don't use the getter functions here, as we
1336	 * a) don't have an extent buffer and
1337	 * b) the page is already kmapped
1338	 */
1339
1340	if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
1341		++fail;
1342
1343	if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
1344		++fail;
1345
1346	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1347		++fail;
1348
1349	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1350		   BTRFS_UUID_SIZE))
1351		++fail;
1352
1353	BUG_ON(sctx->nodesize != sctx->leafsize);
1354	len = sctx->nodesize - BTRFS_CSUM_SIZE;
1355	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1356	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1357	index = 0;
1358	for (;;) {
1359		u64 l = min_t(u64, len, mapped_size);
1360
1361		crc = btrfs_csum_data(root, p, crc, l);
1362		kunmap_atomic(mapped_buffer);
1363		len -= l;
1364		if (len == 0)
1365			break;
1366		index++;
1367		BUG_ON(index >= sblock->page_count);
1368		BUG_ON(!sblock->pagev[index].page);
1369		page = sblock->pagev[index].page;
1370		mapped_buffer = kmap_atomic(page);
1371		mapped_size = PAGE_SIZE;
1372		p = mapped_buffer;
1373	}
1374
1375	btrfs_csum_final(crc, calculated_csum);
1376	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1377		++crc_fail;
1378
1379	return fail || crc_fail;
1380}
1381
1382static int scrub_checksum_super(struct scrub_block *sblock)
1383{
1384	struct btrfs_super_block *s;
1385	struct scrub_ctx *sctx = sblock->sctx;
1386	struct btrfs_root *root = sctx->dev_root;
1387	struct btrfs_fs_info *fs_info = root->fs_info;
1388	u8 calculated_csum[BTRFS_CSUM_SIZE];
1389	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1390	struct page *page;
1391	void *mapped_buffer;
1392	u64 mapped_size;
1393	void *p;
1394	u32 crc = ~(u32)0;
1395	int fail_gen = 0;
1396	int fail_cor = 0;
1397	u64 len;
1398	int index;
1399
1400	BUG_ON(sblock->page_count < 1);
1401	page = sblock->pagev[0].page;
1402	mapped_buffer = kmap_atomic(page);
1403	s = (struct btrfs_super_block *)mapped_buffer;
1404	memcpy(on_disk_csum, s->csum, sctx->csum_size);
1405
1406	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
1407		++fail_cor;
1408
1409	if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
1410		++fail_gen;
1411
1412	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1413		++fail_cor;
1414
1415	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1416	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1417	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1418	index = 0;
1419	for (;;) {
1420		u64 l = min_t(u64, len, mapped_size);
1421
1422		crc = btrfs_csum_data(root, p, crc, l);
1423		kunmap_atomic(mapped_buffer);
1424		len -= l;
1425		if (len == 0)
1426			break;
1427		index++;
1428		BUG_ON(index >= sblock->page_count);
1429		BUG_ON(!sblock->pagev[index].page);
1430		page = sblock->pagev[index].page;
1431		mapped_buffer = kmap_atomic(page);
1432		mapped_size = PAGE_SIZE;
1433		p = mapped_buffer;
1434	}
1435
1436	btrfs_csum_final(crc, calculated_csum);
1437	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1438		++fail_cor;
1439
1440	if (fail_cor + fail_gen) {
1441		/*
1442		 * if we find an error in a super block, we just report it.
1443		 * They will get written with the next transaction commit
1444		 * anyway
1445		 */
1446		spin_lock(&sctx->stat_lock);
1447		++sctx->stat.super_errors;
1448		spin_unlock(&sctx->stat_lock);
1449		if (fail_cor)
1450			btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev,
1451				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1452		else
1453			btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev,
1454				BTRFS_DEV_STAT_GENERATION_ERRS);
1455	}
1456
1457	return fail_cor + fail_gen;
1458}
1459
1460static void scrub_block_get(struct scrub_block *sblock)
1461{
1462	atomic_inc(&sblock->ref_count);
1463}
1464
1465static void scrub_block_put(struct scrub_block *sblock)
1466{
1467	if (atomic_dec_and_test(&sblock->ref_count)) {
1468		int i;
1469
1470		for (i = 0; i < sblock->page_count; i++)
1471			if (sblock->pagev[i].page)
1472				__free_page(sblock->pagev[i].page);
1473		kfree(sblock);
1474	}
1475}
1476
1477static void scrub_submit(struct scrub_ctx *sctx)
1478{
1479	struct scrub_bio *sbio;
1480
1481	if (sctx->curr == -1)
1482		return;
1483
1484	sbio = sctx->bios[sctx->curr];
1485	sctx->curr = -1;
1486	atomic_inc(&sctx->in_flight);
1487
1488	btrfsic_submit_bio(READ, sbio->bio);
1489}
1490
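/*
 * queue one page for reading: append it to the bio currently being filled as
 * long as it is physically and logically contiguous with the previous page
 * and targets the same device; otherwise submit the current bio and start a
 * fresh one. A full bio (pages_per_bio pages) is submitted immediately.
 */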
1491static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
1492				 struct scrub_page *spage)
1493{
1494	struct scrub_block *sblock = spage->sblock;
1495	struct scrub_bio *sbio;
1496	int ret;
1497
1498again:
1499	/*
1500	 * grab a fresh bio or wait for one to become available
1501	 */
1502	while (sctx->curr == -1) {
1503		spin_lock(&sctx->list_lock);
1504		sctx->curr = sctx->first_free;
1505		if (sctx->curr != -1) {
1506			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1507			sctx->bios[sctx->curr]->next_free = -1;
1508			sctx->bios[sctx->curr]->page_count = 0;
1509			spin_unlock(&sctx->list_lock);
1510		} else {
1511			spin_unlock(&sctx->list_lock);
1512			wait_event(sctx->list_wait, sctx->first_free != -1);
1513		}
1514	}
1515	sbio = sctx->bios[sctx->curr];
1516	if (sbio->page_count == 0) {
1517		struct bio *bio;
1518
1519		sbio->physical = spage->physical;
1520		sbio->logical = spage->logical;
1521		sbio->dev = spage->dev;
1522		bio = sbio->bio;
1523		if (!bio) {
1524			bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio);
1525			if (!bio)
1526				return -ENOMEM;
1527			sbio->bio = bio;
1528		}
1529
1530		bio->bi_private = sbio;
1531		bio->bi_end_io = scrub_bio_end_io;
1532		bio->bi_bdev = sbio->dev->bdev;
1533		bio->bi_sector = sbio->physical >> 9;
1534		sbio->err = 0;
1535	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1536		   spage->physical ||
1537		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1538		   spage->logical ||
1539		   sbio->dev != spage->dev) {
1540		scrub_submit(sctx);
1541		goto again;
1542	}
1543
1544	sbio->pagev[sbio->page_count] = spage;
1545	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1546	if (ret != PAGE_SIZE) {
1547		if (sbio->page_count < 1) {
1548			bio_put(sbio->bio);
1549			sbio->bio = NULL;
1550			return -EIO;
1551		}
1552		scrub_submit(sctx);
1553		goto again;
1554	}
1555
1556	scrub_block_get(sblock); /* one for the added page */
1557	atomic_inc(&sblock->outstanding_pages);
1558	sbio->page_count++;
1559	if (sbio->page_count == sctx->pages_per_bio)
1560		scrub_submit(sctx);
1561
1562	return 0;
1563}
1564
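/*
 * split the range [logical, logical + len) into PAGE_SIZE pieces, collect
 * them in one scrub_block and queue every page for reading. With force set,
 * the bio that is currently being filled is submitted right away.
 */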
1565static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1566		       u64 physical, struct btrfs_device *dev, u64 flags,
1567		       u64 gen, int mirror_num, u8 *csum, int force)
1568{
1569	struct scrub_block *sblock;
1570	int index;
1571
1572	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1573	if (!sblock) {
1574		spin_lock(&sctx->stat_lock);
1575		sctx->stat.malloc_errors++;
1576		spin_unlock(&sctx->stat_lock);
1577		return -ENOMEM;
1578	}
1579
1580	/* one ref inside this function, plus one for each page later on */
1581	atomic_set(&sblock->ref_count, 1);
1582	sblock->sctx = sctx;
1583	sblock->no_io_error_seen = 1;
1584
1585	for (index = 0; len > 0; index++) {
1586		struct scrub_page *spage = sblock->pagev + index;
1587		u64 l = min_t(u64, len, PAGE_SIZE);
1588
1589		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1590		spage->page = alloc_page(GFP_NOFS);
1591		if (!spage->page) {
1592			spin_lock(&sctx->stat_lock);
1593			sctx->stat.malloc_errors++;
1594			spin_unlock(&sctx->stat_lock);
1595			while (index > 0) {
1596				index--;
1597				__free_page(sblock->pagev[index].page);
1598			}
1599			kfree(sblock);
1600			return -ENOMEM;
1601		}
1602		spage->sblock = sblock;
1603		spage->dev = dev;
1604		spage->flags = flags;
1605		spage->generation = gen;
1606		spage->logical = logical;
1607		spage->physical = physical;
1608		spage->mirror_num = mirror_num;
1609		if (csum) {
1610			spage->have_csum = 1;
1611			memcpy(spage->csum, csum, sctx->csum_size);
1612		} else {
1613			spage->have_csum = 0;
1614		}
1615		sblock->page_count++;
1616		len -= l;
1617		logical += l;
1618		physical += l;
1619	}
1620
1621	BUG_ON(sblock->page_count == 0);
1622	for (index = 0; index < sblock->page_count; index++) {
1623		struct scrub_page *spage = sblock->pagev + index;
1624		int ret;
1625
1626		ret = scrub_add_page_to_bio(sctx, spage);
1627		if (ret) {
1628			scrub_block_put(sblock);
1629			return ret;
1630		}
1631	}
1632
1633	if (force)
1634		scrub_submit(sctx);
1635
1636	/* last one frees, either here or in bio completion for last page */
1637	scrub_block_put(sblock);
1638	return 0;
1639}
1640
1641static void scrub_bio_end_io(struct bio *bio, int err)
1642{
1643	struct scrub_bio *sbio = bio->bi_private;
1644	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1645
1646	sbio->err = err;
1647	sbio->bio = bio;
1648
1649	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
1650}
1651
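/*
 * bio completion may run in interrupt context, so scrub_bio_end_io only
 * queues this worker; checksum verification and possible repair happen here,
 * where it is safe to block.
 */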
1652static void scrub_bio_end_io_worker(struct btrfs_work *work)
1653{
1654	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1655	struct scrub_ctx *sctx = sbio->sctx;
1656	int i;
1657
1658	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
1659	if (sbio->err) {
1660		for (i = 0; i < sbio->page_count; i++) {
1661			struct scrub_page *spage = sbio->pagev[i];
1662
1663			spage->io_error = 1;
1664			spage->sblock->no_io_error_seen = 0;
1665		}
1666	}
1667
1668	/* now complete the scrub_block items that have all pages completed */
1669	for (i = 0; i < sbio->page_count; i++) {
1670		struct scrub_page *spage = sbio->pagev[i];
1671		struct scrub_block *sblock = spage->sblock;
1672
1673		if (atomic_dec_and_test(&sblock->outstanding_pages))
1674			scrub_block_complete(sblock);
1675		scrub_block_put(sblock);
1676	}
1677
1678	bio_put(sbio->bio);
1679	sbio->bio = NULL;
1680	spin_lock(&sctx->list_lock);
1681	sbio->next_free = sctx->first_free;
1682	sctx->first_free = sbio->index;
1683	spin_unlock(&sctx->list_lock);
1684	atomic_dec(&sctx->in_flight);
1685	wake_up(&sctx->list_wait);
1686}
1687
1688static void scrub_block_complete(struct scrub_block *sblock)
1689{
1690	if (!sblock->no_io_error_seen)
1691		scrub_handle_errored_block(sblock);
1692	else
1693		scrub_checksum(sblock);
1694}
1695
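/*
 * look up the data checksum for the sector at @logical in the pre-collected
 * csum_list (kept in logical order). Sums that end before @logical are
 * discarded on the way. Returns 1 and copies the checksum if one is found,
 * 0 otherwise.
 */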
1696static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1697			   u8 *csum)
1698{
1699	struct btrfs_ordered_sum *sum = NULL;
1700	int ret = 0;
1701	unsigned long i;
1702	unsigned long num_sectors;
1703
1704	while (!list_empty(&sctx->csum_list)) {
1705		sum = list_first_entry(&sctx->csum_list,
1706				       struct btrfs_ordered_sum, list);
1707		if (sum->bytenr > logical)
1708			return 0;
1709		if (sum->bytenr + sum->len > logical)
1710			break;
1711
1712		++sctx->stat.csum_discards;
1713		list_del(&sum->list);
1714		kfree(sum);
1715		sum = NULL;
1716	}
1717	if (!sum)
1718		return 0;
1719
1720	num_sectors = sum->len / sctx->sectorsize;
1721	for (i = 0; i < num_sectors; ++i) {
1722		if (sum->sums[i].bytenr == logical) {
1723			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1724			ret = 1;
1725			break;
1726		}
1727	}
1728	if (ret && i == num_sectors - 1) {
1729		list_del(&sum->list);
1730		kfree(sum);
1731	}
1732	return ret;
1733}
1734
1735/* scrub extent tries to collect up to 64 kB for each bio */
1736static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1737			u64 physical, struct btrfs_device *dev, u64 flags,
1738			u64 gen, int mirror_num)
1739{
1740	int ret;
1741	u8 csum[BTRFS_CSUM_SIZE];
1742	u32 blocksize;
1743
1744	if (flags & BTRFS_EXTENT_FLAG_DATA) {
1745		blocksize = sctx->sectorsize;
1746		spin_lock(&sctx->stat_lock);
1747		sctx->stat.data_extents_scrubbed++;
1748		sctx->stat.data_bytes_scrubbed += len;
1749		spin_unlock(&sctx->stat_lock);
1750	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1751		BUG_ON(sctx->nodesize != sctx->leafsize);
1752		blocksize = sctx->nodesize;
1753		spin_lock(&sctx->stat_lock);
1754		sctx->stat.tree_extents_scrubbed++;
1755		sctx->stat.tree_bytes_scrubbed += len;
1756		spin_unlock(&sctx->stat_lock);
1757	} else {
1758		blocksize = sctx->sectorsize;
1759		BUG_ON(1);
1760	}
1761
1762	while (len) {
1763		u64 l = min_t(u64, len, blocksize);
1764		int have_csum = 0;
1765
1766		if (flags & BTRFS_EXTENT_FLAG_DATA) {
1767			/* push csums to sbio */
1768			have_csum = scrub_find_csum(sctx, logical, l, csum);
1769			if (have_csum == 0)
1770				++sctx->stat.no_csum;
1771		}
1772		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1773				  mirror_num, have_csum ? csum : NULL, 0);
1774		if (ret)
1775			return ret;
1776		len -= l;
1777		logical += l;
1778		physical += l;
1779	}
1780	return 0;
1781}
1782
1783static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1784					   struct map_lookup *map,
1785					   struct btrfs_device *scrub_dev,
1786					   int num, u64 base, u64 length)
1787{
1788	struct btrfs_path *path;
1789	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1790	struct btrfs_root *root = fs_info->extent_root;
1791	struct btrfs_root *csum_root = fs_info->csum_root;
1792	struct btrfs_extent_item *extent;
1793	struct blk_plug plug;
1794	u64 flags;
1795	int ret;
1796	int slot;
1797	int i;
1798	u64 nstripes;
1799	struct extent_buffer *l;
1800	struct btrfs_key key;
1801	u64 physical;
1802	u64 logical;
1803	u64 generation;
1804	int mirror_num;
1805	struct reada_control *reada1;
1806	struct reada_control *reada2;
1807	struct btrfs_key key_start;
1808	struct btrfs_key key_end;
1809	u64 increment = map->stripe_len;
1810	u64 offset;
1811
1812	nstripes = length;
1813	offset = 0;
1814	do_div(nstripes, map->stripe_len);
1815	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1816		offset = map->stripe_len * num;
1817		increment = map->stripe_len * map->num_stripes;
1818		mirror_num = 1;
1819	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1820		int factor = map->num_stripes / map->sub_stripes;
1821		offset = map->stripe_len * (num / map->sub_stripes);
1822		increment = map->stripe_len * factor;
1823		mirror_num = num % map->sub_stripes + 1;
1824	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1825		increment = map->stripe_len;
1826		mirror_num = num % map->num_stripes + 1;
1827	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
1828		increment = map->stripe_len;
1829		mirror_num = num % map->num_stripes + 1;
1830	} else {
1831		increment = map->stripe_len;
1832		mirror_num = 1;
1833	}
1834
1835	path = btrfs_alloc_path();
1836	if (!path)
1837		return -ENOMEM;
1838
1839	/*
1840	 * work on commit root. The related disk blocks are static as
1841	 * long as COW is applied. This means, it is save to rewrite
1842	 * long as COW is applied. This means it is safe to rewrite
1843	 */
1844	path->search_commit_root = 1;
1845	path->skip_locking = 1;
1846
1847	/*
1848	 * trigger the readahead for the extent tree and the csum tree and
1849	 * wait for completion. During readahead, the scrub is officially
1850	 * paused so that it does not hold off transaction commits
1851	 */
1852	logical = base + offset;
1853
1854	wait_event(sctx->list_wait,
1855		   atomic_read(&sctx->in_flight) == 0);
1856	atomic_inc(&fs_info->scrubs_paused);
1857	wake_up(&fs_info->scrub_pause_wait);
1858
1859	/* FIXME it might be better to start readahead at commit root */
1860	key_start.objectid = logical;
1861	key_start.type = BTRFS_EXTENT_ITEM_KEY;
1862	key_start.offset = (u64)0;
1863	key_end.objectid = base + offset + nstripes * increment;
1864	key_end.type = BTRFS_EXTENT_ITEM_KEY;
1865	key_end.offset = (u64)0;
1866	reada1 = btrfs_reada_add(root, &key_start, &key_end);
1867
1868	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1869	key_start.type = BTRFS_EXTENT_CSUM_KEY;
1870	key_start.offset = logical;
1871	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1872	key_end.type = BTRFS_EXTENT_CSUM_KEY;
1873	key_end.offset = base + offset + nstripes * increment;
1874	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1875
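	/*
	 * readahead is only an optimization, so failures from
	 * btrfs_reada_add() are ignored; only successful requests are
	 * waited for below
	 */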
1876	if (!IS_ERR(reada1))
1877		btrfs_reada_wait(reada1);
1878	if (!IS_ERR(reada2))
1879		btrfs_reada_wait(reada2);
1880
1881	mutex_lock(&fs_info->scrub_lock);
1882	while (atomic_read(&fs_info->scrub_pause_req)) {
1883		mutex_unlock(&fs_info->scrub_lock);
1884		wait_event(fs_info->scrub_pause_wait,
1885		   atomic_read(&fs_info->scrub_pause_req) == 0);
1886		mutex_lock(&fs_info->scrub_lock);
1887	}
1888	atomic_dec(&fs_info->scrubs_paused);
1889	mutex_unlock(&fs_info->scrub_lock);
1890	wake_up(&fs_info->scrub_pause_wait);
1891
1892	/*
1893	 * collect all data csums for the stripe to avoid seeking during
1894	 * the scrub. This might currently (crc32) end up being about 1MB
1895	 */
1896	blk_start_plug(&plug);
1897
1898	/*
1899	 * now find all extents for each stripe and scrub them
1900	 */
1901	logical = base + offset;
1902	physical = map->stripes[num].physical;
1903	ret = 0;
1904	for (i = 0; i < nstripes; ++i) {
1905		/*
1906		 * canceled?
1907		 */
1908		if (atomic_read(&fs_info->scrub_cancel_req) ||
1909		    atomic_read(&sctx->cancel_req)) {
1910			ret = -ECANCELED;
1911			goto out;
1912		}
1913		/*
1914		 * check to see if we have to pause
1915		 */
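		/*
		 * same handshake as before the stripe loop: flush queued
		 * bios, wait for in-flight I/O, mark this scrub paused and
		 * block until the pause request is dropped
		 */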
1916		if (atomic_read(&fs_info->scrub_pause_req)) {
1917			/* push queued extents */
1918			scrub_submit(sctx);
1919			wait_event(sctx->list_wait,
1920				   atomic_read(&sctx->in_flight) == 0);
1921			atomic_inc(&fs_info->scrubs_paused);
1922			wake_up(&fs_info->scrub_pause_wait);
1923			mutex_lock(&fs_info->scrub_lock);
1924			while (atomic_read(&fs_info->scrub_pause_req)) {
1925				mutex_unlock(&fs_info->scrub_lock);
1926				wait_event(fs_info->scrub_pause_wait,
1927				   atomic_read(&fs_info->scrub_pause_req) == 0);
1928				mutex_lock(&fs_info->scrub_lock);
1929			}
1930			atomic_dec(&fs_info->scrubs_paused);
1931			mutex_unlock(&fs_info->scrub_lock);
1932			wake_up(&fs_info->scrub_pause_wait);
1933		}
1934
1935		ret = btrfs_lookup_csums_range(csum_root, logical,
1936					       logical + map->stripe_len - 1,
1937					       &sctx->csum_list, 1);
1938		if (ret)
1939			goto out;
1940
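		/*
		 * btrfs_lookup_csums_range() queued all data csums for this
		 * stripe on sctx->csum_list; scrub_find_csum() consumes them
		 * in order while the extents below are scrubbed
		 */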
1941		key.objectid = logical;
1942		key.type = BTRFS_EXTENT_ITEM_KEY;
1943		key.offset = (u64)0;
1944
1945		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1946		if (ret < 0)
1947			goto out;
1948		if (ret > 0) {
1949			ret = btrfs_previous_item(root, path, 0,
1950						  BTRFS_EXTENT_ITEM_KEY);
1951			if (ret < 0)
1952				goto out;
1953			if (ret > 0) {
1954				/* there's no smaller item, so stick with the
1955				 * larger one */
1956				btrfs_release_path(path);
1957				ret = btrfs_search_slot(NULL, root, &key,
1958							path, 0, 0);
1959				if (ret < 0)
1960					goto out;
1961			}
1962		}
1963
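		/*
		 * walk the extent items that overlap
		 * [logical, logical + stripe_len) and scrub each of them,
		 * trimmed to the current stripe
		 */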
1964		while (1) {
1965			l = path->nodes[0];
1966			slot = path->slots[0];
1967			if (slot >= btrfs_header_nritems(l)) {
1968				ret = btrfs_next_leaf(root, path);
1969				if (ret == 0)
1970					continue;
1971				if (ret < 0)
1972					goto out;
1973
1974				break;
1975			}
1976			btrfs_item_key_to_cpu(l, &key, slot);
1977
1978			if (key.objectid + key.offset <= logical)
1979				goto next;
1980
1981			if (key.objectid >= logical + map->stripe_len)
1982				break;
1983
1984			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
1985				goto next;
1986
1987			extent = btrfs_item_ptr(l, slot,
1988						struct btrfs_extent_item);
1989			flags = btrfs_extent_flags(l, extent);
1990			generation = btrfs_extent_generation(l, extent);
1991
1992			if (key.objectid < logical &&
1993			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
1994				printk(KERN_ERR
1995				       "btrfs scrub: tree block %llu spanning "
1996				       "stripes, ignored. logical=%llu\n",
1997				       (unsigned long long)key.objectid,
1998				       (unsigned long long)logical);
1999				goto next;
2000			}
2001
2002			/*
2003			 * trim extent to this stripe
2004			 */
2005			if (key.objectid < logical) {
2006				key.offset -= logical - key.objectid;
2007				key.objectid = logical;
2008			}
2009			if (key.objectid + key.offset >
2010			    logical + map->stripe_len) {
2011				key.offset = logical + map->stripe_len -
2012					     key.objectid;
2013			}
2014
2015			ret = scrub_extent(sctx, key.objectid, key.offset,
2016					   key.objectid - logical + physical,
2017					   scrub_dev, flags, generation,
2018					   mirror_num);
2019			if (ret)
2020				goto out;
2021
2022next:
2023			path->slots[0]++;
2024		}
2025		btrfs_release_path(path);
2026		logical += increment;
2027		physical += map->stripe_len;
2028		spin_lock(&sctx->stat_lock);
2029		sctx->stat.last_physical = physical;
2030		spin_unlock(&sctx->stat_lock);
2031	}
2032	/* push queued extents */
2033	scrub_submit(sctx);
2034
2035out:
2036	blk_finish_plug(&plug);
2037	btrfs_free_path(path);
2038	return ret < 0 ? ret : 0;
2039}
2040
2041static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2042					  struct btrfs_device *scrub_dev,
2043					  u64 chunk_tree, u64 chunk_objectid,
2044					  u64 chunk_offset, u64 length,
2045					  u64 dev_offset)
2046{
2047	struct btrfs_mapping_tree *map_tree =
2048		&sctx->dev_root->fs_info->mapping_tree;
2049	struct map_lookup *map;
2050	struct extent_map *em;
2051	int i;
2052	int ret = -EINVAL;
2053
2054	read_lock(&map_tree->map_tree.lock);
2055	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2056	read_unlock(&map_tree->map_tree.lock);
2057
2058	if (!em)
2059		return -EINVAL;
2060
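	/*
	 * for chunk mappings, em->bdev carries the struct map_lookup
	 * rather than a block device
	 */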
2061	map = (struct map_lookup *)em->bdev;
2062	if (em->start != chunk_offset)
2063		goto out;
2064
2065	if (em->len < length)
2066		goto out;
2067
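	/*
	 * scrub the stripe of this chunk that lives on scrub_dev at
	 * dev_offset; the remaining stripes sit on other devices or at
	 * other offsets
	 */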
2068	for (i = 0; i < map->num_stripes; ++i) {
2069		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2070		    map->stripes[i].physical == dev_offset) {
2071			ret = scrub_stripe(sctx, map, scrub_dev, i,
2072					   chunk_offset, length);
2073			if (ret)
2074				goto out;
2075		}
2076	}
2077out:
2078	free_extent_map(em);
2079
2080	return ret;
2081}
2082
2083static noinline_for_stack
2084int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2085			   struct btrfs_device *scrub_dev, u64 start, u64 end)
2086{
2087	struct btrfs_dev_extent *dev_extent = NULL;
2088	struct btrfs_path *path;
2089	struct btrfs_root *root = sctx->dev_root;
2090	struct btrfs_fs_info *fs_info = root->fs_info;
2091	u64 length;
2092	u64 chunk_tree;
2093	u64 chunk_objectid;
2094	u64 chunk_offset;
2095	int ret;
2096	int slot;
2097	struct extent_buffer *l;
2098	struct btrfs_key key;
2099	struct btrfs_key found_key;
2100	struct btrfs_block_group_cache *cache;
2101
2102	path = btrfs_alloc_path();
2103	if (!path)
2104		return -ENOMEM;
2105
2106	path->reada = 2;
2107	path->search_commit_root = 1;
2108	path->skip_locking = 1;
2109
2110	key.objectid = scrub_dev->devid;
2111	key.offset = 0ull;
2112	key.type = BTRFS_DEV_EXTENT_KEY;
2113
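	/*
	 * dev extent items are keyed by (devid, DEV_EXTENT, physical byte
	 * offset on the device), so this walk visits the device's allocated
	 * extents in increasing physical order
	 */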
2114	while (1) {
2115		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2116		if (ret < 0)
2117			break;
2118		if (ret > 0) {
2119			if (path->slots[0] >=
2120			    btrfs_header_nritems(path->nodes[0])) {
2121				ret = btrfs_next_leaf(root, path);
2122				if (ret)
2123					break;
2124			}
2125		}
2126
2127		l = path->nodes[0];
2128		slot = path->slots[0];
2129
2130		btrfs_item_key_to_cpu(l, &found_key, slot);
2131
2132		if (found_key.objectid != scrub_dev->devid)
2133			break;
2134
2135		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2136			break;
2137
2138		if (found_key.offset >= end)
2139			break;
2140
2141		if (found_key.offset < key.offset)
2142			break;
2143
2144		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2145		length = btrfs_dev_extent_length(l, dev_extent);
2146
2147		if (found_key.offset + length <= start) {
2148			key.offset = found_key.offset + length;
2149			btrfs_release_path(path);
2150			continue;
2151		}
2152
2153		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2154		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2155		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2156
2157		/*
2158		 * get a reference on the corresponding block group to prevent
2159		 * the chunk from going away while we scrub it
2160		 */
2161		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2162		if (!cache) {
2163			ret = -ENOENT;
2164			break;
2165		}
2166		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2167				  chunk_offset, length, found_key.offset);
2168		btrfs_put_block_group(cache);
2169		if (ret)
2170			break;
2171
2172		key.offset = found_key.offset + length;
2173		btrfs_release_path(path);
2174	}
2175
2176	btrfs_free_path(path);
2177
2178	/*
2179	 * ret can still be 1 from search_slot or next_leaf,
2180	 * that's not an error
2181	 */
2182	return ret < 0 ? ret : 0;
2183}
2184
2185static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2186					   struct btrfs_device *scrub_dev)
2187{
2188	int	i;
2189	u64	bytenr;
2190	u64	gen;
2191	int	ret;
2192	struct btrfs_root *root = sctx->dev_root;
2193
2194	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2195		return -EIO;
2196
2197	gen = root->fs_info->last_trans_committed;
2198
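	/*
	 * the superblock copies live at fixed per-device offsets, so bytenr
	 * serves as both the logical and the physical address; there is no
	 * data checksum for them (csum == NULL)
	 */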
2199	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2200		bytenr = btrfs_sb_offset(i);
2201		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2202			break;
2203
2204		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2205				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2206				  NULL, 1);
2207		if (ret)
2208			return ret;
2209	}
2210	wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0);
2211
2212	return 0;
2213}
2214
2215/*
2216 * get a reference count on fs_info->scrub_workers. Start the workers if necessary
2217 */
2218static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
2219{
2220	struct btrfs_fs_info *fs_info = root->fs_info;
2221	int ret = 0;
2222
2223	mutex_lock(&fs_info->scrub_lock);
2224	if (fs_info->scrub_workers_refcnt == 0) {
2225		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2226			   fs_info->thread_pool_size, &fs_info->generic_worker);
2227		fs_info->scrub_workers.idle_thresh = 4;
2228		ret = btrfs_start_workers(&fs_info->scrub_workers);
2229		if (ret)
2230			goto out;
2231	}
2232	++fs_info->scrub_workers_refcnt;
2233out:
2234	mutex_unlock(&fs_info->scrub_lock);
2235
2236	return ret;
2237}
2238
2239static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
2240{
2241	struct btrfs_fs_info *fs_info = root->fs_info;
2242
2243	mutex_lock(&fs_info->scrub_lock);
2244	if (--fs_info->scrub_workers_refcnt == 0)
2245		btrfs_stop_workers(&fs_info->scrub_workers);
2246	WARN_ON(fs_info->scrub_workers_refcnt < 0);
2247	mutex_unlock(&fs_info->scrub_lock);
2248}
2249
2251int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2252		    struct btrfs_scrub_progress *progress, int readonly)
2253{
2254	struct scrub_ctx *sctx;
2255	struct btrfs_fs_info *fs_info = root->fs_info;
2256	int ret;
2257	struct btrfs_device *dev;
2258
2259	if (btrfs_fs_closing(root->fs_info))
2260		return -EINVAL;
2261
2262	/*
2263	 * check some assumptions
2264	 */
2265	if (root->nodesize != root->leafsize) {
2266		printk(KERN_ERR
2267		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2268		       root->nodesize, root->leafsize);
2269		return -EINVAL;
2270	}
2271
2272	if (root->nodesize > BTRFS_STRIPE_LEN) {
2273		/*
2274		 * as currently implemented, scrub cannot calculate the checksum
2275		 * for nodes larger than BTRFS_STRIPE_LEN. Do not handle this
2276		 * situation at all because it won't ever happen.
2277		 */
2278		printk(KERN_ERR
2279		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2280		       root->nodesize, BTRFS_STRIPE_LEN);
2281		return -EINVAL;
2282	}
2283
2284	if (root->sectorsize != PAGE_SIZE) {
2285		/* not supported for data w/o checksums */
2286		printk(KERN_ERR
2287		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %llu) fails\n",
2288		       root->sectorsize, (unsigned long long)PAGE_SIZE);
2289		return -EINVAL;
2290	}
2291
2292	ret = scrub_workers_get(root);
2293	if (ret)
2294		return ret;
2295
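	/*
	 * lock order: device_list_mutex outside scrub_lock. Holding both
	 * keeps the device from vanishing while the scrub context is
	 * attached and ensures at most one scrub per device (-EINPROGRESS
	 * otherwise).
	 */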
2296	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2297	dev = btrfs_find_device(root, devid, NULL, NULL);
2298	if (!dev || dev->missing) {
2299		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2300		scrub_workers_put(root);
2301		return -ENODEV;
2302	}
2303	mutex_lock(&fs_info->scrub_lock);
2304
2305	if (!dev->in_fs_metadata) {
2306		mutex_unlock(&fs_info->scrub_lock);
2307		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2308		scrub_workers_put(root);
2309		return -ENODEV;
2310	}
2311
2312	if (dev->scrub_device) {
2313		mutex_unlock(&fs_info->scrub_lock);
2314		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2315		scrub_workers_put(root);
2316		return -EINPROGRESS;
2317	}
2318	sctx = scrub_setup_ctx(dev);
2319	if (IS_ERR(sctx)) {
2320		mutex_unlock(&fs_info->scrub_lock);
2321		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2322		scrub_workers_put(root);
2323		return PTR_ERR(sctx);
2324	}
2325	sctx->readonly = readonly;
2326	dev->scrub_device = sctx;
2327
2328	atomic_inc(&fs_info->scrubs_running);
2329	mutex_unlock(&fs_info->scrub_lock);
2330	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2331
2332	down_read(&fs_info->scrub_super_lock);
2333	ret = scrub_supers(sctx, dev);
2334	up_read(&fs_info->scrub_super_lock);
2335
2336	if (!ret)
2337		ret = scrub_enumerate_chunks(sctx, dev, start, end);
2338
2339	wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0);
2340	atomic_dec(&fs_info->scrubs_running);
2341	wake_up(&fs_info->scrub_pause_wait);
2342
2343	wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0);
2344
2345	if (progress)
2346		memcpy(progress, &sctx->stat, sizeof(*progress));
2347
2348	mutex_lock(&fs_info->scrub_lock);
2349	dev->scrub_device = NULL;
2350	mutex_unlock(&fs_info->scrub_lock);
2351
2352	scrub_free_ctx(sctx);
2353	scrub_workers_put(root);
2354
2355	return ret;
2356}
2357
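/*
 * request a pause and wait until every running scrub has acknowledged it
 * (scrubs_paused == scrubs_running); the counterpart checks sit in
 * scrub_stripe()
 */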
2358void btrfs_scrub_pause(struct btrfs_root *root)
2359{
2360	struct btrfs_fs_info *fs_info = root->fs_info;
2361
2362	mutex_lock(&fs_info->scrub_lock);
2363	atomic_inc(&fs_info->scrub_pause_req);
2364	while (atomic_read(&fs_info->scrubs_paused) !=
2365	       atomic_read(&fs_info->scrubs_running)) {
2366		mutex_unlock(&fs_info->scrub_lock);
2367		wait_event(fs_info->scrub_pause_wait,
2368			   atomic_read(&fs_info->scrubs_paused) ==
2369			   atomic_read(&fs_info->scrubs_running));
2370		mutex_lock(&fs_info->scrub_lock);
2371	}
2372	mutex_unlock(&fs_info->scrub_lock);
2373}
2374
2375void btrfs_scrub_continue(struct btrfs_root *root)
2376{
2377	struct btrfs_fs_info *fs_info = root->fs_info;
2378
2379	atomic_dec(&fs_info->scrub_pause_req);
2380	wake_up(&fs_info->scrub_pause_wait);
2381}
2382
2383void btrfs_scrub_pause_super(struct btrfs_root *root)
2384{
2385	down_write(&root->fs_info->scrub_super_lock);
2386}
2387
2388void btrfs_scrub_continue_super(struct btrfs_root *root)
2389{
2390	up_write(&root->fs_info->scrub_super_lock);
2391}
2392
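/*
 * request cancellation of all running scrubs and wait for them to finish;
 * returns -ENOTCONN if no scrub is running
 */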
2393int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2394{
2396	mutex_lock(&fs_info->scrub_lock);
2397	if (!atomic_read(&fs_info->scrubs_running)) {
2398		mutex_unlock(&fs_info->scrub_lock);
2399		return -ENOTCONN;
2400	}
2401
2402	atomic_inc(&fs_info->scrub_cancel_req);
2403	while (atomic_read(&fs_info->scrubs_running)) {
2404		mutex_unlock(&fs_info->scrub_lock);
2405		wait_event(fs_info->scrub_pause_wait,
2406			   atomic_read(&fs_info->scrubs_running) == 0);
2407		mutex_lock(&fs_info->scrub_lock);
2408	}
2409	atomic_dec(&fs_info->scrub_cancel_req);
2410	mutex_unlock(&fs_info->scrub_lock);
2411
2412	return 0;
2413}
2414
2415int btrfs_scrub_cancel(struct btrfs_root *root)
2416{
2417	return __btrfs_scrub_cancel(root->fs_info);
2418}
2419
2420int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2421{
2422	struct btrfs_fs_info *fs_info = root->fs_info;
2423	struct scrub_ctx *sctx;
2424
2425	mutex_lock(&fs_info->scrub_lock);
2426	sctx = dev->scrub_device;
2427	if (!sctx) {
2428		mutex_unlock(&fs_info->scrub_lock);
2429		return -ENOTCONN;
2430	}
2431	atomic_inc(&sctx->cancel_req);
2432	while (dev->scrub_device) {
2433		mutex_unlock(&fs_info->scrub_lock);
2434		wait_event(fs_info->scrub_pause_wait,
2435			   dev->scrub_device == NULL);
2436		mutex_lock(&fs_info->scrub_lock);
2437	}
2438	mutex_unlock(&fs_info->scrub_lock);
2439
2440	return 0;
2441}
2442
2443int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2444{
2445	struct btrfs_fs_info *fs_info = root->fs_info;
2446	struct btrfs_device *dev;
2447	int ret;
2448
2449	/*
2450	 * we have to hold the device_list_mutex here so the device
2451	 * does not go away in cancel_dev. FIXME: find a better solution
2452	 */
2453	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2454	dev = btrfs_find_device(root, devid, NULL, NULL);
2455	if (!dev) {
2456		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2457		return -ENODEV;
2458	}
2459	ret = btrfs_scrub_cancel_dev(root, dev);
2460	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2461
2462	return ret;
2463}
2464
2465int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2466			 struct btrfs_scrub_progress *progress)
2467{
2468	struct btrfs_device *dev;
2469	struct scrub_ctx *sctx = NULL;
2470
2471	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2472	dev = btrfs_find_device(root, devid, NULL, NULL);
2473	if (dev)
2474		sctx = dev->scrub_device;
2475	if (sctx)
2476		memcpy(progress, &sctx->stat, sizeof(*progress));
2477	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2478
2479	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
2480}
2481