scrub.c revision aa1b8cd409f05e1489ec77ff219eff6ed4b801b8
1/*
2 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
21#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
28#include "check-integrity.h"
29#include "rcu-string.h"
30
31/*
32 * This is only the first step towards a full-featured scrub. It reads all
33 * extents and super blocks and verifies the checksums. In case a bad checksum
34 * is found or the extent cannot be read, good data will be written back if
35 * any can be found.
36 *
37 * Future enhancements:
38 *  - In case an unrepairable extent is encountered, track which files are
39 *    affected and report them
40 *  - track and record media errors, throw out bad devices
41 *  - add a mode to also read unallocated space
42 */
43
44struct scrub_block;
45struct scrub_ctx;
46
47#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
48#define SCRUB_BIOS_PER_CTX	16	/* 1 MB per device in flight */
49
50/*
51 * the following value times PAGE_SIZE needs to be large enough to match the
52 * largest node/leaf/sector size that shall be supported.
53 * Values larger than BTRFS_STRIPE_LEN are not supported.
54 */
55#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
56
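/*
 * One page worth of data read during a scrub: which device and
 * physical/logical address it came from, the extent flags and
 * generation, and the expected checksum (when one is available).
 */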
57struct scrub_page {
58	struct scrub_block	*sblock;
59	struct page		*page;
60	struct btrfs_device	*dev;
61	u64			flags;  /* extent flags */
62	u64			generation;
63	u64			logical;
64	u64			physical;
65	atomic_t		ref_count;
66	struct {
67		unsigned int	mirror_num:8;
68		unsigned int	have_csum:1;
69		unsigned int	io_error:1;
70	};
71	u8			csum[BTRFS_CSUM_SIZE];
72};
73
74struct scrub_bio {
75	int			index;
76	struct scrub_ctx	*sctx;
77	struct btrfs_device	*dev;
78	struct bio		*bio;
79	int			err;
80	u64			logical;
81	u64			physical;
82	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
83	int			page_count;
84	int			next_free;
85	struct btrfs_work	work;
86};
87
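/*
 * A scrub_block groups the pages of one checksummed unit (a data
 * sector, tree node/leaf or super block copy) so that it can be
 * verified and, if necessary, repaired as a whole.
 */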
88struct scrub_block {
89	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
90	int			page_count;
91	atomic_t		outstanding_pages;
92	atomic_t		ref_count; /* free mem on transition to zero */
93	struct scrub_ctx	*sctx;
94	struct {
95		unsigned int	header_error:1;
96		unsigned int	checksum_error:1;
97		unsigned int	no_io_error_seen:1;
98		unsigned int	generation_error:1; /* also sets header_error */
99	};
100};
101
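/*
 * Per-device scrub state: the pool of preallocated scrub_bios, the
 * list of checksums for the range currently being scrubbed, and the
 * accumulated statistics.
 */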
102struct scrub_ctx {
103	struct scrub_bio	*bios[SCRUB_BIOS_PER_CTX];
104	struct btrfs_root	*dev_root;
105	int			first_free;
106	int			curr;
107	atomic_t		bios_in_flight;
108	atomic_t		workers_pending;
109	spinlock_t		list_lock;
110	wait_queue_head_t	list_wait;
111	u16			csum_size;
112	struct list_head	csum_list;
113	atomic_t		cancel_req;
114	int			readonly;
115	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
116	u32			sectorsize;
117	u32			nodesize;
118	u32			leafsize;
119	/*
120	 * statistics
121	 */
122	struct btrfs_scrub_progress stat;
123	spinlock_t		stat_lock;
124};
125
126struct scrub_fixup_nodatasum {
127	struct scrub_ctx	*sctx;
128	struct btrfs_device	*dev;
129	u64			logical;
130	struct btrfs_root	*root;
131	struct btrfs_work	work;
132	int			mirror_num;
133};
134
135struct scrub_warning {
136	struct btrfs_path	*path;
137	u64			extent_item_size;
138	char			*scratch_buf;
139	char			*msg_buf;
140	const char		*errstr;
141	sector_t		sector;
142	u64			logical;
143	struct btrfs_device	*dev;
144	int			msg_bufsize;
145	int			scratch_bufsize;
146};
147
148
149static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
150static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
151static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
152static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
153static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
154static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
155				     struct btrfs_fs_info *fs_info,
156				     u64 length, u64 logical,
157				     struct scrub_block *sblock);
158static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
159				struct scrub_block *sblock, int is_metadata,
160				int have_csum, u8 *csum, u64 generation,
161				u16 csum_size);
162static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
163					 struct scrub_block *sblock,
164					 int is_metadata, int have_csum,
165					 const u8 *csum, u64 generation,
166					 u16 csum_size);
167static void scrub_complete_bio_end_io(struct bio *bio, int err);
168static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
169					     struct scrub_block *sblock_good,
170					     int force_write);
171static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
172					    struct scrub_block *sblock_good,
173					    int page_num, int force_write);
174static int scrub_checksum_data(struct scrub_block *sblock);
175static int scrub_checksum_tree_block(struct scrub_block *sblock);
176static int scrub_checksum_super(struct scrub_block *sblock);
177static void scrub_block_get(struct scrub_block *sblock);
178static void scrub_block_put(struct scrub_block *sblock);
179static void scrub_page_get(struct scrub_page *spage);
180static void scrub_page_put(struct scrub_page *spage);
181static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
182				 struct scrub_page *spage);
183static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
184		       u64 physical, struct btrfs_device *dev, u64 flags,
185		       u64 gen, int mirror_num, u8 *csum, int force);
186static void scrub_bio_end_io(struct bio *bio, int err);
187static void scrub_bio_end_io_worker(struct btrfs_work *work);
188static void scrub_block_complete(struct scrub_block *sblock);
189
190
191static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
192{
193	atomic_inc(&sctx->bios_in_flight);
194}
195
196static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
197{
198	atomic_dec(&sctx->bios_in_flight);
199	wake_up(&sctx->list_wait);
200}
201
202/*
203 * used for workers that require transaction commits (i.e., for the
204 * NOCOW case)
205 */
206static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
207{
208	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
209
210	/*
211	 * increment scrubs_running to prevent cancel requests from
212	 * completing as long as a worker is running. we must also
213	 * increment scrubs_paused to prevent deadlocking on pause
214	 * requests used for transaction commits (as the worker uses a
215	 * transaction context). it is safe to regard the worker
216	 * as paused for all practical purposes. effectively, we only
217	 * prevent cancellation requests from completing.
218	 */
219	mutex_lock(&fs_info->scrub_lock);
220	atomic_inc(&fs_info->scrubs_running);
221	atomic_inc(&fs_info->scrubs_paused);
222	mutex_unlock(&fs_info->scrub_lock);
223	atomic_inc(&sctx->workers_pending);
224}
225
226/* used for workers that require transaction commits */
227static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
228{
229	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
230
231	/*
232	 * see scrub_pending_trans_workers_inc() for why we're pretending
233	 * to be paused in the scrub counters
234	 */
235	mutex_lock(&fs_info->scrub_lock);
236	atomic_dec(&fs_info->scrubs_running);
237	atomic_dec(&fs_info->scrubs_paused);
238	mutex_unlock(&fs_info->scrub_lock);
239	atomic_dec(&sctx->workers_pending);
240	wake_up(&fs_info->scrub_pause_wait);
241	wake_up(&sctx->list_wait);
242}
243
244static void scrub_free_csums(struct scrub_ctx *sctx)
245{
246	while (!list_empty(&sctx->csum_list)) {
247		struct btrfs_ordered_sum *sum;
248		sum = list_first_entry(&sctx->csum_list,
249				       struct btrfs_ordered_sum, list);
250		list_del(&sum->list);
251		kfree(sum);
252	}
253}
254
255static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
256{
257	int i;
258
259	if (!sctx)
260		return;
261
262	/* this can happen when scrub is cancelled */
263	if (sctx->curr != -1) {
264		struct scrub_bio *sbio = sctx->bios[sctx->curr];
265
266		for (i = 0; i < sbio->page_count; i++) {
267			BUG_ON(!sbio->pagev[i]);
268			BUG_ON(!sbio->pagev[i]->page);
269			scrub_block_put(sbio->pagev[i]->sblock);
270		}
271		bio_put(sbio->bio);
272	}
273
274	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
275		struct scrub_bio *sbio = sctx->bios[i];
276
277		if (!sbio)
278			break;
279		kfree(sbio);
280	}
281
282	scrub_free_csums(sctx);
283	kfree(sctx);
284}
285
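/*
 * Allocate and initialize the scrub context for @dev: preallocate
 * SCRUB_BIOS_PER_CTX scrub_bios, link them into the free list and
 * copy the block size parameters from the device's root.
 */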
286static noinline_for_stack
287struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev)
288{
289	struct scrub_ctx *sctx;
290	int		i;
291	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
292	int pages_per_bio;
293
294	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
295			      bio_get_nr_vecs(dev->bdev));
296	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
297	if (!sctx)
298		goto nomem;
299	sctx->pages_per_bio = pages_per_bio;
300	sctx->curr = -1;
301	sctx->dev_root = dev->dev_root;
302	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
303		struct scrub_bio *sbio;
304
305		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
306		if (!sbio)
307			goto nomem;
308		sctx->bios[i] = sbio;
309
310		sbio->index = i;
311		sbio->sctx = sctx;
312		sbio->page_count = 0;
313		sbio->work.func = scrub_bio_end_io_worker;
314
315		if (i != SCRUB_BIOS_PER_CTX - 1)
316			sctx->bios[i]->next_free = i + 1;
317		else
318			sctx->bios[i]->next_free = -1;
319	}
320	sctx->first_free = 0;
321	sctx->nodesize = dev->dev_root->nodesize;
322	sctx->leafsize = dev->dev_root->leafsize;
323	sctx->sectorsize = dev->dev_root->sectorsize;
324	atomic_set(&sctx->bios_in_flight, 0);
325	atomic_set(&sctx->workers_pending, 0);
326	atomic_set(&sctx->cancel_req, 0);
327	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
328	INIT_LIST_HEAD(&sctx->csum_list);
329
330	spin_lock_init(&sctx->list_lock);
331	spin_lock_init(&sctx->stat_lock);
332	init_waitqueue_head(&sctx->list_wait);
333	return sctx;
334
335nomem:
336	scrub_free_ctx(sctx);
337	return ERR_PTR(-ENOMEM);
338}
339
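/*
 * Callback for iterate_extent_inodes(): resolve an inode that
 * references the corrupted extent to its file paths and print one
 * warning line per path.
 */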
340static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
341{
342	u64 isize;
343	u32 nlink;
344	int ret;
345	int i;
346	struct extent_buffer *eb;
347	struct btrfs_inode_item *inode_item;
348	struct scrub_warning *swarn = ctx;
349	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
350	struct inode_fs_paths *ipath = NULL;
351	struct btrfs_root *local_root;
352	struct btrfs_key root_key;
353
354	root_key.objectid = root;
355	root_key.type = BTRFS_ROOT_ITEM_KEY;
356	root_key.offset = (u64)-1;
357	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
358	if (IS_ERR(local_root)) {
359		ret = PTR_ERR(local_root);
360		goto err;
361	}
362
363	ret = inode_item_info(inum, 0, local_root, swarn->path);
364	if (ret) {
365		btrfs_release_path(swarn->path);
366		goto err;
367	}
368
369	eb = swarn->path->nodes[0];
370	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
371					struct btrfs_inode_item);
372	isize = btrfs_inode_size(eb, inode_item);
373	nlink = btrfs_inode_nlink(eb, inode_item);
374	btrfs_release_path(swarn->path);
375
376	ipath = init_ipath(4096, local_root, swarn->path);
377	if (IS_ERR(ipath)) {
378		ret = PTR_ERR(ipath);
379		ipath = NULL;
380		goto err;
381	}
382	ret = paths_from_inode(inum, ipath);
383
384	if (ret < 0)
385		goto err;
386
387	/*
388	 * we deliberately ignore the fact that ipath might have been too small to
389	 * hold all of the paths here
390	 */
391	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
392		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
393			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
394			"length %llu, links %u (path: %s)\n", swarn->errstr,
395			swarn->logical, rcu_str_deref(swarn->dev->name),
396			(unsigned long long)swarn->sector, root, inum, offset,
397			min(isize - offset, (u64)PAGE_SIZE), nlink,
398			(char *)(unsigned long)ipath->fspath->val[i]);
399
400	free_ipath(ipath);
401	return 0;
402
403err:
404	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
405		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
406		"resolving failed with ret=%d\n", swarn->errstr,
407		swarn->logical, rcu_str_deref(swarn->dev->name),
408		(unsigned long long)swarn->sector, root, inum, offset, ret);
409
410	free_ipath(ipath);
411	return 0;
412}
413
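/*
 * Print a warning describing what the corrupted block belongs to:
 * the owning tree and level for metadata, or the referencing files
 * (via scrub_print_warning_inode) for data extents.
 */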
414static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
415{
416	struct btrfs_device *dev;
417	struct btrfs_fs_info *fs_info;
418	struct btrfs_path *path;
419	struct btrfs_key found_key;
420	struct extent_buffer *eb;
421	struct btrfs_extent_item *ei;
422	struct scrub_warning swarn;
423	unsigned long ptr = 0;
424	u64 extent_item_pos;
425	u64 flags = 0;
426	u64 ref_root;
427	u32 item_size;
428	u8 ref_level;
429	const int bufsize = 4096;
430	int ret;
431
432	WARN_ON(sblock->page_count < 1);
433	dev = sblock->pagev[0]->dev;
434	fs_info = sblock->sctx->dev_root->fs_info;
435
436	path = btrfs_alloc_path();
437
438	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
439	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
440	swarn.sector = (sblock->pagev[0]->physical) >> 9;
441	swarn.logical = sblock->pagev[0]->logical;
442	swarn.errstr = errstr;
443	swarn.dev = NULL;
444	swarn.msg_bufsize = bufsize;
445	swarn.scratch_bufsize = bufsize;
446
447	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
448		goto out;
449
450	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
451				  &flags);
452	if (ret < 0)
453		goto out;
454
455	extent_item_pos = swarn.logical - found_key.objectid;
456	swarn.extent_item_size = found_key.offset;
457
458	eb = path->nodes[0];
459	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
460	item_size = btrfs_item_size_nr(eb, path->slots[0]);
461	btrfs_release_path(path);
462
463	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
464		do {
465			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
466							&ref_root, &ref_level);
467			printk_in_rcu(KERN_WARNING
468				"btrfs: %s at logical %llu on dev %s, "
469				"sector %llu: metadata %s (level %d) in tree "
470				"%llu\n", errstr, swarn.logical,
471				rcu_str_deref(dev->name),
472				(unsigned long long)swarn.sector,
473				ref_level ? "node" : "leaf",
474				ret < 0 ? -1 : ref_level,
475				ret < 0 ? -1 : ref_root);
476		} while (ret != 1);
477	} else {
478		swarn.path = path;
479		swarn.dev = dev;
480		iterate_extent_inodes(fs_info, found_key.objectid,
481					extent_item_pos, 1,
482					scrub_print_warning_inode, &swarn);
483	}
484
485out:
486	btrfs_free_path(path);
487	kfree(swarn.scratch_buf);
488	kfree(swarn.msg_buf);
489}
490
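/*
 * Callback used by the nodatasum fixup worker. If a clean, uptodate
 * copy of the page is cached, rewrite the bad sector from it via
 * repair_io_failure(); otherwise read the page through the normal
 * read path from the bad mirror so that the generic error correction
 * repairs it.
 */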
491static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
492{
493	struct page *page = NULL;
494	unsigned long index;
495	struct scrub_fixup_nodatasum *fixup = ctx;
496	int ret;
497	int corrected = 0;
498	struct btrfs_key key;
499	struct inode *inode = NULL;
500	u64 end = offset + PAGE_SIZE - 1;
501	struct btrfs_root *local_root;
502
503	key.objectid = root;
504	key.type = BTRFS_ROOT_ITEM_KEY;
505	key.offset = (u64)-1;
506	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
507	if (IS_ERR(local_root))
508		return PTR_ERR(local_root);
509
510	key.type = BTRFS_INODE_ITEM_KEY;
511	key.objectid = inum;
512	key.offset = 0;
513	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
514	if (IS_ERR(inode))
515		return PTR_ERR(inode);
516
517	index = offset >> PAGE_CACHE_SHIFT;
518
519	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
520	if (!page) {
521		ret = -ENOMEM;
522		goto out;
523	}
524
525	if (PageUptodate(page)) {
526		struct btrfs_fs_info *fs_info;
527		if (PageDirty(page)) {
528			/*
529			 * we need to write the data to the defective sector. the
530			 * data that was in that sector is not in memory,
531			 * because the page was modified. we must not write the
532			 * modified page to that sector.
533			 *
534			 * TODO: what could be done here: wait for the delalloc
535			 *       runner to write out that page (might involve
536			 *       COW) and see whether the sector is still
537			 *       referenced afterwards.
538			 *
539			 * For the time being, we'll treat this error as
540			 * uncorrectable, although there is a chance that a
541			 * later scrub will find the bad sector again and that
542			 * there's no dirty page in memory by then.
543			 */
544			ret = -EIO;
545			goto out;
546		}
547		fs_info = BTRFS_I(inode)->root->fs_info;
548		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
549					fixup->logical, page,
550					fixup->mirror_num);
551		unlock_page(page);
552		corrected = !ret;
553	} else {
554		/*
555		 * we need to get good data first. the general readpage path
556		 * will call repair_io_failure for us; we just have to make
557		 * sure we read the bad mirror.
558		 */
559		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
560					EXTENT_DAMAGED, GFP_NOFS);
561		if (ret) {
562			/* set_extent_bits should give proper error */
563			WARN_ON(ret > 0);
564			if (ret > 0)
565				ret = -EFAULT;
566			goto out;
567		}
568
569		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
570						btrfs_get_extent,
571						fixup->mirror_num);
572		wait_on_page_locked(page);
573
574		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
575						end, EXTENT_DAMAGED, 0, NULL);
576		if (!corrected)
577			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
578						EXTENT_DAMAGED, GFP_NOFS);
579	}
580
581out:
582	if (page)
583		put_page(page);
584	if (inode)
585		iput(inode);
586
587	if (ret < 0)
588		return ret;
589
590	if (ret == 0 && corrected) {
591		/*
592		 * we only need to call readpage for one of the inodes belonging
593		 * to this extent. so make iterate_extent_inodes stop
594		 */
595		return 1;
596	}
597
598	return -EIO;
599}
600
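/*
 * Worker that tries to repair a data error in an extent that carries
 * no checksum: iterate over the inodes referencing the bad logical
 * address and trigger a regular read for one of them (see
 * scrub_fixup_readpage).
 */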
601static void scrub_fixup_nodatasum(struct btrfs_work *work)
602{
603	int ret;
604	struct scrub_fixup_nodatasum *fixup;
605	struct scrub_ctx *sctx;
606	struct btrfs_trans_handle *trans = NULL;
607	struct btrfs_fs_info *fs_info;
608	struct btrfs_path *path;
609	int uncorrectable = 0;
610
611	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
612	sctx = fixup->sctx;
613	fs_info = fixup->root->fs_info;
614
615	path = btrfs_alloc_path();
616	if (!path) {
617		spin_lock(&sctx->stat_lock);
618		++sctx->stat.malloc_errors;
619		spin_unlock(&sctx->stat_lock);
620		uncorrectable = 1;
621		goto out;
622	}
623
624	trans = btrfs_join_transaction(fixup->root);
625	if (IS_ERR(trans)) {
626		uncorrectable = 1;
627		goto out;
628	}
629
630	/*
631	 * the idea is to trigger a regular read through the standard path. we
632	 * read a page from the (failed) logical address by specifying the
633	 * corresponding mirror number of the failed sector. thus, that
634	 * readpage is expected to fail.
635	 * that is the point where on-the-fly error correction will kick in
636	 * (once the read has finished) and rewrite the failed sector if a
637	 * good copy can be found.
638	 */
639	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
640						path, scrub_fixup_readpage,
641						fixup);
642	if (ret < 0) {
643		uncorrectable = 1;
644		goto out;
645	}
646	WARN_ON(ret != 1);
647
648	spin_lock(&sctx->stat_lock);
649	++sctx->stat.corrected_errors;
650	spin_unlock(&sctx->stat_lock);
651
652out:
653	if (trans && !IS_ERR(trans))
654		btrfs_end_transaction(trans, fixup->root);
655	if (uncorrectable) {
656		spin_lock(&sctx->stat_lock);
657		++sctx->stat.uncorrectable_errors;
658		spin_unlock(&sctx->stat_lock);
659
660		printk_ratelimited_in_rcu(KERN_ERR
661			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
662			(unsigned long long)fixup->logical,
663			rcu_str_deref(fixup->dev->name));
664	}
665
666	btrfs_free_path(path);
667	kfree(fixup);
668
669	scrub_pending_trans_workers_dec(sctx);
670}
671
672/*
673 * scrub_handle_errored_block gets called when either verification of the
674 * pages failed or the bio failed to read, e.g. with EIO. In the latter
675 * case, this function handles all pages in the bio, even though only one
676 * may be bad.
677 * The goal of this function is to repair the errored block by using the
678 * contents of one of the mirrors.
679 */
680static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
681{
682	struct scrub_ctx *sctx = sblock_to_check->sctx;
683	struct btrfs_device *dev;
684	struct btrfs_fs_info *fs_info;
685	u64 length;
686	u64 logical;
687	u64 generation;
688	unsigned int failed_mirror_index;
689	unsigned int is_metadata;
690	unsigned int have_csum;
691	u8 *csum;
692	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
693	struct scrub_block *sblock_bad;
694	int ret;
695	int mirror_index;
696	int page_num;
697	int success;
698	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
699				      DEFAULT_RATELIMIT_BURST);
700
701	BUG_ON(sblock_to_check->page_count < 1);
702	fs_info = sctx->dev_root->fs_info;
703	length = sblock_to_check->page_count * PAGE_SIZE;
704	logical = sblock_to_check->pagev[0]->logical;
705	generation = sblock_to_check->pagev[0]->generation;
706	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
707	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
708	is_metadata = !(sblock_to_check->pagev[0]->flags &
709			BTRFS_EXTENT_FLAG_DATA);
710	have_csum = sblock_to_check->pagev[0]->have_csum;
711	csum = sblock_to_check->pagev[0]->csum;
712	dev = sblock_to_check->pagev[0]->dev;
713
714	/*
715	 * read all mirrors one after the other. This includes re-reading
716	 * the extent or metadata block that failed (which is what caused
717	 * this fixup code to be called), this time page by page, in order
718	 * to know which pages caused I/O errors and which ones are good
719	 * (for all mirrors).
720	 * The goal is to handle the situation when more than one
721	 * mirror contains I/O errors, but the errors do not
722	 * overlap, i.e. the data can be repaired by selecting the
723	 * pages from those mirrors without I/O error on the
724	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
725	 * would be that mirror #1 has an I/O error on the first page,
726	 * the second page is good, and mirror #2 has an I/O error on
727	 * the second page, but the first page is good.
728	 * Then the first page of the first mirror can be repaired by
729	 * taking the first page of the second mirror, and the
730	 * second page of the second mirror can be repaired by
731	 * copying the contents of the 2nd page of the 1st mirror.
732	 * One more note: if the pages of one mirror contain I/O
733	 * errors, the checksum cannot be verified. In order to get
734	 * the best data for repairing, the first attempt is to find
735	 * a mirror without I/O errors and with a validated checksum.
736	 * Only if this is not possible, the pages are picked from
737	 * mirrors with I/O errors without considering the checksum.
738	 * If the latter is the case, at the end, the checksum of the
739	 * repaired area is verified in order to correctly maintain
740	 * the statistics.
741	 */
742
743	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
744				     sizeof(*sblocks_for_recheck),
745				     GFP_NOFS);
746	if (!sblocks_for_recheck) {
747		spin_lock(&sctx->stat_lock);
748		sctx->stat.malloc_errors++;
749		sctx->stat.read_errors++;
750		sctx->stat.uncorrectable_errors++;
751		spin_unlock(&sctx->stat_lock);
752		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
753		goto out;
754	}
755
756	/* setup the context, map the logical blocks and alloc the pages */
757	ret = scrub_setup_recheck_block(sctx, fs_info, length,
758					logical, sblocks_for_recheck);
759	if (ret) {
760		spin_lock(&sctx->stat_lock);
761		sctx->stat.read_errors++;
762		sctx->stat.uncorrectable_errors++;
763		spin_unlock(&sctx->stat_lock);
764		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
765		goto out;
766	}
767	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
768	sblock_bad = sblocks_for_recheck + failed_mirror_index;
769
770	/* build and submit the bios for the failed mirror, check checksums */
771	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
772			    csum, generation, sctx->csum_size);
773
774	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
775	    sblock_bad->no_io_error_seen) {
776		/*
777		 * the error disappeared after reading page by page, or
778		 * the area was part of a huge bio and other parts of the
779		 * bio caused I/O errors, or the block layer merged several
780		 * read requests into one and the error is caused by a
781		 * different bio (usually one of the two latter cases is
782		 * the cause)
783		 */
784		spin_lock(&sctx->stat_lock);
785		sctx->stat.unverified_errors++;
786		spin_unlock(&sctx->stat_lock);
787
788		goto out;
789	}
790
791	if (!sblock_bad->no_io_error_seen) {
792		spin_lock(&sctx->stat_lock);
793		sctx->stat.read_errors++;
794		spin_unlock(&sctx->stat_lock);
795		if (__ratelimit(&_rs))
796			scrub_print_warning("i/o error", sblock_to_check);
797		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
798	} else if (sblock_bad->checksum_error) {
799		spin_lock(&sctx->stat_lock);
800		sctx->stat.csum_errors++;
801		spin_unlock(&sctx->stat_lock);
802		if (__ratelimit(&_rs))
803			scrub_print_warning("checksum error", sblock_to_check);
804		btrfs_dev_stat_inc_and_print(dev,
805					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
806	} else if (sblock_bad->header_error) {
807		spin_lock(&sctx->stat_lock);
808		sctx->stat.verify_errors++;
809		spin_unlock(&sctx->stat_lock);
810		if (__ratelimit(&_rs))
811			scrub_print_warning("checksum/header error",
812					    sblock_to_check);
813		if (sblock_bad->generation_error)
814			btrfs_dev_stat_inc_and_print(dev,
815				BTRFS_DEV_STAT_GENERATION_ERRS);
816		else
817			btrfs_dev_stat_inc_and_print(dev,
818				BTRFS_DEV_STAT_CORRUPTION_ERRS);
819	}
820
821	if (sctx->readonly)
822		goto did_not_correct_error;
823
824	if (!is_metadata && !have_csum) {
825		struct scrub_fixup_nodatasum *fixup_nodatasum;
826
827		/*
828		 * !is_metadata and !have_csum: this means that the data
829		 * might not be COW'ed and might be modified
830		 * concurrently. The general strategy of working on the
831		 * commit root does not help in the case when COW is not
832		 * used.
833		 */
834		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
835		if (!fixup_nodatasum)
836			goto did_not_correct_error;
837		fixup_nodatasum->sctx = sctx;
838		fixup_nodatasum->dev = dev;
839		fixup_nodatasum->logical = logical;
840		fixup_nodatasum->root = fs_info->extent_root;
841		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
842		scrub_pending_trans_workers_inc(sctx);
843		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
844		btrfs_queue_worker(&fs_info->scrub_workers,
845				   &fixup_nodatasum->work);
846		goto out;
847	}
848
849	/*
850	 * now build and submit the bios for the other mirrors, check
851	 * checksums.
852	 * First try to pick the mirror which is completely without I/O
853	 * errors and also does not have a checksum error.
854	 * If one is found, and if a checksum is present, the full block
855	 * that is known to contain an error is rewritten. Afterwards
856	 * the block is known to be corrected.
857	 * If a mirror is found which is completely correct, and no
858	 * checksum is present, only those pages are rewritten that had
859	 * an I/O error in the block to be repaired, since it cannot be
860	 * determined which copy of the other pages is better (and it
861	 * could happen otherwise that a correct page would be
862	 * overwritten by a bad one).
863	 */
864	for (mirror_index = 0;
865	     mirror_index < BTRFS_MAX_MIRRORS &&
866	     sblocks_for_recheck[mirror_index].page_count > 0;
867	     mirror_index++) {
868		struct scrub_block *sblock_other;
869
870		if (mirror_index == failed_mirror_index)
871			continue;
872		sblock_other = sblocks_for_recheck + mirror_index;
873
874		/* build and submit the bios, check checksums */
875		scrub_recheck_block(fs_info, sblock_other, is_metadata,
876				    have_csum, csum, generation,
877				    sctx->csum_size);
878
879		if (!sblock_other->header_error &&
880		    !sblock_other->checksum_error &&
881		    sblock_other->no_io_error_seen) {
882			int force_write = is_metadata || have_csum;
883
884			ret = scrub_repair_block_from_good_copy(sblock_bad,
885								sblock_other,
886								force_write);
887			if (0 == ret)
888				goto corrected_error;
889		}
890	}
891
892	/*
893	 * in case of I/O errors in the area that is supposed to be
894	 * repaired, continue by picking good copies of those pages.
895	 * Select the good pages from mirrors to rewrite bad pages from
896	 * the area to fix. Afterwards verify the checksum of the block
897	 * only done for the purpose of statistics counting and for the
898	 * final scrub report on whether errors remain.
899	 * final scrub report, whether errors remain.
900	 * A perfect algorithm could make use of the checksum and try
901	 * all possible combinations of pages from the different mirrors
902	 * until the checksum verification succeeds. For example, when
903	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
904	 * of mirror #2 is readable but the final checksum test fails,
905	 * then the 2nd page of mirror #3 could be tried, to see whether
906	 * the final checksum then succeeds. But this would be a rare
907	 * exception and is therefore not implemented. At least we avoid
908	 * overwriting the good copy.
909	 * A more useful improvement would be to pick the sectors
910	 * without I/O error based on sector sizes (512 bytes on legacy
911	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
912	 * mirror could be repaired by taking 512 bytes of a different
913	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
914	 * area are unreadable.
915	 */
916
917	/* can only fix I/O errors from here on */
918	if (sblock_bad->no_io_error_seen)
919		goto did_not_correct_error;
920
921	success = 1;
922	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
923		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
924
925		if (!page_bad->io_error)
926			continue;
927
928		for (mirror_index = 0;
929		     mirror_index < BTRFS_MAX_MIRRORS &&
930		     sblocks_for_recheck[mirror_index].page_count > 0;
931		     mirror_index++) {
932			struct scrub_block *sblock_other = sblocks_for_recheck +
933							   mirror_index;
934			struct scrub_page *page_other = sblock_other->pagev[
935							page_num];
936
937			if (!page_other->io_error) {
938				ret = scrub_repair_page_from_good_copy(
939					sblock_bad, sblock_other, page_num, 0);
940				if (0 == ret) {
941					page_bad->io_error = 0;
942					break; /* succeeded for this page */
943				}
944			}
945		}
946
947		if (page_bad->io_error) {
948			/* did not find a mirror to copy the page from */
949			success = 0;
950		}
951	}
952
953	if (success) {
954		if (is_metadata || have_csum) {
955			/*
956			 * need to verify the checksum now that all
957			 * sectors on disk are repaired (the write
958			 * request for data to be repaired is on its way).
959			 * Just be lazy and use scrub_recheck_block()
960			 * which re-reads the data before the checksum
961			 * is verified, but most likely the data comes out
962			 * of the page cache.
963			 */
964			scrub_recheck_block(fs_info, sblock_bad,
965					    is_metadata, have_csum, csum,
966					    generation, sctx->csum_size);
967			if (!sblock_bad->header_error &&
968			    !sblock_bad->checksum_error &&
969			    sblock_bad->no_io_error_seen)
970				goto corrected_error;
971			else
972				goto did_not_correct_error;
973		} else {
974corrected_error:
975			spin_lock(&sctx->stat_lock);
976			sctx->stat.corrected_errors++;
977			spin_unlock(&sctx->stat_lock);
978			printk_ratelimited_in_rcu(KERN_ERR
979				"btrfs: fixed up error at logical %llu on dev %s\n",
980				(unsigned long long)logical,
981				rcu_str_deref(dev->name));
982		}
983	} else {
984did_not_correct_error:
985		spin_lock(&sctx->stat_lock);
986		sctx->stat.uncorrectable_errors++;
987		spin_unlock(&sctx->stat_lock);
988		printk_ratelimited_in_rcu(KERN_ERR
989			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
990			(unsigned long long)logical,
991			rcu_str_deref(dev->name));
992	}
993
994out:
995	if (sblocks_for_recheck) {
996		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
997		     mirror_index++) {
998			struct scrub_block *sblock = sblocks_for_recheck +
999						     mirror_index;
1000			int page_index;
1001
1002			for (page_index = 0; page_index < sblock->page_count;
1003			     page_index++) {
1004				sblock->pagev[page_index]->sblock = NULL;
1005				scrub_page_put(sblock->pagev[page_index]);
1006			}
1007		}
1008		kfree(sblocks_for_recheck);
1009	}
1010
1011	return 0;
1012}
1013
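/*
 * Map the logical range of the errored block to all of its mirrors
 * and fill one scrub_block (with freshly allocated pages) per mirror
 * for the recheck/repair procedure.
 */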
1014static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1015				     struct btrfs_fs_info *fs_info,
1016				     u64 length, u64 logical,
1017				     struct scrub_block *sblocks_for_recheck)
1018{
1019	int page_index;
1020	int mirror_index;
1021	int ret;
1022
1023	/*
1024	 * note: the two members ref_count and outstanding_pages
1025	 * are not used (and not set) in the blocks that are used for
1026	 * the recheck procedure
1027	 */
1028
1029	page_index = 0;
1030	while (length > 0) {
1031		u64 sublen = min_t(u64, length, PAGE_SIZE);
1032		u64 mapped_length = sublen;
1033		struct btrfs_bio *bbio = NULL;
1034
1035		/*
1036		 * with a length of PAGE_SIZE, each returned stripe
1037		 * represents one mirror
1038		 */
1039		ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length,
1040				      &bbio, 0);
1041		if (ret || !bbio || mapped_length < sublen) {
1042			kfree(bbio);
1043			return -EIO;
1044		}
1045
1046		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
1047		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1048		     mirror_index++) {
1049			struct scrub_block *sblock;
1050			struct scrub_page *page;
1051
1052			if (mirror_index >= BTRFS_MAX_MIRRORS)
1053				continue;
1054
1055			sblock = sblocks_for_recheck + mirror_index;
1056			sblock->sctx = sctx;
1057			page = kzalloc(sizeof(*page), GFP_NOFS);
1058			if (!page) {
1059leave_nomem:
1060				spin_lock(&sctx->stat_lock);
1061				sctx->stat.malloc_errors++;
1062				spin_unlock(&sctx->stat_lock);
1063				kfree(bbio);
1064				return -ENOMEM;
1065			}
1066			scrub_page_get(page);
1067			sblock->pagev[page_index] = page;
1068			page->logical = logical;
1069			page->physical = bbio->stripes[mirror_index].physical;
1070			/* for missing devices, dev->bdev is NULL */
1071			page->dev = bbio->stripes[mirror_index].dev;
1072			page->mirror_num = mirror_index + 1;
1073			sblock->page_count++;
1074			page->page = alloc_page(GFP_NOFS);
1075			if (!page->page)
1076				goto leave_nomem;
1077		}
1078		kfree(bbio);
1079		length -= sublen;
1080		logical += sublen;
1081		page_index++;
1082	}
1083
1084	return 0;
1085}
1086
1087/*
1088 * this function will check the on-disk data for checksum errors, header
1089 * errors and read I/O errors. If any I/O errors happen, the exact pages
1090 * which are errored are marked as being bad. The goal is to enable scrub
1091 * to take those pages that are not errored from all the mirrors so that
1092 * the pages that are errored in the just handled mirror can be repaired.
1093 */
1094static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1095				struct scrub_block *sblock, int is_metadata,
1096				int have_csum, u8 *csum, u64 generation,
1097				u16 csum_size)
1098{
1099	int page_num;
1100
1101	sblock->no_io_error_seen = 1;
1102	sblock->header_error = 0;
1103	sblock->checksum_error = 0;
1104
1105	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1106		struct bio *bio;
1107		struct scrub_page *page = sblock->pagev[page_num];
1108		DECLARE_COMPLETION_ONSTACK(complete);
1109
1110		if (page->dev->bdev == NULL) {
1111			page->io_error = 1;
1112			sblock->no_io_error_seen = 0;
1113			continue;
1114		}
1115
1116		WARN_ON(!page->page);
1117		bio = bio_alloc(GFP_NOFS, 1);
1118		if (!bio) {
1119			page->io_error = 1;
1120			sblock->no_io_error_seen = 0;
1121			continue;
1122		}
1123		bio->bi_bdev = page->dev->bdev;
1124		bio->bi_sector = page->physical >> 9;
1125		bio->bi_end_io = scrub_complete_bio_end_io;
1126		bio->bi_private = &complete;
1127
1128		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1129		btrfsic_submit_bio(READ, bio);
1130
1131		/* this will also unplug the queue */
1132		wait_for_completion(&complete);
1133
1134		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1135		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1136			sblock->no_io_error_seen = 0;
1137		bio_put(bio);
1138	}
1139
1140	if (sblock->no_io_error_seen)
1141		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1142					     have_csum, csum, generation,
1143					     csum_size);
1144
1145	return;
1146}
1147
1148static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1149					 struct scrub_block *sblock,
1150					 int is_metadata, int have_csum,
1151					 const u8 *csum, u64 generation,
1152					 u16 csum_size)
1153{
1154	int page_num;
1155	u8 calculated_csum[BTRFS_CSUM_SIZE];
1156	u32 crc = ~(u32)0;
1157	struct btrfs_root *root = fs_info->extent_root;
1158	void *mapped_buffer;
1159
1160	WARN_ON(!sblock->pagev[0]->page);
1161	if (is_metadata) {
1162		struct btrfs_header *h;
1163
1164		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1165		h = (struct btrfs_header *)mapped_buffer;
1166
1167		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1168		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1169		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1170			   BTRFS_UUID_SIZE)) {
1171			sblock->header_error = 1;
1172		} else if (generation != le64_to_cpu(h->generation)) {
1173			sblock->header_error = 1;
1174			sblock->generation_error = 1;
1175		}
1176		csum = h->csum;
1177	} else {
1178		if (!have_csum)
1179			return;
1180
1181		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1182	}
1183
1184	for (page_num = 0;;) {
1185		if (page_num == 0 && is_metadata)
1186			crc = btrfs_csum_data(root,
1187				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1188				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1189		else
1190			crc = btrfs_csum_data(root, mapped_buffer, crc,
1191					      PAGE_SIZE);
1192
1193		kunmap_atomic(mapped_buffer);
1194		page_num++;
1195		if (page_num >= sblock->page_count)
1196			break;
1197		WARN_ON(!sblock->pagev[page_num]->page);
1198
1199		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1200	}
1201
1202	btrfs_csum_final(crc, calculated_csum);
1203	if (memcmp(calculated_csum, csum, csum_size))
1204		sblock->checksum_error = 1;
1205}
1206
1207static void scrub_complete_bio_end_io(struct bio *bio, int err)
1208{
1209	complete((struct completion *)bio->bi_private);
1210}
1211
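/*
 * Overwrite pages of the bad mirror with the corresponding pages of a
 * good one. With force_write set every page is rewritten; otherwise a
 * page is only rewritten if the block or that page is known to be bad
 * (see scrub_repair_page_from_good_copy).
 */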
1212static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1213					     struct scrub_block *sblock_good,
1214					     int force_write)
1215{
1216	int page_num;
1217	int ret = 0;
1218
1219	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1220		int ret_sub;
1221
1222		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1223							   sblock_good,
1224							   page_num,
1225							   force_write);
1226		if (ret_sub)
1227			ret = ret_sub;
1228	}
1229
1230	return ret;
1231}
1232
1233static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1234					    struct scrub_block *sblock_good,
1235					    int page_num, int force_write)
1236{
1237	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1238	struct scrub_page *page_good = sblock_good->pagev[page_num];
1239
1240	BUG_ON(page_bad->page == NULL);
1241	BUG_ON(page_good->page == NULL);
1242	if (force_write || sblock_bad->header_error ||
1243	    sblock_bad->checksum_error || page_bad->io_error) {
1244		struct bio *bio;
1245		int ret;
1246		DECLARE_COMPLETION_ONSTACK(complete);
1247
1248		bio = bio_alloc(GFP_NOFS, 1);
1249		if (!bio)
1250			return -EIO;
1251		bio->bi_bdev = page_bad->dev->bdev;
1252		bio->bi_sector = page_bad->physical >> 9;
1253		bio->bi_end_io = scrub_complete_bio_end_io;
1254		bio->bi_private = &complete;
1255
1256		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1257		if (PAGE_SIZE != ret) {
1258			bio_put(bio);
1259			return -EIO;
1260		}
1261		btrfsic_submit_bio(WRITE, bio);
1262
1263		/* this will also unplug the queue */
1264		wait_for_completion(&complete);
1265		if (!bio_flagged(bio, BIO_UPTODATE)) {
1266			btrfs_dev_stat_inc_and_print(page_bad->dev,
1267				BTRFS_DEV_STAT_WRITE_ERRS);
1268			bio_put(bio);
1269			return -EIO;
1270		}
1271		bio_put(bio);
1272	}
1273
1274	return 0;
1275}
1276
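/*
 * Verify a completely read block: dispatch to the data, tree block or
 * super block checker based on the extent flags. Failed data and tree
 * blocks are handed to scrub_handle_errored_block(); super block
 * errors are only counted.
 */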
1277static void scrub_checksum(struct scrub_block *sblock)
1278{
1279	u64 flags;
1280	int ret;
1281
1282	WARN_ON(sblock->page_count < 1);
1283	flags = sblock->pagev[0]->flags;
1284	ret = 0;
1285	if (flags & BTRFS_EXTENT_FLAG_DATA)
1286		ret = scrub_checksum_data(sblock);
1287	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1288		ret = scrub_checksum_tree_block(sblock);
1289	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1290		(void)scrub_checksum_super(sblock);
1291	else
1292		WARN_ON(1);
1293	if (ret)
1294		scrub_handle_errored_block(sblock);
1295}
1296
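/*
 * Check the data checksum of a block against the csum stored in
 * scrub_page. Returns 1 on mismatch, 0 if it matches or if no
 * checksum is available.
 */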
1297static int scrub_checksum_data(struct scrub_block *sblock)
1298{
1299	struct scrub_ctx *sctx = sblock->sctx;
1300	u8 csum[BTRFS_CSUM_SIZE];
1301	u8 *on_disk_csum;
1302	struct page *page;
1303	void *buffer;
1304	u32 crc = ~(u32)0;
1305	int fail = 0;
1306	struct btrfs_root *root = sctx->dev_root;
1307	u64 len;
1308	int index;
1309
1310	BUG_ON(sblock->page_count < 1);
1311	if (!sblock->pagev[0]->have_csum)
1312		return 0;
1313
1314	on_disk_csum = sblock->pagev[0]->csum;
1315	page = sblock->pagev[0]->page;
1316	buffer = kmap_atomic(page);
1317
1318	len = sctx->sectorsize;
1319	index = 0;
1320	for (;;) {
1321		u64 l = min_t(u64, len, PAGE_SIZE);
1322
1323		crc = btrfs_csum_data(root, buffer, crc, l);
1324		kunmap_atomic(buffer);
1325		len -= l;
1326		if (len == 0)
1327			break;
1328		index++;
1329		BUG_ON(index >= sblock->page_count);
1330		BUG_ON(!sblock->pagev[index]->page);
1331		page = sblock->pagev[index]->page;
1332		buffer = kmap_atomic(page);
1333	}
1334
1335	btrfs_csum_final(crc, csum);
1336	if (memcmp(csum, on_disk_csum, sctx->csum_size))
1337		fail = 1;
1338
1339	return fail;
1340}
1341
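/*
 * Check a tree block: verify bytenr, generation, fsid and chunk tree
 * uuid in the header as well as the metadata checksum. Returns
 * non-zero if any of them fails.
 */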
1342static int scrub_checksum_tree_block(struct scrub_block *sblock)
1343{
1344	struct scrub_ctx *sctx = sblock->sctx;
1345	struct btrfs_header *h;
1346	struct btrfs_root *root = sctx->dev_root;
1347	struct btrfs_fs_info *fs_info = root->fs_info;
1348	u8 calculated_csum[BTRFS_CSUM_SIZE];
1349	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1350	struct page *page;
1351	void *mapped_buffer;
1352	u64 mapped_size;
1353	void *p;
1354	u32 crc = ~(u32)0;
1355	int fail = 0;
1356	int crc_fail = 0;
1357	u64 len;
1358	int index;
1359
1360	BUG_ON(sblock->page_count < 1);
1361	page = sblock->pagev[0]->page;
1362	mapped_buffer = kmap_atomic(page);
1363	h = (struct btrfs_header *)mapped_buffer;
1364	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1365
1366	/*
1367	 * we don't use the getter functions here, as we
1368	 * a) don't have an extent buffer and
1369	 * b) the page is already kmapped
1370	 */
1371
1372	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1373		++fail;
1374
1375	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1376		++fail;
1377
1378	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1379		++fail;
1380
1381	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1382		   BTRFS_UUID_SIZE))
1383		++fail;
1384
1385	BUG_ON(sctx->nodesize != sctx->leafsize);
1386	len = sctx->nodesize - BTRFS_CSUM_SIZE;
1387	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1388	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1389	index = 0;
1390	for (;;) {
1391		u64 l = min_t(u64, len, mapped_size);
1392
1393		crc = btrfs_csum_data(root, p, crc, l);
1394		kunmap_atomic(mapped_buffer);
1395		len -= l;
1396		if (len == 0)
1397			break;
1398		index++;
1399		BUG_ON(index >= sblock->page_count);
1400		BUG_ON(!sblock->pagev[index]->page);
1401		page = sblock->pagev[index]->page;
1402		mapped_buffer = kmap_atomic(page);
1403		mapped_size = PAGE_SIZE;
1404		p = mapped_buffer;
1405	}
1406
1407	btrfs_csum_final(crc, calculated_csum);
1408	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1409		++crc_fail;
1410
1411	return fail || crc_fail;
1412}
1413
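/*
 * Check a super block copy: verify bytenr, generation and fsid as
 * well as the checksum. Errors are counted and reported per device,
 * but nothing is repaired here.
 */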
1414static int scrub_checksum_super(struct scrub_block *sblock)
1415{
1416	struct btrfs_super_block *s;
1417	struct scrub_ctx *sctx = sblock->sctx;
1418	struct btrfs_root *root = sctx->dev_root;
1419	struct btrfs_fs_info *fs_info = root->fs_info;
1420	u8 calculated_csum[BTRFS_CSUM_SIZE];
1421	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1422	struct page *page;
1423	void *mapped_buffer;
1424	u64 mapped_size;
1425	void *p;
1426	u32 crc = ~(u32)0;
1427	int fail_gen = 0;
1428	int fail_cor = 0;
1429	u64 len;
1430	int index;
1431
1432	BUG_ON(sblock->page_count < 1);
1433	page = sblock->pagev[0]->page;
1434	mapped_buffer = kmap_atomic(page);
1435	s = (struct btrfs_super_block *)mapped_buffer;
1436	memcpy(on_disk_csum, s->csum, sctx->csum_size);
1437
1438	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1439		++fail_cor;
1440
1441	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1442		++fail_gen;
1443
1444	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1445		++fail_cor;
1446
1447	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1448	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1449	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1450	index = 0;
1451	for (;;) {
1452		u64 l = min_t(u64, len, mapped_size);
1453
1454		crc = btrfs_csum_data(root, p, crc, l);
1455		kunmap_atomic(mapped_buffer);
1456		len -= l;
1457		if (len == 0)
1458			break;
1459		index++;
1460		BUG_ON(index >= sblock->page_count);
1461		BUG_ON(!sblock->pagev[index]->page);
1462		page = sblock->pagev[index]->page;
1463		mapped_buffer = kmap_atomic(page);
1464		mapped_size = PAGE_SIZE;
1465		p = mapped_buffer;
1466	}
1467
1468	btrfs_csum_final(crc, calculated_csum);
1469	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1470		++fail_cor;
1471
1472	if (fail_cor + fail_gen) {
1473		/*
1474		 * if we find an error in a super block, we just report it.
1475		 * The super block will get rewritten with the next transaction
1476		 * commit anyway.
1477		 */
1478		spin_lock(&sctx->stat_lock);
1479		++sctx->stat.super_errors;
1480		spin_unlock(&sctx->stat_lock);
1481		if (fail_cor)
1482			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1483				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1484		else
1485			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1486				BTRFS_DEV_STAT_GENERATION_ERRS);
1487	}
1488
1489	return fail_cor + fail_gen;
1490}
1491
1492static void scrub_block_get(struct scrub_block *sblock)
1493{
1494	atomic_inc(&sblock->ref_count);
1495}
1496
1497static void scrub_block_put(struct scrub_block *sblock)
1498{
1499	if (atomic_dec_and_test(&sblock->ref_count)) {
1500		int i;
1501
1502		for (i = 0; i < sblock->page_count; i++)
1503			scrub_page_put(sblock->pagev[i]);
1504		kfree(sblock);
1505	}
1506}
1507
1508static void scrub_page_get(struct scrub_page *spage)
1509{
1510	atomic_inc(&spage->ref_count);
1511}
1512
1513static void scrub_page_put(struct scrub_page *spage)
1514{
1515	if (atomic_dec_and_test(&spage->ref_count)) {
1516		if (spage->page)
1517			__free_page(spage->page);
1518		kfree(spage);
1519	}
1520}
1521
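/*
 * Submit the bio that is currently being filled (if any) and account
 * for it in bios_in_flight.
 */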
1522static void scrub_submit(struct scrub_ctx *sctx)
1523{
1524	struct scrub_bio *sbio;
1525
1526	if (sctx->curr == -1)
1527		return;
1528
1529	sbio = sctx->bios[sctx->curr];
1530	sctx->curr = -1;
1531	scrub_pending_bio_inc(sctx);
1532
1533	btrfsic_submit_bio(READ, sbio->bio);
1534}
1535
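/*
 * Add a page to the bio that is currently being filled. If the page
 * is not physically and logically contiguous with the bio, or the bio
 * is full, submit it and start a new one (waiting for a free
 * scrub_bio if necessary).
 */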
1536static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
1537				 struct scrub_page *spage)
1538{
1539	struct scrub_block *sblock = spage->sblock;
1540	struct scrub_bio *sbio;
1541	int ret;
1542
1543again:
1544	/*
1545	 * grab a fresh bio or wait for one to become available
1546	 */
1547	while (sctx->curr == -1) {
1548		spin_lock(&sctx->list_lock);
1549		sctx->curr = sctx->first_free;
1550		if (sctx->curr != -1) {
1551			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1552			sctx->bios[sctx->curr]->next_free = -1;
1553			sctx->bios[sctx->curr]->page_count = 0;
1554			spin_unlock(&sctx->list_lock);
1555		} else {
1556			spin_unlock(&sctx->list_lock);
1557			wait_event(sctx->list_wait, sctx->first_free != -1);
1558		}
1559	}
1560	sbio = sctx->bios[sctx->curr];
1561	if (sbio->page_count == 0) {
1562		struct bio *bio;
1563
1564		sbio->physical = spage->physical;
1565		sbio->logical = spage->logical;
1566		sbio->dev = spage->dev;
1567		bio = sbio->bio;
1568		if (!bio) {
1569			bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio);
1570			if (!bio)
1571				return -ENOMEM;
1572			sbio->bio = bio;
1573		}
1574
1575		bio->bi_private = sbio;
1576		bio->bi_end_io = scrub_bio_end_io;
1577		bio->bi_bdev = sbio->dev->bdev;
1578		bio->bi_sector = sbio->physical >> 9;
1579		sbio->err = 0;
1580	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1581		   spage->physical ||
1582		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1583		   spage->logical ||
1584		   sbio->dev != spage->dev) {
1585		scrub_submit(sctx);
1586		goto again;
1587	}
1588
1589	sbio->pagev[sbio->page_count] = spage;
1590	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1591	if (ret != PAGE_SIZE) {
1592		if (sbio->page_count < 1) {
1593			bio_put(sbio->bio);
1594			sbio->bio = NULL;
1595			return -EIO;
1596		}
1597		scrub_submit(sctx);
1598		goto again;
1599	}
1600
1601	scrub_block_get(sblock); /* one for the added page */
1602	atomic_inc(&sblock->outstanding_pages);
1603	sbio->page_count++;
1604	if (sbio->page_count == sctx->pages_per_bio)
1605		scrub_submit(sctx);
1606
1607	return 0;
1608}
1609
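/*
 * Create a scrub_block for the range [logical, logical + len),
 * allocate and fill one scrub_page per PAGE_SIZE and queue all pages
 * for reading.
 */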
1610static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1611		       u64 physical, struct btrfs_device *dev, u64 flags,
1612		       u64 gen, int mirror_num, u8 *csum, int force)
1613{
1614	struct scrub_block *sblock;
1615	int index;
1616
1617	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1618	if (!sblock) {
1619		spin_lock(&sctx->stat_lock);
1620		sctx->stat.malloc_errors++;
1621		spin_unlock(&sctx->stat_lock);
1622		return -ENOMEM;
1623	}
1624
1625	/* one ref inside this function, plus one for each page added to
1626	 * a bio later on */
1627	atomic_set(&sblock->ref_count, 1);
1628	sblock->sctx = sctx;
1629	sblock->no_io_error_seen = 1;
1630
1631	for (index = 0; len > 0; index++) {
1632		struct scrub_page *spage;
1633		u64 l = min_t(u64, len, PAGE_SIZE);
1634
1635		spage = kzalloc(sizeof(*spage), GFP_NOFS);
1636		if (!spage) {
1637leave_nomem:
1638			spin_lock(&sctx->stat_lock);
1639			sctx->stat.malloc_errors++;
1640			spin_unlock(&sctx->stat_lock);
1641			scrub_block_put(sblock);
1642			return -ENOMEM;
1643		}
1644		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1645		scrub_page_get(spage);
1646		sblock->pagev[index] = spage;
1647		spage->sblock = sblock;
1648		spage->dev = dev;
1649		spage->flags = flags;
1650		spage->generation = gen;
1651		spage->logical = logical;
1652		spage->physical = physical;
1653		spage->mirror_num = mirror_num;
1654		if (csum) {
1655			spage->have_csum = 1;
1656			memcpy(spage->csum, csum, sctx->csum_size);
1657		} else {
1658			spage->have_csum = 0;
1659		}
1660		sblock->page_count++;
1661		spage->page = alloc_page(GFP_NOFS);
1662		if (!spage->page)
1663			goto leave_nomem;
1664		len -= l;
1665		logical += l;
1666		physical += l;
1667	}
1668
1669	WARN_ON(sblock->page_count == 0);
1670	for (index = 0; index < sblock->page_count; index++) {
1671		struct scrub_page *spage = sblock->pagev[index];
1672		int ret;
1673
1674		ret = scrub_add_page_to_bio(sctx, spage);
1675		if (ret) {
1676			scrub_block_put(sblock);
1677			return ret;
1678		}
1679	}
1680
1681	if (force)
1682		scrub_submit(sctx);
1683
1684	/* last one frees, either here or in bio completion for last page */
1685	scrub_block_put(sblock);
1686	return 0;
1687}
1688
1689static void scrub_bio_end_io(struct bio *bio, int err)
1690{
1691	struct scrub_bio *sbio = bio->bi_private;
1692	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1693
1694	sbio->err = err;
1695	sbio->bio = bio;
1696
1697	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
1698}
1699
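/*
 * Bio completion work: mark the pages of a failed bio as errored,
 * complete the scrub_blocks whose last outstanding page finished and
 * return the scrub_bio to the free list.
 */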
1700static void scrub_bio_end_io_worker(struct btrfs_work *work)
1701{
1702	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1703	struct scrub_ctx *sctx = sbio->sctx;
1704	int i;
1705
1706	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
1707	if (sbio->err) {
1708		for (i = 0; i < sbio->page_count; i++) {
1709			struct scrub_page *spage = sbio->pagev[i];
1710
1711			spage->io_error = 1;
1712			spage->sblock->no_io_error_seen = 0;
1713		}
1714	}
1715
1716	/* now complete the scrub_block items that have all pages completed */
1717	for (i = 0; i < sbio->page_count; i++) {
1718		struct scrub_page *spage = sbio->pagev[i];
1719		struct scrub_block *sblock = spage->sblock;
1720
1721		if (atomic_dec_and_test(&sblock->outstanding_pages))
1722			scrub_block_complete(sblock);
1723		scrub_block_put(sblock);
1724	}
1725
1726	bio_put(sbio->bio);
1727	sbio->bio = NULL;
1728	spin_lock(&sctx->list_lock);
1729	sbio->next_free = sctx->first_free;
1730	sctx->first_free = sbio->index;
1731	spin_unlock(&sctx->list_lock);
1732	scrub_pending_bio_dec(sctx);
1733}
1734
1735static void scrub_block_complete(struct scrub_block *sblock)
1736{
1737	if (!sblock->no_io_error_seen)
1738		scrub_handle_errored_block(sblock);
1739	else
1740		scrub_checksum(sblock);
1741}
1742
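/*
 * Look up the checksum for @logical in sctx->csum_list, dropping sums
 * that lie entirely below it. Returns 1 and copies the checksum into
 * @csum if found, 0 otherwise.
 */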
1743static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1744			   u8 *csum)
1745{
1746	struct btrfs_ordered_sum *sum = NULL;
1747	int ret = 0;
1748	unsigned long i;
1749	unsigned long num_sectors;
1750
1751	while (!list_empty(&sctx->csum_list)) {
1752		sum = list_first_entry(&sctx->csum_list,
1753				       struct btrfs_ordered_sum, list);
1754		if (sum->bytenr > logical)
1755			return 0;
1756		if (sum->bytenr + sum->len > logical)
1757			break;
1758
1759		++sctx->stat.csum_discards;
1760		list_del(&sum->list);
1761		kfree(sum);
1762		sum = NULL;
1763	}
1764	if (!sum)
1765		return 0;
1766
1767	num_sectors = sum->len / sctx->sectorsize;
1768	for (i = 0; i < num_sectors; ++i) {
1769		if (sum->sums[i].bytenr == logical) {
1770			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1771			ret = 1;
1772			break;
1773		}
1774	}
1775	if (ret && i == num_sectors - 1) {
1776		list_del(&sum->list);
1777		kfree(sum);
1778	}
1779	return ret;
1780}
1781
1782/* scrub_extent tries to collect up to 64 kB for each bio */
1783static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1784			u64 physical, struct btrfs_device *dev, u64 flags,
1785			u64 gen, int mirror_num)
1786{
1787	int ret;
1788	u8 csum[BTRFS_CSUM_SIZE];
1789	u32 blocksize;
1790
1791	if (flags & BTRFS_EXTENT_FLAG_DATA) {
1792		blocksize = sctx->sectorsize;
1793		spin_lock(&sctx->stat_lock);
1794		sctx->stat.data_extents_scrubbed++;
1795		sctx->stat.data_bytes_scrubbed += len;
1796		spin_unlock(&sctx->stat_lock);
1797	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1798		BUG_ON(sctx->nodesize != sctx->leafsize);
1799		blocksize = sctx->nodesize;
1800		spin_lock(&sctx->stat_lock);
1801		sctx->stat.tree_extents_scrubbed++;
1802		sctx->stat.tree_bytes_scrubbed += len;
1803		spin_unlock(&sctx->stat_lock);
1804	} else {
1805		blocksize = sctx->sectorsize;
1806		BUG_ON(1);
1807	}
1808
1809	while (len) {
1810		u64 l = min_t(u64, len, blocksize);
1811		int have_csum = 0;
1812
1813		if (flags & BTRFS_EXTENT_FLAG_DATA) {
1814			/* push csums to sbio */
1815			have_csum = scrub_find_csum(sctx, logical, l, csum);
1816			if (have_csum == 0)
1817				++sctx->stat.no_csum;
1818		}
1819		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1820				  mirror_num, have_csum ? csum : NULL, 0);
1821		if (ret)
1822			return ret;
1823		len -= l;
1824		logical += l;
1825		physical += l;
1826	}
1827	return 0;
1828}
1829
1830static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1831					   struct map_lookup *map,
1832					   struct btrfs_device *scrub_dev,
1833					   int num, u64 base, u64 length)
1834{
1835	struct btrfs_path *path;
1836	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1837	struct btrfs_root *root = fs_info->extent_root;
1838	struct btrfs_root *csum_root = fs_info->csum_root;
1839	struct btrfs_extent_item *extent;
1840	struct blk_plug plug;
1841	u64 flags;
1842	int ret;
1843	int slot;
1844	int i;
1845	u64 nstripes;
1846	struct extent_buffer *l;
1847	struct btrfs_key key;
1848	u64 physical;
1849	u64 logical;
1850	u64 generation;
1851	int mirror_num;
1852	struct reada_control *reada1;
1853	struct reada_control *reada2;
1854	struct btrfs_key key_start;
1855	struct btrfs_key key_end;
1856	u64 increment = map->stripe_len;
1857	u64 offset;
1858
1859	nstripes = length;
1860	offset = 0;
1861	do_div(nstripes, map->stripe_len);
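	/*
	 * compute, per RAID profile, where this device's data starts inside
	 * the chunk (offset), how far apart in logical address space two
	 * consecutive stripes of this device are (increment) and which copy
	 * of the data this device holds (mirror_num).
	 *
	 * Hypothetical RAID10 example with num_stripes = 4, sub_stripes = 2
	 * and num = 3: factor = 2, offset = stripe_len * 1,
	 * increment = stripe_len * 2 and mirror_num = 3 % 2 + 1 = 2.
	 */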
1862	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1863		offset = map->stripe_len * num;
1864		increment = map->stripe_len * map->num_stripes;
1865		mirror_num = 1;
1866	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1867		int factor = map->num_stripes / map->sub_stripes;
1868		offset = map->stripe_len * (num / map->sub_stripes);
1869		increment = map->stripe_len * factor;
1870		mirror_num = num % map->sub_stripes + 1;
1871	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1872		increment = map->stripe_len;
1873		mirror_num = num % map->num_stripes + 1;
1874	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
1875		increment = map->stripe_len;
1876		mirror_num = num % map->num_stripes + 1;
1877	} else {
1878		increment = map->stripe_len;
1879		mirror_num = 1;
1880	}
1881
1882	path = btrfs_alloc_path();
1883	if (!path)
1884		return -ENOMEM;
1885
1886	/*
1887	 * work on the commit root. The related disk blocks are static as
1888	 * long as COW is applied. This means it is safe to rewrite
1889	 * them to repair disk errors without any race conditions.
1890	 */
1891	path->search_commit_root = 1;
1892	path->skip_locking = 1;
1893
1894	/*
1895	 * trigger the readahead for the extent tree and the csum tree and
1896	 * wait for completion. During readahead, the scrub is officially
1897	 * paused so that it does not hold off transaction commits.
1898	 */
1899	logical = base + offset;
1900
1901	wait_event(sctx->list_wait,
1902		   atomic_read(&sctx->bios_in_flight) == 0);
1903	atomic_inc(&fs_info->scrubs_paused);
1904	wake_up(&fs_info->scrub_pause_wait);
1905
1906	/* FIXME it might be better to start readahead at commit root */
1907	key_start.objectid = logical;
1908	key_start.type = BTRFS_EXTENT_ITEM_KEY;
1909	key_start.offset = (u64)0;
1910	key_end.objectid = base + offset + nstripes * increment;
1911	key_end.type = BTRFS_EXTENT_ITEM_KEY;
1912	key_end.offset = (u64)0;
1913	reada1 = btrfs_reada_add(root, &key_start, &key_end);
1914
1915	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1916	key_start.type = BTRFS_EXTENT_CSUM_KEY;
1917	key_start.offset = logical;
1918	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1919	key_end.type = BTRFS_EXTENT_CSUM_KEY;
1920	key_end.offset = base + offset + nstripes * increment;
1921	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1922
1923	if (!IS_ERR(reada1))
1924		btrfs_reada_wait(reada1);
1925	if (!IS_ERR(reada2))
1926		btrfs_reada_wait(reada2);
1927
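	/*
	 * readahead is done; leave the paused state again, but only after any
	 * pending pause request has been withdrawn
	 */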
1928	mutex_lock(&fs_info->scrub_lock);
1929	while (atomic_read(&fs_info->scrub_pause_req)) {
1930		mutex_unlock(&fs_info->scrub_lock);
1931		wait_event(fs_info->scrub_pause_wait,
1932		   atomic_read(&fs_info->scrub_pause_req) == 0);
1933		mutex_lock(&fs_info->scrub_lock);
1934	}
1935	atomic_dec(&fs_info->scrubs_paused);
1936	mutex_unlock(&fs_info->scrub_lock);
1937	wake_up(&fs_info->scrub_pause_wait);
1938
1939	/*
1940	 * collect all data csums for the stripe to avoid seeking during
1941	 * the scrub. This might currently (with crc32) amount to about 1MB.
1942	 */
1943	blk_start_plug(&plug);
1944
1945	/*
1946	 * now find all extents for each stripe and scrub them
1947	 */
1948	logical = base + offset;
1949	physical = map->stripes[num].physical;
1950	ret = 0;
1951	for (i = 0; i < nstripes; ++i) {
1952		/*
1953		 * canceled?
1954		 */
1955		if (atomic_read(&fs_info->scrub_cancel_req) ||
1956		    atomic_read(&sctx->cancel_req)) {
1957			ret = -ECANCELED;
1958			goto out;
1959		}
1960		/*
1961		 * check to see if we have to pause
1962		 */
1963		if (atomic_read(&fs_info->scrub_pause_req)) {
1964			/* push queued extents */
1965			scrub_submit(sctx);
1966			wait_event(sctx->list_wait,
1967				   atomic_read(&sctx->bios_in_flight) == 0);
1968			atomic_inc(&fs_info->scrubs_paused);
1969			wake_up(&fs_info->scrub_pause_wait);
1970			mutex_lock(&fs_info->scrub_lock);
1971			while (atomic_read(&fs_info->scrub_pause_req)) {
1972				mutex_unlock(&fs_info->scrub_lock);
1973				wait_event(fs_info->scrub_pause_wait,
1974				   atomic_read(&fs_info->scrub_pause_req) == 0);
1975				mutex_lock(&fs_info->scrub_lock);
1976			}
1977			atomic_dec(&fs_info->scrubs_paused);
1978			mutex_unlock(&fs_info->scrub_lock);
1979			wake_up(&fs_info->scrub_pause_wait);
1980		}
1981
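		/*
		 * preload all data csums of this stripe into csum_list;
		 * scrub_find_csum() consumes them in order while the extents
		 * are scrubbed
		 */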
1982		ret = btrfs_lookup_csums_range(csum_root, logical,
1983					       logical + map->stripe_len - 1,
1984					       &sctx->csum_list, 1);
1985		if (ret)
1986			goto out;
1987
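		/*
		 * position the search at the last extent item that starts at
		 * or before 'logical'; if there is no smaller item, the
		 * search below falls back to the next larger one
		 */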
1988		key.objectid = logical;
1989		key.type = BTRFS_EXTENT_ITEM_KEY;
1990		key.offset = (u64)0;
1991
1992		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1993		if (ret < 0)
1994			goto out;
1995		if (ret > 0) {
1996			ret = btrfs_previous_item(root, path, 0,
1997						  BTRFS_EXTENT_ITEM_KEY);
1998			if (ret < 0)
1999				goto out;
2000			if (ret > 0) {
2001				/* there's no smaller item, so stick with the
2002				 * larger one */
2003				btrfs_release_path(path);
2004				ret = btrfs_search_slot(NULL, root, &key,
2005							path, 0, 0);
2006				if (ret < 0)
2007					goto out;
2008			}
2009		}
2010
2011		while (1) {
2012			l = path->nodes[0];
2013			slot = path->slots[0];
2014			if (slot >= btrfs_header_nritems(l)) {
2015				ret = btrfs_next_leaf(root, path);
2016				if (ret == 0)
2017					continue;
2018				if (ret < 0)
2019					goto out;
2020
2021				break;
2022			}
2023			btrfs_item_key_to_cpu(l, &key, slot);
2024
2025			if (key.objectid + key.offset <= logical)
2026				goto next;
2027
2028			if (key.objectid >= logical + map->stripe_len)
2029				break;
2030
2031			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
2032				goto next;
2033
2034			extent = btrfs_item_ptr(l, slot,
2035						struct btrfs_extent_item);
2036			flags = btrfs_extent_flags(l, extent);
2037			generation = btrfs_extent_generation(l, extent);
2038
2039			if (key.objectid < logical &&
2040			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2041				printk(KERN_ERR
2042				       "btrfs scrub: tree block %llu spanning "
2043				       "stripes, ignored. logical=%llu\n",
2044				       (unsigned long long)key.objectid,
2045				       (unsigned long long)logical);
2046				goto next;
2047			}
2048
2049			/*
2050			 * trim extent to this stripe
2051			 */
2052			if (key.objectid < logical) {
2053				key.offset -= logical - key.objectid;
2054				key.objectid = logical;
2055			}
2056			if (key.objectid + key.offset >
2057			    logical + map->stripe_len) {
2058				key.offset = logical + map->stripe_len -
2059					     key.objectid;
2060			}
2061
2062			ret = scrub_extent(sctx, key.objectid, key.offset,
2063					   key.objectid - logical + physical,
2064					   scrub_dev, flags, generation,
2065					   mirror_num);
2066			if (ret)
2067				goto out;
2068
2069next:
2070			path->slots[0]++;
2071		}
2072		btrfs_release_path(path);
2073		logical += increment;
2074		physical += map->stripe_len;
2075		spin_lock(&sctx->stat_lock);
2076		sctx->stat.last_physical = physical;
2077		spin_unlock(&sctx->stat_lock);
2078	}
2079	/* push queued extents */
2080	scrub_submit(sctx);
2081
2082out:
2083	blk_finish_plug(&plug);
2084	btrfs_free_path(path);
2085	return ret < 0 ? ret : 0;
2086}
2087
2088static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2089					  struct btrfs_device *scrub_dev,
2090					  u64 chunk_tree, u64 chunk_objectid,
2091					  u64 chunk_offset, u64 length,
2092					  u64 dev_offset)
2093{
2094	struct btrfs_mapping_tree *map_tree =
2095		&sctx->dev_root->fs_info->mapping_tree;
2096	struct map_lookup *map;
2097	struct extent_map *em;
2098	int i;
2099	int ret = -EINVAL;
2100
2101	read_lock(&map_tree->map_tree.lock);
2102	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2103	read_unlock(&map_tree->map_tree.lock);
2104
2105	if (!em)
2106		return -EINVAL;
2107
2108	map = (struct map_lookup *)em->bdev;
2109	if (em->start != chunk_offset)
2110		goto out;
2111
2112	if (em->len < length)
2113		goto out;
2114
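	/*
	 * scrub every stripe of this chunk that is backed by the given device
	 * at the given physical offset
	 */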
2115	for (i = 0; i < map->num_stripes; ++i) {
2116		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2117		    map->stripes[i].physical == dev_offset) {
2118			ret = scrub_stripe(sctx, map, scrub_dev, i,
2119					   chunk_offset, length);
2120			if (ret)
2121				goto out;
2122		}
2123	}
2124out:
2125	free_extent_map(em);
2126
2127	return ret;
2128}
2129
2130static noinline_for_stack
2131int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2132			   struct btrfs_device *scrub_dev, u64 start, u64 end)
2133{
2134	struct btrfs_dev_extent *dev_extent = NULL;
2135	struct btrfs_path *path;
2136	struct btrfs_root *root = sctx->dev_root;
2137	struct btrfs_fs_info *fs_info = root->fs_info;
2138	u64 length;
2139	u64 chunk_tree;
2140	u64 chunk_objectid;
2141	u64 chunk_offset;
2142	int ret;
2143	int slot;
2144	struct extent_buffer *l;
2145	struct btrfs_key key;
2146	struct btrfs_key found_key;
2147	struct btrfs_block_group_cache *cache;
2148
2149	path = btrfs_alloc_path();
2150	if (!path)
2151		return -ENOMEM;
2152
2153	path->reada = 2;
2154	path->search_commit_root = 1;
2155	path->skip_locking = 1;
2156
2157	key.objectid = scrub_dev->devid;
2158	key.offset = 0ull;
2159	key.type = BTRFS_DEV_EXTENT_KEY;
2160
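	/*
	 * walk all DEV_EXTENT items of this device in the commit root and
	 * scrub the chunks they map to, restricted to the device offset
	 * range [start, end)
	 */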
2161	while (1) {
2162		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2163		if (ret < 0)
2164			break;
2165		if (ret > 0) {
2166			if (path->slots[0] >=
2167			    btrfs_header_nritems(path->nodes[0])) {
2168				ret = btrfs_next_leaf(root, path);
2169				if (ret)
2170					break;
2171			}
2172		}
2173
2174		l = path->nodes[0];
2175		slot = path->slots[0];
2176
2177		btrfs_item_key_to_cpu(l, &found_key, slot);
2178
2179		if (found_key.objectid != scrub_dev->devid)
2180			break;
2181
2182		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2183			break;
2184
2185		if (found_key.offset >= end)
2186			break;
2187
2188		if (found_key.offset < key.offset)
2189			break;
2190
2191		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2192		length = btrfs_dev_extent_length(l, dev_extent);
2193
2194		if (found_key.offset + length <= start) {
2195			key.offset = found_key.offset + length;
2196			btrfs_release_path(path);
2197			continue;
2198		}
2199
2200		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2201		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2202		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2203
2204		/*
2205		 * get a reference on the corresponding block group to prevent
2206		 * the chunk from going away while we scrub it
2207		 */
2208		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2209		if (!cache) {
2210			ret = -ENOENT;
2211			break;
2212		}
2213		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2214				  chunk_offset, length, found_key.offset);
2215		btrfs_put_block_group(cache);
2216		if (ret)
2217			break;
2218
2219		key.offset = found_key.offset + length;
2220		btrfs_release_path(path);
2221	}
2222
2223	btrfs_free_path(path);
2224
2225	/*
2226	 * ret can still be 1 from search_slot or next_leaf,
2227	 * that's not an error
2228	 */
2229	return ret < 0 ? ret : 0;
2230}
2231
2232static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2233					   struct btrfs_device *scrub_dev)
2234{
2235	int	i;
2236	u64	bytenr;
2237	u64	gen;
2238	int	ret;
2239	struct btrfs_root *root = sctx->dev_root;
2240
2241	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2242		return -EIO;
2243
2244	gen = root->fs_info->last_trans_committed;
2245
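	/*
	 * scrub all superblock copies that fit on the device; the generation
	 * of the last committed transaction is used when verifying them
	 */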
2246	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2247		bytenr = btrfs_sb_offset(i);
2248		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2249			break;
2250
2251		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2252				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2253				  NULL, 1);
2254		if (ret)
2255			return ret;
2256	}
2257	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2258
2259	return 0;
2260}
2261
2262/*
2263 * get a reference count on fs_info->scrub_workers; start the workers if necessary
2264 */
2265static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
2266{
2267	int ret = 0;
2268
2269	mutex_lock(&fs_info->scrub_lock);
2270	if (fs_info->scrub_workers_refcnt == 0) {
2271		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2272			   fs_info->thread_pool_size, &fs_info->generic_worker);
2273		fs_info->scrub_workers.idle_thresh = 4;
2274		ret = btrfs_start_workers(&fs_info->scrub_workers);
2275		if (ret)
2276			goto out;
2277	}
2278	++fs_info->scrub_workers_refcnt;
2279out:
2280	mutex_unlock(&fs_info->scrub_lock);
2281
2282	return ret;
2283}
2284
2285static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2286{
2287	mutex_lock(&fs_info->scrub_lock);
2288	if (--fs_info->scrub_workers_refcnt == 0)
2289		btrfs_stop_workers(&fs_info->scrub_workers);
2290	WARN_ON(fs_info->scrub_workers_refcnt < 0);
2291	mutex_unlock(&fs_info->scrub_lock);
2292}
2293
2294int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2295		    u64 end, struct btrfs_scrub_progress *progress,
2296		    int readonly)
2297{
2298	struct scrub_ctx *sctx;
2299	int ret;
2300	struct btrfs_device *dev;
2301
2302	if (btrfs_fs_closing(fs_info))
2303		return -EINVAL;
2304
2305	/*
2306	 * check some assumptions
2307	 */
2308	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2309		printk(KERN_ERR
2310		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2311		       fs_info->chunk_root->nodesize,
2312		       fs_info->chunk_root->leafsize);
2313		return -EINVAL;
2314	}
2315
2316	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2317		/*
2318		 * in this case scrub is unable to calculate the checksum
2319		 * the way it is implemented. Do not handle this
2320		 * situation at all because it won't ever happen.
2321		 */
2322		printk(KERN_ERR
2323		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2324		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2325		return -EINVAL;
2326	}
2327
2328	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2329		/* not supported for data w/o checksums */
2330		printk(KERN_ERR
2331		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %llu) fails\n",
2332		       fs_info->chunk_root->sectorsize,
2333		       (unsigned long long)PAGE_SIZE);
2334		return -EINVAL;
2335	}
2336
2337	if (fs_info->chunk_root->nodesize >
2338	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2339	    fs_info->chunk_root->sectorsize >
2340	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2341		/*
2342		 * would exhaust the array bounds of pagev member in
2343		 * struct scrub_block
2344		 */
2345		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2346		       fs_info->chunk_root->nodesize,
2347		       SCRUB_MAX_PAGES_PER_BLOCK,
2348		       fs_info->chunk_root->sectorsize,
2349		       SCRUB_MAX_PAGES_PER_BLOCK);
2350		return -EINVAL;
2351	}
2352
2353	ret = scrub_workers_get(fs_info);
2354	if (ret)
2355		return ret;
2356
2357	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2358	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2359	if (!dev || dev->missing) {
2360		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2361		scrub_workers_put(fs_info);
2362		return -ENODEV;
2363	}
2364	mutex_lock(&fs_info->scrub_lock);
2365
2366	if (!dev->in_fs_metadata) {
2367		mutex_unlock(&fs_info->scrub_lock);
2368		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2369		scrub_workers_put(fs_info);
2370		return -EIO;
2371	}
2372
2373	if (dev->scrub_device) {
2374		mutex_unlock(&fs_info->scrub_lock);
2375		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2376		scrub_workers_put(fs_info);
2377		return -EINPROGRESS;
2378	}
2379	sctx = scrub_setup_ctx(dev);
2380	if (IS_ERR(sctx)) {
2381		mutex_unlock(&fs_info->scrub_lock);
2382		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2383		scrub_workers_put(fs_info);
2384		return PTR_ERR(sctx);
2385	}
2386	sctx->readonly = readonly;
2387	dev->scrub_device = sctx;
2388
2389	atomic_inc(&fs_info->scrubs_running);
2390	mutex_unlock(&fs_info->scrub_lock);
2391	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2392
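	/*
	 * scrub_super_lock is taken shared here; other code takes it
	 * exclusively via btrfs_scrub_pause_super() and releases it via
	 * btrfs_scrub_continue_super()
	 */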
2393	down_read(&fs_info->scrub_super_lock);
2394	ret = scrub_supers(sctx, dev);
2395	up_read(&fs_info->scrub_super_lock);
2396
2397	if (!ret)
2398		ret = scrub_enumerate_chunks(sctx, dev, start, end);
2399
2400	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2401	atomic_dec(&fs_info->scrubs_running);
2402	wake_up(&fs_info->scrub_pause_wait);
2403
2404	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2405
2406	if (progress)
2407		memcpy(progress, &sctx->stat, sizeof(*progress));
2408
2409	mutex_lock(&fs_info->scrub_lock);
2410	dev->scrub_device = NULL;
2411	mutex_unlock(&fs_info->scrub_lock);
2412
2413	scrub_free_ctx(sctx);
2414	scrub_workers_put(fs_info);
2415
2416	return ret;
2417}
2418
2419void btrfs_scrub_pause(struct btrfs_root *root)
2420{
2421	struct btrfs_fs_info *fs_info = root->fs_info;
2422
2423	mutex_lock(&fs_info->scrub_lock);
2424	atomic_inc(&fs_info->scrub_pause_req);
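	/*
	 * wait until every running scrub has acknowledged the pause request
	 * by entering the paused state
	 */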
2425	while (atomic_read(&fs_info->scrubs_paused) !=
2426	       atomic_read(&fs_info->scrubs_running)) {
2427		mutex_unlock(&fs_info->scrub_lock);
2428		wait_event(fs_info->scrub_pause_wait,
2429			   atomic_read(&fs_info->scrubs_paused) ==
2430			   atomic_read(&fs_info->scrubs_running));
2431		mutex_lock(&fs_info->scrub_lock);
2432	}
2433	mutex_unlock(&fs_info->scrub_lock);
2434}
2435
2436void btrfs_scrub_continue(struct btrfs_root *root)
2437{
2438	struct btrfs_fs_info *fs_info = root->fs_info;
2439
2440	atomic_dec(&fs_info->scrub_pause_req);
2441	wake_up(&fs_info->scrub_pause_wait);
2442}
2443
2444void btrfs_scrub_pause_super(struct btrfs_root *root)
2445{
2446	down_write(&root->fs_info->scrub_super_lock);
2447}
2448
2449void btrfs_scrub_continue_super(struct btrfs_root *root)
2450{
2451	up_write(&root->fs_info->scrub_super_lock);
2452}
2453
2454int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2455{
2456	mutex_lock(&fs_info->scrub_lock);
2457	if (!atomic_read(&fs_info->scrubs_running)) {
2458		mutex_unlock(&fs_info->scrub_lock);
2459		return -ENOTCONN;
2460	}
2461
2462	atomic_inc(&fs_info->scrub_cancel_req);
2463	while (atomic_read(&fs_info->scrubs_running)) {
2464		mutex_unlock(&fs_info->scrub_lock);
2465		wait_event(fs_info->scrub_pause_wait,
2466			   atomic_read(&fs_info->scrubs_running) == 0);
2467		mutex_lock(&fs_info->scrub_lock);
2468	}
2469	atomic_dec(&fs_info->scrub_cancel_req);
2470	mutex_unlock(&fs_info->scrub_lock);
2471
2472	return 0;
2473}
2474
2475int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2476			   struct btrfs_device *dev)
2477{
2478	struct scrub_ctx *sctx;
2479
2480	mutex_lock(&fs_info->scrub_lock);
2481	sctx = dev->scrub_device;
2482	if (!sctx) {
2483		mutex_unlock(&fs_info->scrub_lock);
2484		return -ENOTCONN;
2485	}
2486	atomic_inc(&sctx->cancel_req);
2487	while (dev->scrub_device) {
2488		mutex_unlock(&fs_info->scrub_lock);
2489		wait_event(fs_info->scrub_pause_wait,
2490			   dev->scrub_device == NULL);
2491		mutex_lock(&fs_info->scrub_lock);
2492	}
2493	mutex_unlock(&fs_info->scrub_lock);
2494
2495	return 0;
2496}
2497
2498int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2499{
2500	struct btrfs_fs_info *fs_info = root->fs_info;
2501	struct btrfs_device *dev;
2502	int ret;
2503
2504	/*
2505	 * we have to hold the device_list_mutex here so the device
2506	 * does not go away in cancel_dev. FIXME: find a better solution
2507	 */
2508	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2509	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2510	if (!dev) {
2511		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2512		return -ENODEV;
2513	}
2514	ret = btrfs_scrub_cancel_dev(fs_info, dev);
2515	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2516
2517	return ret;
2518}
2519
2520int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2521			 struct btrfs_scrub_progress *progress)
2522{
2523	struct btrfs_device *dev;
2524	struct scrub_ctx *sctx = NULL;
2525
2526	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2527	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2528	if (dev)
2529		sctx = dev->scrub_device;
2530	if (sctx)
2531		memcpy(progress, &sctx->stat, sizeof(*progress));
2532	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2533
2534	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
2535}
2536