scrub.c revision c170bbb45febc03ac4d34ba2b8bb55e06104b7e7
1/*
2 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
21#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
28#include "dev-replace.h"
29#include "check-integrity.h"
30#include "rcu-string.h"
31#include "raid56.h"
32
33/*
34 * This is only the first step towards a full-featured scrub. It reads all
35 * extents and super blocks and verifies the checksums. In case a bad checksum
36 * is found or the extent cannot be read, good data will be written back if
37 * any can be found.
38 *
39 * Future enhancements:
40 *  - In case an unrepairable extent is encountered, track which files are
41 *    affected and report them
42 *  - track and record media errors, throw out bad devices
43 *  - add a mode to also read unallocated space
44 */
45
46struct scrub_block;
47struct scrub_ctx;
48
49/*
50 * the following three values only influence performance.
51 * The last one configures the number of parallel and outstanding I/O
52 * operations. The first two values configure an upper limit for the number
53 * of (dynamically allocated) pages that are added to a bio.
54 */
55#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
56#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
57#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
58
59/*
60 * the following value times PAGE_SIZE needs to be large enough to match the
61 * largest node/leaf/sector size that shall be supported.
62 * Values larger than BTRFS_STRIPE_LEN are not supported.
63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
65
66struct scrub_page {
67	struct scrub_block	*sblock;
68	struct page		*page;
69	struct btrfs_device	*dev;
70	u64			flags;  /* extent flags */
71	u64			generation;
72	u64			logical;
73	u64			physical;
74	u64			physical_for_dev_replace;
75	atomic_t		ref_count;
76	struct {
77		unsigned int	mirror_num:8;
78		unsigned int	have_csum:1;
79		unsigned int	io_error:1;
80	};
81	u8			csum[BTRFS_CSUM_SIZE];
82};
83
84struct scrub_bio {
85	int			index;
86	struct scrub_ctx	*sctx;
87	struct btrfs_device	*dev;
88	struct bio		*bio;
89	int			err;
90	u64			logical;
91	u64			physical;
92#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
93	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
94#else
95	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
96#endif
97	int			page_count;
98	int			next_free;
99	struct btrfs_work	work;
100};
101
102struct scrub_block {
103	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
104	int			page_count;
105	atomic_t		outstanding_pages;
106	atomic_t		ref_count; /* free mem on transition to zero */
107	struct scrub_ctx	*sctx;
108	struct {
109		unsigned int	header_error:1;
110		unsigned int	checksum_error:1;
111		unsigned int	no_io_error_seen:1;
112		unsigned int	generation_error:1; /* also sets header_error */
113	};
114};
115
116struct scrub_wr_ctx {
117	struct scrub_bio *wr_curr_bio;
118	struct btrfs_device *tgtdev;
119	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
120	atomic_t flush_all_writes;
121	struct mutex wr_lock;
122};
123
124struct scrub_ctx {
125	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
126	struct btrfs_root	*dev_root;
127	int			first_free;
128	int			curr;
129	atomic_t		bios_in_flight;
130	atomic_t		workers_pending;
131	spinlock_t		list_lock;
132	wait_queue_head_t	list_wait;
133	u16			csum_size;
134	struct list_head	csum_list;
135	atomic_t		cancel_req;
136	int			readonly;
137	int			pages_per_rd_bio;
138	u32			sectorsize;
139	u32			nodesize;
140	u32			leafsize;
141
142	int			is_dev_replace;
143	struct scrub_wr_ctx	wr_ctx;
144
145	/*
146	 * statistics
147	 */
148	struct btrfs_scrub_progress stat;
149	spinlock_t		stat_lock;
150};
151
152struct scrub_fixup_nodatasum {
153	struct scrub_ctx	*sctx;
154	struct btrfs_device	*dev;
155	u64			logical;
156	struct btrfs_root	*root;
157	struct btrfs_work	work;
158	int			mirror_num;
159};
160
161struct scrub_nocow_inode {
162	u64			inum;
163	u64			offset;
164	u64			root;
165	struct list_head	list;
166};
167
168struct scrub_copy_nocow_ctx {
169	struct scrub_ctx	*sctx;
170	u64			logical;
171	u64			len;
172	int			mirror_num;
173	u64			physical_for_dev_replace;
174	struct list_head	inodes;
175	struct btrfs_work	work;
176};
177
178struct scrub_warning {
179	struct btrfs_path	*path;
180	u64			extent_item_size;
181	char			*scratch_buf;
182	char			*msg_buf;
183	const char		*errstr;
184	sector_t		sector;
185	u64			logical;
186	struct btrfs_device	*dev;
187	int			msg_bufsize;
188	int			scratch_bufsize;
189};
190
191
192static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
193static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
194static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
195static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
196static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
197static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
198				     struct btrfs_fs_info *fs_info,
199				     struct scrub_block *original_sblock,
200				     u64 length, u64 logical,
201				     struct scrub_block *sblocks_for_recheck);
202static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
203				struct scrub_block *sblock, int is_metadata,
204				int have_csum, u8 *csum, u64 generation,
205				u16 csum_size);
206static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
207					 struct scrub_block *sblock,
208					 int is_metadata, int have_csum,
209					 const u8 *csum, u64 generation,
210					 u16 csum_size);
211static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212					     struct scrub_block *sblock_good,
213					     int force_write);
214static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
215					    struct scrub_block *sblock_good,
216					    int page_num, int force_write);
217static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
218static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
219					   int page_num);
220static int scrub_checksum_data(struct scrub_block *sblock);
221static int scrub_checksum_tree_block(struct scrub_block *sblock);
222static int scrub_checksum_super(struct scrub_block *sblock);
223static void scrub_block_get(struct scrub_block *sblock);
224static void scrub_block_put(struct scrub_block *sblock);
225static void scrub_page_get(struct scrub_page *spage);
226static void scrub_page_put(struct scrub_page *spage);
227static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
228				    struct scrub_page *spage);
229static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
230		       u64 physical, struct btrfs_device *dev, u64 flags,
231		       u64 gen, int mirror_num, u8 *csum, int force,
232		       u64 physical_for_dev_replace);
233static void scrub_bio_end_io(struct bio *bio, int err);
234static void scrub_bio_end_io_worker(struct btrfs_work *work);
235static void scrub_block_complete(struct scrub_block *sblock);
236static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
237			       u64 extent_logical, u64 extent_len,
238			       u64 *extent_physical,
239			       struct btrfs_device **extent_dev,
240			       int *extent_mirror_num);
241static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
242			      struct scrub_wr_ctx *wr_ctx,
243			      struct btrfs_fs_info *fs_info,
244			      struct btrfs_device *dev,
245			      int is_dev_replace);
246static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
247static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
248				    struct scrub_page *spage);
249static void scrub_wr_submit(struct scrub_ctx *sctx);
250static void scrub_wr_bio_end_io(struct bio *bio, int err);
251static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
252static int write_page_nocow(struct scrub_ctx *sctx,
253			    u64 physical_for_dev_replace, struct page *page);
254static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
255				      struct scrub_copy_nocow_ctx *ctx);
256static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
257			    int mirror_num, u64 physical_for_dev_replace);
258static void copy_nocow_pages_worker(struct btrfs_work *work);
259
260
261static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
262{
263	atomic_inc(&sctx->bios_in_flight);
264}
265
266static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
267{
268	atomic_dec(&sctx->bios_in_flight);
269	wake_up(&sctx->list_wait);
270}
271
272/*
273 * used for workers that require transaction commits (i.e., for the
274 * NOCOW case)
275 */
276static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
277{
278	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
279
280	/*
281	 * increment scrubs_running to prevent cancel requests from
282	 * completing as long as a worker is running. we must also
283	 * increment scrubs_paused to prevent deadlocking on pause
284	 * requests used for transaction commits (as the worker uses a
285	 * transaction context). it is safe to regard the worker
286	 * as paused for all practical purposes. effectively, we only
287	 * prevent cancellation requests from completing.
288	 */
289	mutex_lock(&fs_info->scrub_lock);
290	atomic_inc(&fs_info->scrubs_running);
291	atomic_inc(&fs_info->scrubs_paused);
292	mutex_unlock(&fs_info->scrub_lock);
293	atomic_inc(&sctx->workers_pending);
294}
295
296/* used for workers that require transaction commits */
297static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
298{
299	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
300
301	/*
302	 * see scrub_pending_trans_workers_inc() for why we're pretending
303	 * to be paused in the scrub counters
304	 */
305	mutex_lock(&fs_info->scrub_lock);
306	atomic_dec(&fs_info->scrubs_running);
307	atomic_dec(&fs_info->scrubs_paused);
308	mutex_unlock(&fs_info->scrub_lock);
309	atomic_dec(&sctx->workers_pending);
310	wake_up(&fs_info->scrub_pause_wait);
311	wake_up(&sctx->list_wait);
312}
313
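/* free all checksums still queued on the context's csum_list */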
314static void scrub_free_csums(struct scrub_ctx *sctx)
315{
316	while (!list_empty(&sctx->csum_list)) {
317		struct btrfs_ordered_sum *sum;
318		sum = list_first_entry(&sctx->csum_list,
319				       struct btrfs_ordered_sum, list);
320		list_del(&sum->list);
321		kfree(sum);
322	}
323}
324
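/*
 * free a scrub context, including a partially filled read bio that may be
 * left over when the scrub was cancelled
 */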
325static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
326{
327	int i;
328
329	if (!sctx)
330		return;
331
332	scrub_free_wr_ctx(&sctx->wr_ctx);
333
334	/* this can happen when scrub is cancelled */
335	if (sctx->curr != -1) {
336		struct scrub_bio *sbio = sctx->bios[sctx->curr];
337
338		for (i = 0; i < sbio->page_count; i++) {
339			WARN_ON(!sbio->pagev[i]->page);
340			scrub_block_put(sbio->pagev[i]->sblock);
341		}
342		bio_put(sbio->bio);
343	}
344
345	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
346		struct scrub_bio *sbio = sctx->bios[i];
347
348		if (!sbio)
349			break;
350		kfree(sbio);
351	}
352
353	scrub_free_csums(sctx);
354	kfree(sctx);
355}
356
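/*
 * allocate and initialize a scrub context for one device: the per-context
 * read bios are linked into a free list and the write context for the
 * dev-replace target is set up
 */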
357static noinline_for_stack
358struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
359{
360	struct scrub_ctx *sctx;
361	int		i;
362	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
363	int pages_per_rd_bio;
364	int ret;
365
366	/*
367	 * the setting of pages_per_rd_bio is correct for scrub but might
368	 * be wrong for the dev_replace code where we might read from
369	 * different devices in the initial huge bios. However, that
370	 * code is able to correctly handle the case when adding a page
371	 * to a bio fails.
372	 */
373	if (dev->bdev)
374		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
375					 bio_get_nr_vecs(dev->bdev));
376	else
377		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
378	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
379	if (!sctx)
380		goto nomem;
381	sctx->is_dev_replace = is_dev_replace;
382	sctx->pages_per_rd_bio = pages_per_rd_bio;
383	sctx->curr = -1;
384	sctx->dev_root = dev->dev_root;
385	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
386		struct scrub_bio *sbio;
387
388		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
389		if (!sbio)
390			goto nomem;
391		sctx->bios[i] = sbio;
392
393		sbio->index = i;
394		sbio->sctx = sctx;
395		sbio->page_count = 0;
396		sbio->work.func = scrub_bio_end_io_worker;
397
398		if (i != SCRUB_BIOS_PER_SCTX - 1)
399			sctx->bios[i]->next_free = i + 1;
400		else
401			sctx->bios[i]->next_free = -1;
402	}
403	sctx->first_free = 0;
404	sctx->nodesize = dev->dev_root->nodesize;
405	sctx->leafsize = dev->dev_root->leafsize;
406	sctx->sectorsize = dev->dev_root->sectorsize;
407	atomic_set(&sctx->bios_in_flight, 0);
408	atomic_set(&sctx->workers_pending, 0);
409	atomic_set(&sctx->cancel_req, 0);
410	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
411	INIT_LIST_HEAD(&sctx->csum_list);
412
413	spin_lock_init(&sctx->list_lock);
414	spin_lock_init(&sctx->stat_lock);
415	init_waitqueue_head(&sctx->list_wait);
416
417	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
418				 fs_info->dev_replace.tgtdev, is_dev_replace);
419	if (ret) {
420		scrub_free_ctx(sctx);
421		return ERR_PTR(ret);
422	}
423	return sctx;
424
425nomem:
426	scrub_free_ctx(sctx);
427	return ERR_PTR(-ENOMEM);
428}
429
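/*
 * iterate_extent_inodes() callback: resolve the inode to its file paths and
 * print one warning line per path
 */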
430static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
431				     void *warn_ctx)
432{
433	u64 isize;
434	u32 nlink;
435	int ret;
436	int i;
437	struct extent_buffer *eb;
438	struct btrfs_inode_item *inode_item;
439	struct scrub_warning *swarn = warn_ctx;
440	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
441	struct inode_fs_paths *ipath = NULL;
442	struct btrfs_root *local_root;
443	struct btrfs_key root_key;
444
445	root_key.objectid = root;
446	root_key.type = BTRFS_ROOT_ITEM_KEY;
447	root_key.offset = (u64)-1;
448	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
449	if (IS_ERR(local_root)) {
450		ret = PTR_ERR(local_root);
451		goto err;
452	}
453
454	ret = inode_item_info(inum, 0, local_root, swarn->path);
455	if (ret) {
456		btrfs_release_path(swarn->path);
457		goto err;
458	}
459
460	eb = swarn->path->nodes[0];
461	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
462					struct btrfs_inode_item);
463	isize = btrfs_inode_size(eb, inode_item);
464	nlink = btrfs_inode_nlink(eb, inode_item);
465	btrfs_release_path(swarn->path);
466
467	ipath = init_ipath(4096, local_root, swarn->path);
468	if (IS_ERR(ipath)) {
469		ret = PTR_ERR(ipath);
470		ipath = NULL;
471		goto err;
472	}
473	ret = paths_from_inode(inum, ipath);
474
475	if (ret < 0)
476		goto err;
477
478	/*
479	 * we deliberately ignore the fact that ipath might have been too
480	 * small to hold all of the paths here
481	 */
482	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
483		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
484			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
485			"length %llu, links %u (path: %s)\n", swarn->errstr,
486			swarn->logical, rcu_str_deref(swarn->dev->name),
487			(unsigned long long)swarn->sector, root, inum, offset,
488			min(isize - offset, (u64)PAGE_SIZE), nlink,
489			(char *)(unsigned long)ipath->fspath->val[i]);
490
491	free_ipath(ipath);
492	return 0;
493
494err:
495	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
496		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
497		"resolving failed with ret=%d\n", swarn->errstr,
498		swarn->logical, rcu_str_deref(swarn->dev->name),
499		(unsigned long long)swarn->sector, root, inum, offset, ret);
500
501	free_ipath(ipath);
502	return 0;
503}
504
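/*
 * print a warning for an errored block: for metadata, resolve and print the
 * tree backrefs; for data, resolve and print the affected inodes and paths
 */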
505static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
506{
507	struct btrfs_device *dev;
508	struct btrfs_fs_info *fs_info;
509	struct btrfs_path *path;
510	struct btrfs_key found_key;
511	struct extent_buffer *eb;
512	struct btrfs_extent_item *ei;
513	struct scrub_warning swarn;
514	unsigned long ptr = 0;
515	u64 extent_item_pos;
516	u64 flags = 0;
517	u64 ref_root;
518	u32 item_size;
519	u8 ref_level;
520	const int bufsize = 4096;
521	int ret;
522
523	WARN_ON(sblock->page_count < 1);
524	dev = sblock->pagev[0]->dev;
525	fs_info = sblock->sctx->dev_root->fs_info;
526
527	path = btrfs_alloc_path();
528
529	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
530	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
531	swarn.sector = (sblock->pagev[0]->physical) >> 9;
532	swarn.logical = sblock->pagev[0]->logical;
533	swarn.errstr = errstr;
534	swarn.dev = NULL;
535	swarn.msg_bufsize = bufsize;
536	swarn.scratch_bufsize = bufsize;
537
538	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
539		goto out;
540
541	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
542				  &flags);
543	if (ret < 0)
544		goto out;
545
546	extent_item_pos = swarn.logical - found_key.objectid;
547	swarn.extent_item_size = found_key.offset;
548
549	eb = path->nodes[0];
550	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
551	item_size = btrfs_item_size_nr(eb, path->slots[0]);
552
553	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
554		do {
555			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
556							&ref_root, &ref_level);
557			printk_in_rcu(KERN_WARNING
558				"btrfs: %s at logical %llu on dev %s, "
559				"sector %llu: metadata %s (level %d) in tree "
560				"%llu\n", errstr, swarn.logical,
561				rcu_str_deref(dev->name),
562				(unsigned long long)swarn.sector,
563				ref_level ? "node" : "leaf",
564				ret < 0 ? -1 : ref_level,
565				ret < 0 ? -1 : ref_root);
566		} while (ret != 1);
567		btrfs_release_path(path);
568	} else {
569		btrfs_release_path(path);
570		swarn.path = path;
571		swarn.dev = dev;
572		iterate_extent_inodes(fs_info, found_key.objectid,
573					extent_item_pos, 1,
574					scrub_print_warning_inode, &swarn);
575	}
576
577out:
578	btrfs_free_path(path);
579	kfree(swarn.scratch_buf);
580	kfree(swarn.msg_buf);
581}
582
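/*
 * iterate_inodes_from_logical() callback: re-read the affected page from the
 * bad mirror through the regular read path so that the read-repair code
 * rewrites the sector, or repair an uptodate, clean page cache page in place
 */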
583static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
584{
585	struct page *page = NULL;
586	unsigned long index;
587	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
588	int ret;
589	int corrected = 0;
590	struct btrfs_key key;
591	struct inode *inode = NULL;
592	struct btrfs_fs_info *fs_info;
593	u64 end = offset + PAGE_SIZE - 1;
594	struct btrfs_root *local_root;
595	int srcu_index;
596
597	key.objectid = root;
598	key.type = BTRFS_ROOT_ITEM_KEY;
599	key.offset = (u64)-1;
600
601	fs_info = fixup->root->fs_info;
602	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
603
604	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
605	if (IS_ERR(local_root)) {
606		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
607		return PTR_ERR(local_root);
608	}
609
610	key.type = BTRFS_INODE_ITEM_KEY;
611	key.objectid = inum;
612	key.offset = 0;
613	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
614	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
615	if (IS_ERR(inode))
616		return PTR_ERR(inode);
617
618	index = offset >> PAGE_CACHE_SHIFT;
619
620	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
621	if (!page) {
622		ret = -ENOMEM;
623		goto out;
624	}
625
626	if (PageUptodate(page)) {
627		if (PageDirty(page)) {
628			/*
629			 * we need to write the data to the defective sector. the
630			 * data that was in that sector is not in memory,
631			 * because the page was modified. we must not write the
632			 * modified page to that sector.
633			 *
634			 * TODO: what could be done here: wait for the delalloc
635			 *       runner to write out that page (might involve
636			 *       COW) and see whether the sector is still
637			 *       referenced afterwards.
638			 *
639			 * For the time being, we'll treat this error as
640			 * uncorrectable, although there is a chance that a
641			 * later scrub will find the bad sector again at a time
642			 * when there is no dirty page in memory.
643			 */
644			ret = -EIO;
645			goto out;
646		}
647		fs_info = BTRFS_I(inode)->root->fs_info;
648		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
649					fixup->logical, page,
650					fixup->mirror_num);
651		unlock_page(page);
652		corrected = !ret;
653	} else {
654		/*
655		 * we need to get good data first. the general readpage path
656		 * will call repair_io_failure for us, we just have to make
657		 * sure we read the bad mirror.
658		 */
659		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
660					EXTENT_DAMAGED, GFP_NOFS);
661		if (ret) {
662			/* set_extent_bits should give proper error */
663			WARN_ON(ret > 0);
664			if (ret > 0)
665				ret = -EFAULT;
666			goto out;
667		}
668
669		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
670						btrfs_get_extent,
671						fixup->mirror_num);
672		wait_on_page_locked(page);
673
674		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
675						end, EXTENT_DAMAGED, 0, NULL);
676		if (!corrected)
677			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
678						EXTENT_DAMAGED, GFP_NOFS);
679	}
680
681out:
682	if (page)
683		put_page(page);
684	if (inode)
685		iput(inode);
686
687	if (ret < 0)
688		return ret;
689
690	if (ret == 0 && corrected) {
691		/*
692		 * we only need to call readpage for one of the inodes belonging
693		 * to this extent. so make iterate_extent_inodes stop
694		 */
695		return 1;
696	}
697
698	return -EIO;
699}
700
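/*
 * worker for the nodatasum fixup case: within a transaction context, trigger
 * a repair read through the regular read path for an inode that references
 * the bad logical address and update the error statistics
 */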
701static void scrub_fixup_nodatasum(struct btrfs_work *work)
702{
703	int ret;
704	struct scrub_fixup_nodatasum *fixup;
705	struct scrub_ctx *sctx;
706	struct btrfs_trans_handle *trans = NULL;
707	struct btrfs_fs_info *fs_info;
708	struct btrfs_path *path;
709	int uncorrectable = 0;
710
711	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
712	sctx = fixup->sctx;
713	fs_info = fixup->root->fs_info;
714
715	path = btrfs_alloc_path();
716	if (!path) {
717		spin_lock(&sctx->stat_lock);
718		++sctx->stat.malloc_errors;
719		spin_unlock(&sctx->stat_lock);
720		uncorrectable = 1;
721		goto out;
722	}
723
724	trans = btrfs_join_transaction(fixup->root);
725	if (IS_ERR(trans)) {
726		uncorrectable = 1;
727		goto out;
728	}
729
730	/*
731	 * the idea is to trigger a regular read through the standard path. we
732	 * read a page from the (failed) logical address by specifying the
733	 * corresponding copynum of the failed sector. thus, that readpage is
734	 * expected to fail.
735	 * that is the point where on-the-fly error correction will kick in
736	 * (once it's finished) and rewrite the failed sector if a good copy
737	 * can be found.
738	 */
739	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
740						path, scrub_fixup_readpage,
741						fixup);
742	if (ret < 0) {
743		uncorrectable = 1;
744		goto out;
745	}
746	WARN_ON(ret != 1);
747
748	spin_lock(&sctx->stat_lock);
749	++sctx->stat.corrected_errors;
750	spin_unlock(&sctx->stat_lock);
751
752out:
753	if (trans && !IS_ERR(trans))
754		btrfs_end_transaction(trans, fixup->root);
755	if (uncorrectable) {
756		spin_lock(&sctx->stat_lock);
757		++sctx->stat.uncorrectable_errors;
758		spin_unlock(&sctx->stat_lock);
759		btrfs_dev_replace_stats_inc(
760			&sctx->dev_root->fs_info->dev_replace.
761			num_uncorrectable_read_errors);
762		printk_ratelimited_in_rcu(KERN_ERR
763			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
764			fixup->logical, rcu_str_deref(fixup->dev->name));
765	}
766
767	btrfs_free_path(path);
768	kfree(fixup);
769
770	scrub_pending_trans_workers_dec(sctx);
771}
772
773/*
774 * scrub_handle_errored_block gets called when either verification of the
775 * pages failed or the bio failed to read, e.g. with EIO. In the latter
776 * case, this function handles all pages in the bio, even though only one
777 * may be bad.
778 * The goal of this function is to repair the errored block by using the
779 * contents of one of the mirrors.
780 */
781static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
782{
783	struct scrub_ctx *sctx = sblock_to_check->sctx;
784	struct btrfs_device *dev;
785	struct btrfs_fs_info *fs_info;
786	u64 length;
787	u64 logical;
788	u64 generation;
789	unsigned int failed_mirror_index;
790	unsigned int is_metadata;
791	unsigned int have_csum;
792	u8 *csum;
793	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
794	struct scrub_block *sblock_bad;
795	int ret;
796	int mirror_index;
797	int page_num;
798	int success;
799	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
800				      DEFAULT_RATELIMIT_BURST);
801
802	BUG_ON(sblock_to_check->page_count < 1);
803	fs_info = sctx->dev_root->fs_info;
804	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
805		/*
806		 * if we find an error in a super block, we just report it;
807		 * super blocks get rewritten with the next transaction
808		 * commit anyway
809		 */
810		spin_lock(&sctx->stat_lock);
811		++sctx->stat.super_errors;
812		spin_unlock(&sctx->stat_lock);
813		return 0;
814	}
815	length = sblock_to_check->page_count * PAGE_SIZE;
816	logical = sblock_to_check->pagev[0]->logical;
817	generation = sblock_to_check->pagev[0]->generation;
818	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
819	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
820	is_metadata = !(sblock_to_check->pagev[0]->flags &
821			BTRFS_EXTENT_FLAG_DATA);
822	have_csum = sblock_to_check->pagev[0]->have_csum;
823	csum = sblock_to_check->pagev[0]->csum;
824	dev = sblock_to_check->pagev[0]->dev;
825
826	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
827		sblocks_for_recheck = NULL;
828		goto nodatasum_case;
829	}
830
831	/*
832	 * read all mirrors one after the other. This includes
833	 * re-reading the extent or metadata block that failed (the
834	 * reason this fixup code is called at all), this time page
835	 * by page in order to know which pages caused I/O errors
836	 * and which ones are good (for all mirrors).
837	 * The goal is to handle the situation when more than one
838	 * mirror contains I/O errors, but the errors do not
839	 * overlap, i.e. the data can be repaired by selecting the
840	 * pages from those mirrors without I/O error on the
841	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
842	 * would be that mirror #1 has an I/O error on the first page,
843	 * the second page is good, and mirror #2 has an I/O error on
844	 * the second page, but the first page is good.
845	 * Then the first page of the first mirror can be repaired by
846	 * taking the first page of the second mirror, and the
847	 * second page of the second mirror can be repaired by
848	 * copying the contents of the 2nd page of the 1st mirror.
849	 * One more note: if the pages of one mirror contain I/O
850	 * errors, the checksum cannot be verified. In order to get
851	 * the best data for repairing, the first attempt is to find
852	 * a mirror without I/O errors and with a validated checksum.
853	 * Only if this is not possible, the pages are picked from
854	 * mirrors with I/O errors without considering the checksum.
855	 * If the latter is the case, at the end, the checksum of the
856	 * repaired area is verified in order to correctly maintain
857	 * the statistics.
858	 */
859
860	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
861				     sizeof(*sblocks_for_recheck),
862				     GFP_NOFS);
863	if (!sblocks_for_recheck) {
864		spin_lock(&sctx->stat_lock);
865		sctx->stat.malloc_errors++;
866		sctx->stat.read_errors++;
867		sctx->stat.uncorrectable_errors++;
868		spin_unlock(&sctx->stat_lock);
869		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
870		goto out;
871	}
872
873	/* setup the context, map the logical blocks and alloc the pages */
874	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
875					logical, sblocks_for_recheck);
876	if (ret) {
877		spin_lock(&sctx->stat_lock);
878		sctx->stat.read_errors++;
879		sctx->stat.uncorrectable_errors++;
880		spin_unlock(&sctx->stat_lock);
881		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
882		goto out;
883	}
884	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
885	sblock_bad = sblocks_for_recheck + failed_mirror_index;
886
887	/* build and submit the bios for the failed mirror, check checksums */
888	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
889			    csum, generation, sctx->csum_size);
890
891	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
892	    sblock_bad->no_io_error_seen) {
893		/*
894		 * the error disappeared after reading page by page, or
895		 * the area was part of a huge bio and other parts of the
896		 * bio caused I/O errors, or the block layer merged several
897		 * read requests into one and the error is caused by a
898		 * different bio (usually one of the two latter cases is
899		 * the cause)
900		 */
901		spin_lock(&sctx->stat_lock);
902		sctx->stat.unverified_errors++;
903		spin_unlock(&sctx->stat_lock);
904
905		if (sctx->is_dev_replace)
906			scrub_write_block_to_dev_replace(sblock_bad);
907		goto out;
908	}
909
910	if (!sblock_bad->no_io_error_seen) {
911		spin_lock(&sctx->stat_lock);
912		sctx->stat.read_errors++;
913		spin_unlock(&sctx->stat_lock);
914		if (__ratelimit(&_rs))
915			scrub_print_warning("i/o error", sblock_to_check);
916		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
917	} else if (sblock_bad->checksum_error) {
918		spin_lock(&sctx->stat_lock);
919		sctx->stat.csum_errors++;
920		spin_unlock(&sctx->stat_lock);
921		if (__ratelimit(&_rs))
922			scrub_print_warning("checksum error", sblock_to_check);
923		btrfs_dev_stat_inc_and_print(dev,
924					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
925	} else if (sblock_bad->header_error) {
926		spin_lock(&sctx->stat_lock);
927		sctx->stat.verify_errors++;
928		spin_unlock(&sctx->stat_lock);
929		if (__ratelimit(&_rs))
930			scrub_print_warning("checksum/header error",
931					    sblock_to_check);
932		if (sblock_bad->generation_error)
933			btrfs_dev_stat_inc_and_print(dev,
934				BTRFS_DEV_STAT_GENERATION_ERRS);
935		else
936			btrfs_dev_stat_inc_and_print(dev,
937				BTRFS_DEV_STAT_CORRUPTION_ERRS);
938	}
939
940	if (sctx->readonly && !sctx->is_dev_replace)
941		goto did_not_correct_error;
942
943	if (!is_metadata && !have_csum) {
944		struct scrub_fixup_nodatasum *fixup_nodatasum;
945
946nodatasum_case:
947		WARN_ON(sctx->is_dev_replace);
948
949		/*
950		 * !is_metadata and !have_csum mean that the data might
951		 * not be COW'ed and might be modified concurrently.
952		 * The general strategy of working on the commit root
953		 * does not help in the case when COW is not
954		 * used.
955		 */
956		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
957		if (!fixup_nodatasum)
958			goto did_not_correct_error;
959		fixup_nodatasum->sctx = sctx;
960		fixup_nodatasum->dev = dev;
961		fixup_nodatasum->logical = logical;
962		fixup_nodatasum->root = fs_info->extent_root;
963		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
964		scrub_pending_trans_workers_inc(sctx);
965		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
966		btrfs_queue_worker(&fs_info->scrub_workers,
967				   &fixup_nodatasum->work);
968		goto out;
969	}
970
971	/*
972	 * now build and submit the bios for the other mirrors, check
973	 * checksums.
974	 * First try to pick the mirror which is completely without I/O
975	 * errors and also does not have a checksum error.
976	 * If one is found, and if a checksum is present, the full block
977	 * that is known to contain an error is rewritten. Afterwards
978	 * the block is known to be corrected.
979	 * If a mirror is found which is completely correct, and no
980	 * checksum is present, only those pages are rewritten that had
981	 * an I/O error in the block to be repaired, since it cannot be
982	 * determined which copy of the other pages is better (and it
983	 * could happen otherwise that a correct page would be
984	 * overwritten by a bad one).
985	 */
986	for (mirror_index = 0;
987	     mirror_index < BTRFS_MAX_MIRRORS &&
988	     sblocks_for_recheck[mirror_index].page_count > 0;
989	     mirror_index++) {
990		struct scrub_block *sblock_other;
991
992		if (mirror_index == failed_mirror_index)
993			continue;
994		sblock_other = sblocks_for_recheck + mirror_index;
995
996		/* build and submit the bios, check checksums */
997		scrub_recheck_block(fs_info, sblock_other, is_metadata,
998				    have_csum, csum, generation,
999				    sctx->csum_size);
1000
1001		if (!sblock_other->header_error &&
1002		    !sblock_other->checksum_error &&
1003		    sblock_other->no_io_error_seen) {
1004			if (sctx->is_dev_replace) {
1005				scrub_write_block_to_dev_replace(sblock_other);
1006			} else {
1007				int force_write = is_metadata || have_csum;
1008
1009				ret = scrub_repair_block_from_good_copy(
1010						sblock_bad, sblock_other,
1011						force_write);
1012			}
1013			if (0 == ret)
1014				goto corrected_error;
1015		}
1016	}
1017
1018	/*
1019	 * for dev_replace, pick good pages and write to the target device.
1020	 */
1021	if (sctx->is_dev_replace) {
1022		success = 1;
1023		for (page_num = 0; page_num < sblock_bad->page_count;
1024		     page_num++) {
1025			int sub_success;
1026
1027			sub_success = 0;
1028			for (mirror_index = 0;
1029			     mirror_index < BTRFS_MAX_MIRRORS &&
1030			     sblocks_for_recheck[mirror_index].page_count > 0;
1031			     mirror_index++) {
1032				struct scrub_block *sblock_other =
1033					sblocks_for_recheck + mirror_index;
1034				struct scrub_page *page_other =
1035					sblock_other->pagev[page_num];
1036
1037				if (!page_other->io_error) {
1038					ret = scrub_write_page_to_dev_replace(
1039							sblock_other, page_num);
1040					if (ret == 0) {
1041						/* succeeded for this page */
1042						sub_success = 1;
1043						break;
1044					} else {
1045						btrfs_dev_replace_stats_inc(
1046							&sctx->dev_root->
1047							fs_info->dev_replace.
1048							num_write_errors);
1049					}
1050				}
1051			}
1052
1053			if (!sub_success) {
1054				/*
1055				 * did not find a mirror to fetch the page
1056				 * from. scrub_write_page_to_dev_replace()
1057				 * handles this case (page->io_error), by
1058				 * filling the block with zeros before
1059				 * submitting the write request
1060				 */
1061				success = 0;
1062				ret = scrub_write_page_to_dev_replace(
1063						sblock_bad, page_num);
1064				if (ret)
1065					btrfs_dev_replace_stats_inc(
1066						&sctx->dev_root->fs_info->
1067						dev_replace.num_write_errors);
1068			}
1069		}
1070
1071		goto out;
1072	}
1073
1074	/*
1075	 * for regular scrub, repair those pages that are errored.
1076	 * In case of I/O errors in the area that is supposed to be
1077	 * repaired, continue by picking good copies of those pages.
1078	 * Select the good pages from mirrors to rewrite bad pages from
1079	 * the area to fix. Afterwards verify the checksum of the block
1080	 * that is supposed to be repaired. This verification step is
1081	 * only done for the purpose of statistics counting and for the
1082	 * final scrub report on whether errors remain.
1083	 * A perfect algorithm could make use of the checksum and try
1084	 * all possible combinations of pages from the different mirrors
1085	 * until the checksum verification succeeds. For example, when
1086	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1087	 * of mirror #2 is readable but the final checksum test fails,
1088	 * then the 2nd page of mirror #3 could be tried to see whether
1089	 * the final checksum then succeeds. But this would be a rare
1090	 * exception and is therefore not implemented. At least this
1091	 * avoids overwriting the good copy.
1092	 * A more useful improvement would be to pick the sectors
1093	 * without I/O error based on sector sizes (512 bytes on legacy
1094	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1095	 * mirror could be repaired by taking 512 bytes of a different
1096	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1097	 * area are unreadable.
1098	 */
1099
1100	/* can only fix I/O errors from here on */
1101	if (sblock_bad->no_io_error_seen)
1102		goto did_not_correct_error;
1103
1104	success = 1;
1105	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1106		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1107
1108		if (!page_bad->io_error)
1109			continue;
1110
1111		for (mirror_index = 0;
1112		     mirror_index < BTRFS_MAX_MIRRORS &&
1113		     sblocks_for_recheck[mirror_index].page_count > 0;
1114		     mirror_index++) {
1115			struct scrub_block *sblock_other = sblocks_for_recheck +
1116							   mirror_index;
1117			struct scrub_page *page_other = sblock_other->pagev[
1118							page_num];
1119
1120			if (!page_other->io_error) {
1121				ret = scrub_repair_page_from_good_copy(
1122					sblock_bad, sblock_other, page_num, 0);
1123				if (0 == ret) {
1124					page_bad->io_error = 0;
1125					break; /* succeeded for this page */
1126				}
1127			}
1128		}
1129
1130		if (page_bad->io_error) {
1131			/* did not find a mirror to copy the page from */
1132			success = 0;
1133		}
1134	}
1135
1136	if (success) {
1137		if (is_metadata || have_csum) {
1138			/*
1139			 * need to verify the checksum now that all
1140			 * sectors on disk are repaired (the write
1141			 * request for data to be repaired is on its way).
1142			 * Just be lazy and use scrub_recheck_block()
1143			 * which re-reads the data before the checksum
1144			 * is verified, but most likely the data comes out
1145			 * of the page cache.
1146			 */
1147			scrub_recheck_block(fs_info, sblock_bad,
1148					    is_metadata, have_csum, csum,
1149					    generation, sctx->csum_size);
1150			if (!sblock_bad->header_error &&
1151			    !sblock_bad->checksum_error &&
1152			    sblock_bad->no_io_error_seen)
1153				goto corrected_error;
1154			else
1155				goto did_not_correct_error;
1156		} else {
1157corrected_error:
1158			spin_lock(&sctx->stat_lock);
1159			sctx->stat.corrected_errors++;
1160			spin_unlock(&sctx->stat_lock);
1161			printk_ratelimited_in_rcu(KERN_ERR
1162				"btrfs: fixed up error at logical %llu on dev %s\n",
1163				logical, rcu_str_deref(dev->name));
1164		}
1165	} else {
1166did_not_correct_error:
1167		spin_lock(&sctx->stat_lock);
1168		sctx->stat.uncorrectable_errors++;
1169		spin_unlock(&sctx->stat_lock);
1170		printk_ratelimited_in_rcu(KERN_ERR
1171			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
1172			logical, rcu_str_deref(dev->name));
1173	}
1174
1175out:
1176	if (sblocks_for_recheck) {
1177		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1178		     mirror_index++) {
1179			struct scrub_block *sblock = sblocks_for_recheck +
1180						     mirror_index;
1181			int page_index;
1182
1183			for (page_index = 0; page_index < sblock->page_count;
1184			     page_index++) {
1185				sblock->pagev[page_index]->sblock = NULL;
1186				scrub_page_put(sblock->pagev[page_index]);
1187			}
1188		}
1189		kfree(sblocks_for_recheck);
1190	}
1191
1192	return 0;
1193}
1194
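/*
 * map the logical range of the errored block to all of its mirrors and
 * allocate pages for them, so that each mirror can be re-read and checked
 * page by page
 */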
1195static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1196				     struct btrfs_fs_info *fs_info,
1197				     struct scrub_block *original_sblock,
1198				     u64 length, u64 logical,
1199				     struct scrub_block *sblocks_for_recheck)
1200{
1201	int page_index;
1202	int mirror_index;
1203	int ret;
1204
1205	/*
1206	 * note: the two members ref_count and outstanding_pages
1207	 * are not used (and not set) in the blocks that are used for
1208	 * the recheck procedure
1209	 */
1210
1211	page_index = 0;
1212	while (length > 0) {
1213		u64 sublen = min_t(u64, length, PAGE_SIZE);
1214		u64 mapped_length = sublen;
1215		struct btrfs_bio *bbio = NULL;
1216
1217		/*
1218		 * with a length of PAGE_SIZE, each returned stripe
1219		 * represents one mirror
1220		 */
1221		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1222				      &mapped_length, &bbio, 0);
1223		if (ret || !bbio || mapped_length < sublen) {
1224			kfree(bbio);
1225			return -EIO;
1226		}
1227
1228		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1229		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1230		     mirror_index++) {
1231			struct scrub_block *sblock;
1232			struct scrub_page *page;
1233
1234			if (mirror_index >= BTRFS_MAX_MIRRORS)
1235				continue;
1236
1237			sblock = sblocks_for_recheck + mirror_index;
1238			sblock->sctx = sctx;
1239			page = kzalloc(sizeof(*page), GFP_NOFS);
1240			if (!page) {
1241leave_nomem:
1242				spin_lock(&sctx->stat_lock);
1243				sctx->stat.malloc_errors++;
1244				spin_unlock(&sctx->stat_lock);
1245				kfree(bbio);
1246				return -ENOMEM;
1247			}
1248			scrub_page_get(page);
1249			sblock->pagev[page_index] = page;
1250			page->logical = logical;
1251			page->physical = bbio->stripes[mirror_index].physical;
1252			BUG_ON(page_index >= original_sblock->page_count);
1253			page->physical_for_dev_replace =
1254				original_sblock->pagev[page_index]->
1255				physical_for_dev_replace;
1256			/* for missing devices, dev->bdev is NULL */
1257			page->dev = bbio->stripes[mirror_index].dev;
1258			page->mirror_num = mirror_index + 1;
1259			sblock->page_count++;
1260			page->page = alloc_page(GFP_NOFS);
1261			if (!page->page)
1262				goto leave_nomem;
1263		}
1264		kfree(bbio);
1265		length -= sublen;
1266		logical += sublen;
1267		page_index++;
1268	}
1269
1270	return 0;
1271}
1272
1273/*
1274 * this function will check the on disk data for checksum errors, header
1275 * errors and read I/O errors. If any I/O errors happen, the exact pages
1276 * which are errored are marked as being bad. The goal is to enable scrub
1277 * to take those pages that are not errored from all the mirrors so that
1278 * the pages that are errored in the just handled mirror can be repaired.
1279 */
1280static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1281				struct scrub_block *sblock, int is_metadata,
1282				int have_csum, u8 *csum, u64 generation,
1283				u16 csum_size)
1284{
1285	int page_num;
1286
1287	sblock->no_io_error_seen = 1;
1288	sblock->header_error = 0;
1289	sblock->checksum_error = 0;
1290
1291	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1292		struct bio *bio;
1293		struct scrub_page *page = sblock->pagev[page_num];
1294
1295		if (page->dev->bdev == NULL) {
1296			page->io_error = 1;
1297			sblock->no_io_error_seen = 0;
1298			continue;
1299		}
1300
1301		WARN_ON(!page->page);
1302		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1303		if (!bio) {
1304			page->io_error = 1;
1305			sblock->no_io_error_seen = 0;
1306			continue;
1307		}
1308		bio->bi_bdev = page->dev->bdev;
1309		bio->bi_sector = page->physical >> 9;
1310
1311		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1312		if (btrfsic_submit_bio_wait(READ, bio))
1313			sblock->no_io_error_seen = 0;
1314
1315		bio_put(bio);
1316	}
1317
1318	if (sblock->no_io_error_seen)
1319		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1320					     have_csum, csum, generation,
1321					     csum_size);
1322
1323	return;
1324}
1325
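/*
 * recompute the checksum of a re-read block; for metadata, additionally
 * verify bytenr, fsid, chunk tree uuid and generation from the header
 */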
1326static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1327					 struct scrub_block *sblock,
1328					 int is_metadata, int have_csum,
1329					 const u8 *csum, u64 generation,
1330					 u16 csum_size)
1331{
1332	int page_num;
1333	u8 calculated_csum[BTRFS_CSUM_SIZE];
1334	u32 crc = ~(u32)0;
1335	void *mapped_buffer;
1336
1337	WARN_ON(!sblock->pagev[0]->page);
1338	if (is_metadata) {
1339		struct btrfs_header *h;
1340
1341		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1342		h = (struct btrfs_header *)mapped_buffer;
1343
1344		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1345		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1346		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1347			   BTRFS_UUID_SIZE)) {
1348			sblock->header_error = 1;
1349		} else if (generation != btrfs_stack_header_generation(h)) {
1350			sblock->header_error = 1;
1351			sblock->generation_error = 1;
1352		}
1353		csum = h->csum;
1354	} else {
1355		if (!have_csum)
1356			return;
1357
1358		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1359	}
1360
1361	for (page_num = 0;;) {
1362		if (page_num == 0 && is_metadata)
1363			crc = btrfs_csum_data(
1364				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1365				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1366		else
1367			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1368
1369		kunmap_atomic(mapped_buffer);
1370		page_num++;
1371		if (page_num >= sblock->page_count)
1372			break;
1373		WARN_ON(!sblock->pagev[page_num]->page);
1374
1375		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1376	}
1377
1378	btrfs_csum_final(crc, calculated_csum);
1379	if (memcmp(calculated_csum, csum, csum_size))
1380		sblock->checksum_error = 1;
1381}
1382
1383static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1384					     struct scrub_block *sblock_good,
1385					     int force_write)
1386{
1387	int page_num;
1388	int ret = 0;
1389
1390	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1391		int ret_sub;
1392
1393		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1394							   sblock_good,
1395							   page_num,
1396							   force_write);
1397		if (ret_sub)
1398			ret = ret_sub;
1399	}
1400
1401	return ret;
1402}
1403
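/*
 * rewrite one page of the bad block with the data from the good mirror, but
 * only if forced or if the bad block/page actually shows an error
 */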
1404static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1405					    struct scrub_block *sblock_good,
1406					    int page_num, int force_write)
1407{
1408	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1409	struct scrub_page *page_good = sblock_good->pagev[page_num];
1410
1411	BUG_ON(page_bad->page == NULL);
1412	BUG_ON(page_good->page == NULL);
1413	if (force_write || sblock_bad->header_error ||
1414	    sblock_bad->checksum_error || page_bad->io_error) {
1415		struct bio *bio;
1416		int ret;
1417
1418		if (!page_bad->dev->bdev) {
1419			printk_ratelimited(KERN_WARNING
1420				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1421			return -EIO;
1422		}
1423
1424		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1425		if (!bio)
1426			return -EIO;
1427		bio->bi_bdev = page_bad->dev->bdev;
1428		bio->bi_sector = page_bad->physical >> 9;
1429
1430		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1431		if (PAGE_SIZE != ret) {
1432			bio_put(bio);
1433			return -EIO;
1434		}
1435
1436		if (btrfsic_submit_bio_wait(WRITE, bio)) {
1437			btrfs_dev_stat_inc_and_print(page_bad->dev,
1438				BTRFS_DEV_STAT_WRITE_ERRS);
1439			btrfs_dev_replace_stats_inc(
1440				&sblock_bad->sctx->dev_root->fs_info->
1441				dev_replace.num_write_errors);
1442			bio_put(bio);
1443			return -EIO;
1444		}
1445		bio_put(bio);
1446	}
1447
1448	return 0;
1449}
1450
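/* write all pages of a block to the dev-replace target device */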
1451static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1452{
1453	int page_num;
1454
1455	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1456		int ret;
1457
1458		ret = scrub_write_page_to_dev_replace(sblock, page_num);
1459		if (ret)
1460			btrfs_dev_replace_stats_inc(
1461				&sblock->sctx->dev_root->fs_info->dev_replace.
1462				num_write_errors);
1463	}
1464}
1465
1466static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1467					   int page_num)
1468{
1469	struct scrub_page *spage = sblock->pagev[page_num];
1470
1471	BUG_ON(spage->page == NULL);
1472	if (spage->io_error) {
1473		void *mapped_buffer = kmap_atomic(spage->page);
1474
1475		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1476		flush_dcache_page(spage->page);
1477		kunmap_atomic(mapped_buffer);
1478	}
1479	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1480}
1481
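/*
 * queue a page into the current write bio for the dev-replace target; the
 * bio is submitted when it is full or when the next page is not physically
 * or logically contiguous
 */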
1482static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1483				    struct scrub_page *spage)
1484{
1485	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1486	struct scrub_bio *sbio;
1487	int ret;
1488
1489	mutex_lock(&wr_ctx->wr_lock);
1490again:
1491	if (!wr_ctx->wr_curr_bio) {
1492		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1493					      GFP_NOFS);
1494		if (!wr_ctx->wr_curr_bio) {
1495			mutex_unlock(&wr_ctx->wr_lock);
1496			return -ENOMEM;
1497		}
1498		wr_ctx->wr_curr_bio->sctx = sctx;
1499		wr_ctx->wr_curr_bio->page_count = 0;
1500	}
1501	sbio = wr_ctx->wr_curr_bio;
1502	if (sbio->page_count == 0) {
1503		struct bio *bio;
1504
1505		sbio->physical = spage->physical_for_dev_replace;
1506		sbio->logical = spage->logical;
1507		sbio->dev = wr_ctx->tgtdev;
1508		bio = sbio->bio;
1509		if (!bio) {
1510			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1511			if (!bio) {
1512				mutex_unlock(&wr_ctx->wr_lock);
1513				return -ENOMEM;
1514			}
1515			sbio->bio = bio;
1516		}
1517
1518		bio->bi_private = sbio;
1519		bio->bi_end_io = scrub_wr_bio_end_io;
1520		bio->bi_bdev = sbio->dev->bdev;
1521		bio->bi_sector = sbio->physical >> 9;
1522		sbio->err = 0;
1523	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1524		   spage->physical_for_dev_replace ||
1525		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1526		   spage->logical) {
1527		scrub_wr_submit(sctx);
1528		goto again;
1529	}
1530
1531	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1532	if (ret != PAGE_SIZE) {
1533		if (sbio->page_count < 1) {
1534			bio_put(sbio->bio);
1535			sbio->bio = NULL;
1536			mutex_unlock(&wr_ctx->wr_lock);
1537			return -EIO;
1538		}
1539		scrub_wr_submit(sctx);
1540		goto again;
1541	}
1542
1543	sbio->pagev[sbio->page_count] = spage;
1544	scrub_page_get(spage);
1545	sbio->page_count++;
1546	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1547		scrub_wr_submit(sctx);
1548	mutex_unlock(&wr_ctx->wr_lock);
1549
1550	return 0;
1551}
1552
1553static void scrub_wr_submit(struct scrub_ctx *sctx)
1554{
1555	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1556	struct scrub_bio *sbio;
1557
1558	if (!wr_ctx->wr_curr_bio)
1559		return;
1560
1561	sbio = wr_ctx->wr_curr_bio;
1562	wr_ctx->wr_curr_bio = NULL;
1563	WARN_ON(!sbio->bio->bi_bdev);
1564	scrub_pending_bio_inc(sctx);
1565	/* process all writes in a single worker thread. Then the block layer
1566	 * orders the requests before sending them to the driver which
1567	 * doubled the write performance on spinning disks when measured
1568	 * with Linux 3.5 */
1569	btrfsic_submit_bio(WRITE, sbio->bio);
1570}
1571
1572static void scrub_wr_bio_end_io(struct bio *bio, int err)
1573{
1574	struct scrub_bio *sbio = bio->bi_private;
1575	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1576
1577	sbio->err = err;
1578	sbio->bio = bio;
1579
1580	sbio->work.func = scrub_wr_bio_end_io_worker;
1581	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1582}
1583
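/*
 * completion worker for write bios: on error, mark all pages as errored and
 * account them in the dev-replace statistics, then drop the page references
 */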
1584static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1585{
1586	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1587	struct scrub_ctx *sctx = sbio->sctx;
1588	int i;
1589
1590	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1591	if (sbio->err) {
1592		struct btrfs_dev_replace *dev_replace =
1593			&sbio->sctx->dev_root->fs_info->dev_replace;
1594
1595		for (i = 0; i < sbio->page_count; i++) {
1596			struct scrub_page *spage = sbio->pagev[i];
1597
1598			spage->io_error = 1;
1599			btrfs_dev_replace_stats_inc(&dev_replace->
1600						    num_write_errors);
1601		}
1602	}
1603
1604	for (i = 0; i < sbio->page_count; i++)
1605		scrub_page_put(sbio->pagev[i]);
1606
1607	bio_put(sbio->bio);
1608	kfree(sbio);
1609	scrub_pending_bio_dec(sctx);
1610}
1611
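/*
 * dispatch checksum verification based on the extent flags (data, tree block
 * or super block) and hand a failed data or tree block to the error handling
 */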
1612static int scrub_checksum(struct scrub_block *sblock)
1613{
1614	u64 flags;
1615	int ret;
1616
1617	WARN_ON(sblock->page_count < 1);
1618	flags = sblock->pagev[0]->flags;
1619	ret = 0;
1620	if (flags & BTRFS_EXTENT_FLAG_DATA)
1621		ret = scrub_checksum_data(sblock);
1622	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1623		ret = scrub_checksum_tree_block(sblock);
1624	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1625		(void)scrub_checksum_super(sblock);
1626	else
1627		WARN_ON(1);
1628	if (ret)
1629		scrub_handle_errored_block(sblock);
1630
1631	return ret;
1632}
1633
1634static int scrub_checksum_data(struct scrub_block *sblock)
1635{
1636	struct scrub_ctx *sctx = sblock->sctx;
1637	u8 csum[BTRFS_CSUM_SIZE];
1638	u8 *on_disk_csum;
1639	struct page *page;
1640	void *buffer;
1641	u32 crc = ~(u32)0;
1642	int fail = 0;
1643	u64 len;
1644	int index;
1645
1646	BUG_ON(sblock->page_count < 1);
1647	if (!sblock->pagev[0]->have_csum)
1648		return 0;
1649
1650	on_disk_csum = sblock->pagev[0]->csum;
1651	page = sblock->pagev[0]->page;
1652	buffer = kmap_atomic(page);
1653
1654	len = sctx->sectorsize;
1655	index = 0;
1656	for (;;) {
1657		u64 l = min_t(u64, len, PAGE_SIZE);
1658
1659		crc = btrfs_csum_data(buffer, crc, l);
1660		kunmap_atomic(buffer);
1661		len -= l;
1662		if (len == 0)
1663			break;
1664		index++;
1665		BUG_ON(index >= sblock->page_count);
1666		BUG_ON(!sblock->pagev[index]->page);
1667		page = sblock->pagev[index]->page;
1668		buffer = kmap_atomic(page);
1669	}
1670
1671	btrfs_csum_final(crc, csum);
1672	if (memcmp(csum, on_disk_csum, sctx->csum_size))
1673		fail = 1;
1674
1675	return fail;
1676}
1677
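/*
 * verify a tree block: check bytenr, generation, fsid and chunk tree uuid in
 * the header and recompute the checksum over the whole node
 */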
1678static int scrub_checksum_tree_block(struct scrub_block *sblock)
1679{
1680	struct scrub_ctx *sctx = sblock->sctx;
1681	struct btrfs_header *h;
1682	struct btrfs_root *root = sctx->dev_root;
1683	struct btrfs_fs_info *fs_info = root->fs_info;
1684	u8 calculated_csum[BTRFS_CSUM_SIZE];
1685	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1686	struct page *page;
1687	void *mapped_buffer;
1688	u64 mapped_size;
1689	void *p;
1690	u32 crc = ~(u32)0;
1691	int fail = 0;
1692	int crc_fail = 0;
1693	u64 len;
1694	int index;
1695
1696	BUG_ON(sblock->page_count < 1);
1697	page = sblock->pagev[0]->page;
1698	mapped_buffer = kmap_atomic(page);
1699	h = (struct btrfs_header *)mapped_buffer;
1700	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1701
1702	/*
1703	 * we don't use the getter functions here, as we
1704	 * a) don't have an extent buffer and
1705	 * b) the page is already kmapped
1706	 */
1707
1708	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1709		++fail;
1710
1711	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1712		++fail;
1713
1714	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1715		++fail;
1716
1717	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1718		   BTRFS_UUID_SIZE))
1719		++fail;
1720
1721	WARN_ON(sctx->nodesize != sctx->leafsize);
1722	len = sctx->nodesize - BTRFS_CSUM_SIZE;
1723	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1724	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1725	index = 0;
1726	for (;;) {
1727		u64 l = min_t(u64, len, mapped_size);
1728
1729		crc = btrfs_csum_data(p, crc, l);
1730		kunmap_atomic(mapped_buffer);
1731		len -= l;
1732		if (len == 0)
1733			break;
1734		index++;
1735		BUG_ON(index >= sblock->page_count);
1736		BUG_ON(!sblock->pagev[index]->page);
1737		page = sblock->pagev[index]->page;
1738		mapped_buffer = kmap_atomic(page);
1739		mapped_size = PAGE_SIZE;
1740		p = mapped_buffer;
1741	}
1742
1743	btrfs_csum_final(crc, calculated_csum);
1744	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1745		++crc_fail;
1746
1747	return fail || crc_fail;
1748}
1749
1750static int scrub_checksum_super(struct scrub_block *sblock)
1751{
1752	struct btrfs_super_block *s;
1753	struct scrub_ctx *sctx = sblock->sctx;
1754	struct btrfs_root *root = sctx->dev_root;
1755	struct btrfs_fs_info *fs_info = root->fs_info;
1756	u8 calculated_csum[BTRFS_CSUM_SIZE];
1757	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1758	struct page *page;
1759	void *mapped_buffer;
1760	u64 mapped_size;
1761	void *p;
1762	u32 crc = ~(u32)0;
1763	int fail_gen = 0;
1764	int fail_cor = 0;
1765	u64 len;
1766	int index;
1767
1768	BUG_ON(sblock->page_count < 1);
1769	page = sblock->pagev[0]->page;
1770	mapped_buffer = kmap_atomic(page);
1771	s = (struct btrfs_super_block *)mapped_buffer;
1772	memcpy(on_disk_csum, s->csum, sctx->csum_size);
1773
1774	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1775		++fail_cor;
1776
1777	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1778		++fail_gen;
1779
1780	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1781		++fail_cor;
1782
1783	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1784	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1785	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1786	index = 0;
1787	for (;;) {
1788		u64 l = min_t(u64, len, mapped_size);
1789
1790		crc = btrfs_csum_data(p, crc, l);
1791		kunmap_atomic(mapped_buffer);
1792		len -= l;
1793		if (len == 0)
1794			break;
1795		index++;
1796		BUG_ON(index >= sblock->page_count);
1797		BUG_ON(!sblock->pagev[index]->page);
1798		page = sblock->pagev[index]->page;
1799		mapped_buffer = kmap_atomic(page);
1800		mapped_size = PAGE_SIZE;
1801		p = mapped_buffer;
1802	}
1803
1804	btrfs_csum_final(crc, calculated_csum);
1805	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1806		++fail_cor;
1807
1808	if (fail_cor + fail_gen) {
1809		/*
1810		 * if we find an error in a super block, we just report it.
1811		 * Super blocks get rewritten with the next transaction
1812		 * commit anyway.
		 * Super blocks get rewritten with the next transaction
		 * commit anyway.
1813		 */
1814		spin_lock(&sctx->stat_lock);
1815		++sctx->stat.super_errors;
1816		spin_unlock(&sctx->stat_lock);
1817		if (fail_cor)
1818			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1819				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1820		else
1821			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1822				BTRFS_DEV_STAT_GENERATION_ERRS);
1823	}
1824
1825	return fail_cor + fail_gen;
1826}
1827
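/*
 * Reference counting scheme: a scrub_block starts with one reference
 * held by scrub_pages() and gains one more for every page that is added
 * to a read bio. Each scrub_block in turn holds a reference on all of
 * its scrub_pages; the final scrub_block_put() drops the page references
 * and frees the block.
 */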
1828static void scrub_block_get(struct scrub_block *sblock)
1829{
1830	atomic_inc(&sblock->ref_count);
1831}
1832
1833static void scrub_block_put(struct scrub_block *sblock)
1834{
1835	if (atomic_dec_and_test(&sblock->ref_count)) {
1836		int i;
1837
1838		for (i = 0; i < sblock->page_count; i++)
1839			scrub_page_put(sblock->pagev[i]);
1840		kfree(sblock);
1841	}
1842}
1843
1844static void scrub_page_get(struct scrub_page *spage)
1845{
1846	atomic_inc(&spage->ref_count);
1847}
1848
1849static void scrub_page_put(struct scrub_page *spage)
1850{
1851	if (atomic_dec_and_test(&spage->ref_count)) {
1852		if (spage->page)
1853			__free_page(spage->page);
1854		kfree(spage);
1855	}
1856}
1857
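/*
 * scrub_submit() - submit the currently filled read bio, if any.
 *
 * Detaches the bio slot pointed to by sctx->curr, marks it in flight via
 * scrub_pending_bio_inc() and hands the bio to the block layer. A bio
 * without a bdev (missing device) is completed immediately with -EIO so
 * that its pages take the error handling path instead.
 */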
1858static void scrub_submit(struct scrub_ctx *sctx)
1859{
1860	struct scrub_bio *sbio;
1861
1862	if (sctx->curr == -1)
1863		return;
1864
1865	sbio = sctx->bios[sctx->curr];
1866	sctx->curr = -1;
1867	scrub_pending_bio_inc(sctx);
1868
1869	if (!sbio->bio->bi_bdev) {
1870		/*
1871		 * this case should not happen. If btrfs_map_block() were
1872		 * wrong, it could happen for dev-replace operations on
1873		 * missing devices when no mirrors are available, but in
1874		 * that case the mount should already have failed. The
1875		 * condition is still handled correctly here, just _very_ slowly.
1876		 */
1877		printk_ratelimited(KERN_WARNING
1878			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1879		bio_endio(sbio->bio, -EIO);
1880	} else {
1881		btrfsic_submit_bio(READ, sbio->bio);
1882	}
1883}
1884
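/*
 * Add one scrub_page to the read bio that is currently being filled.
 * A free bio slot is taken first (waiting if all slots are in flight).
 * The page is only appended if it is physically and logically contiguous
 * with the pages already in the bio and belongs to the same device;
 * otherwise the current bio is submitted and the page starts a new one.
 */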
1885static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1886				    struct scrub_page *spage)
1887{
1888	struct scrub_block *sblock = spage->sblock;
1889	struct scrub_bio *sbio;
1890	int ret;
1891
1892again:
1893	/*
1894	 * grab a fresh bio or wait for one to become available
1895	 */
1896	while (sctx->curr == -1) {
1897		spin_lock(&sctx->list_lock);
1898		sctx->curr = sctx->first_free;
1899		if (sctx->curr != -1) {
1900			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1901			sctx->bios[sctx->curr]->next_free = -1;
1902			sctx->bios[sctx->curr]->page_count = 0;
1903			spin_unlock(&sctx->list_lock);
1904		} else {
1905			spin_unlock(&sctx->list_lock);
1906			wait_event(sctx->list_wait, sctx->first_free != -1);
1907		}
1908	}
1909	sbio = sctx->bios[sctx->curr];
1910	if (sbio->page_count == 0) {
1911		struct bio *bio;
1912
1913		sbio->physical = spage->physical;
1914		sbio->logical = spage->logical;
1915		sbio->dev = spage->dev;
1916		bio = sbio->bio;
1917		if (!bio) {
1918			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1919			if (!bio)
1920				return -ENOMEM;
1921			sbio->bio = bio;
1922		}
1923
1924		bio->bi_private = sbio;
1925		bio->bi_end_io = scrub_bio_end_io;
1926		bio->bi_bdev = sbio->dev->bdev;
1927		bio->bi_sector = sbio->physical >> 9;
1928		sbio->err = 0;
1929	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1930		   spage->physical ||
1931		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1932		   spage->logical ||
1933		   sbio->dev != spage->dev) {
1934		scrub_submit(sctx);
1935		goto again;
1936	}
1937
1938	sbio->pagev[sbio->page_count] = spage;
1939	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1940	if (ret != PAGE_SIZE) {
1941		if (sbio->page_count < 1) {
1942			bio_put(sbio->bio);
1943			sbio->bio = NULL;
1944			return -EIO;
1945		}
1946		scrub_submit(sctx);
1947		goto again;
1948	}
1949
1950	scrub_block_get(sblock); /* one for the page added to the bio */
1951	atomic_inc(&sblock->outstanding_pages);
1952	sbio->page_count++;
1953	if (sbio->page_count == sctx->pages_per_rd_bio)
1954		scrub_submit(sctx);
1955
1956	return 0;
1957}
1958
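/*
 * scrub_pages() - create a scrub_block for [logical, logical + len) and
 * queue it for reading.
 *
 * The range is split into PAGE_SIZE sized scrub_pages that share one
 * scrub_block; each page records the csum (if one was found), the mirror
 * number and the physical locations on the source and, for dev-replace,
 * on the target device. All pages are then added to read bios; with
 * force set, the bio is submitted immediately (used for the super
 * blocks).
 */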
1959static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1960		       u64 physical, struct btrfs_device *dev, u64 flags,
1961		       u64 gen, int mirror_num, u8 *csum, int force,
1962		       u64 physical_for_dev_replace)
1963{
1964	struct scrub_block *sblock;
1965	int index;
1966
1967	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1968	if (!sblock) {
1969		spin_lock(&sctx->stat_lock);
1970		sctx->stat.malloc_errors++;
1971		spin_unlock(&sctx->stat_lock);
1972		return -ENOMEM;
1973	}
1974
1975	/* one ref inside this function, plus one for each page added to
1976	 * a bio later on */
1977	atomic_set(&sblock->ref_count, 1);
1978	sblock->sctx = sctx;
1979	sblock->no_io_error_seen = 1;
1980
1981	for (index = 0; len > 0; index++) {
1982		struct scrub_page *spage;
1983		u64 l = min_t(u64, len, PAGE_SIZE);
1984
1985		spage = kzalloc(sizeof(*spage), GFP_NOFS);
1986		if (!spage) {
1987leave_nomem:
1988			spin_lock(&sctx->stat_lock);
1989			sctx->stat.malloc_errors++;
1990			spin_unlock(&sctx->stat_lock);
1991			scrub_block_put(sblock);
1992			return -ENOMEM;
1993		}
1994		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1995		scrub_page_get(spage);
1996		sblock->pagev[index] = spage;
1997		spage->sblock = sblock;
1998		spage->dev = dev;
1999		spage->flags = flags;
2000		spage->generation = gen;
2001		spage->logical = logical;
2002		spage->physical = physical;
2003		spage->physical_for_dev_replace = physical_for_dev_replace;
2004		spage->mirror_num = mirror_num;
2005		if (csum) {
2006			spage->have_csum = 1;
2007			memcpy(spage->csum, csum, sctx->csum_size);
2008		} else {
2009			spage->have_csum = 0;
2010		}
2011		sblock->page_count++;
2012		spage->page = alloc_page(GFP_NOFS);
2013		if (!spage->page)
2014			goto leave_nomem;
2015		len -= l;
2016		logical += l;
2017		physical += l;
2018		physical_for_dev_replace += l;
2019	}
2020
2021	WARN_ON(sblock->page_count == 0);
2022	for (index = 0; index < sblock->page_count; index++) {
2023		struct scrub_page *spage = sblock->pagev[index];
2024		int ret;
2025
2026		ret = scrub_add_page_to_rd_bio(sctx, spage);
2027		if (ret) {
2028			scrub_block_put(sblock);
2029			return ret;
2030		}
2031	}
2032
2033	if (force)
2034		scrub_submit(sctx);
2035
2036	/* last one frees, either here or in bio completion for last page */
2037	scrub_block_put(sblock);
2038	return 0;
2039}
2040
2041static void scrub_bio_end_io(struct bio *bio, int err)
2042{
2043	struct scrub_bio *sbio = bio->bi_private;
2044	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2045
2046	sbio->err = err;
2047	sbio->bio = bio;
2048
2049	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2050}
2051
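/*
 * Read bio completion, deferred to the scrub workers: on error, every
 * page of the bio is flagged and its block is marked as having seen an
 * I/O error. Blocks whose last outstanding page completed here are
 * finished via scrub_block_complete(), the bio slot is returned to the
 * free list, and in dev-replace mode pending target writes are flushed
 * if requested.
 */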
2052static void scrub_bio_end_io_worker(struct btrfs_work *work)
2053{
2054	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2055	struct scrub_ctx *sctx = sbio->sctx;
2056	int i;
2057
2058	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2059	if (sbio->err) {
2060		for (i = 0; i < sbio->page_count; i++) {
2061			struct scrub_page *spage = sbio->pagev[i];
2062
2063			spage->io_error = 1;
2064			spage->sblock->no_io_error_seen = 0;
2065		}
2066	}
2067
2068	/* now complete the scrub_block items that have all pages completed */
2069	for (i = 0; i < sbio->page_count; i++) {
2070		struct scrub_page *spage = sbio->pagev[i];
2071		struct scrub_block *sblock = spage->sblock;
2072
2073		if (atomic_dec_and_test(&sblock->outstanding_pages))
2074			scrub_block_complete(sblock);
2075		scrub_block_put(sblock);
2076	}
2077
2078	bio_put(sbio->bio);
2079	sbio->bio = NULL;
2080	spin_lock(&sctx->list_lock);
2081	sbio->next_free = sctx->first_free;
2082	sctx->first_free = sbio->index;
2083	spin_unlock(&sctx->list_lock);
2084
2085	if (sctx->is_dev_replace &&
2086	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2087		mutex_lock(&sctx->wr_ctx.wr_lock);
2088		scrub_wr_submit(sctx);
2089		mutex_unlock(&sctx->wr_ctx.wr_lock);
2090	}
2091
2092	scrub_pending_bio_dec(sctx);
2093}
2094
2095static void scrub_block_complete(struct scrub_block *sblock)
2096{
2097	if (!sblock->no_io_error_seen) {
2098		scrub_handle_errored_block(sblock);
2099	} else {
2100		/*
2101		 * In the dev-replace case: if the block has a checksum
2102		 * error, it is rewritten via the repair mechanism;
2103		 * otherwise it is copied to the target device here.
2104		 */
2105		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2106			scrub_write_block_to_dev_replace(sblock);
2107	}
2108}
2109
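/*
 * scrub_find_csum() - look up the data checksum for @logical in
 * sctx->csum_list, which was filled by btrfs_lookup_csums_range() for
 * the current stripe.
 *
 * Sums that end at or before @logical are discarded (csum_discards).
 * If a matching btrfs_ordered_sum is found, the sector's checksum is
 * copied out using
 *	index = (logical - sum->bytenr) / sectorsize
 * as the offset into sum->sums. Returns 1 if a csum was found, 0 if not.
 */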
2110static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2111			   u8 *csum)
2112{
2113	struct btrfs_ordered_sum *sum = NULL;
2114	unsigned long index;
2115	unsigned long num_sectors;
2116
2117	while (!list_empty(&sctx->csum_list)) {
2118		sum = list_first_entry(&sctx->csum_list,
2119				       struct btrfs_ordered_sum, list);
2120		if (sum->bytenr > logical)
2121			return 0;
2122		if (sum->bytenr + sum->len > logical)
2123			break;
2124
2125		++sctx->stat.csum_discards;
2126		list_del(&sum->list);
2127		kfree(sum);
2128		sum = NULL;
2129	}
2130	if (!sum)
2131		return 0;
2132
2133	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2134	num_sectors = sum->len / sctx->sectorsize;
2135	memcpy(csum, sum->sums + index, sctx->csum_size);
2136	if (index == num_sectors - 1) {
2137		list_del(&sum->list);
2138		kfree(sum);
2139	}
2140	return 1;
2141}
2142
2143/* scrub extent tries to collect up to 64 kB for each bio */
2144static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2145			u64 physical, struct btrfs_device *dev, u64 flags,
2146			u64 gen, int mirror_num, u64 physical_for_dev_replace)
2147{
2148	int ret;
2149	u8 csum[BTRFS_CSUM_SIZE];
2150	u32 blocksize;
2151
2152	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2153		blocksize = sctx->sectorsize;
2154		spin_lock(&sctx->stat_lock);
2155		sctx->stat.data_extents_scrubbed++;
2156		sctx->stat.data_bytes_scrubbed += len;
2157		spin_unlock(&sctx->stat_lock);
2158	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2159		WARN_ON(sctx->nodesize != sctx->leafsize);
2160		blocksize = sctx->nodesize;
2161		spin_lock(&sctx->stat_lock);
2162		sctx->stat.tree_extents_scrubbed++;
2163		sctx->stat.tree_bytes_scrubbed += len;
2164		spin_unlock(&sctx->stat_lock);
2165	} else {
2166		blocksize = sctx->sectorsize;
2167		WARN_ON(1);
2168	}
2169
2170	while (len) {
2171		u64 l = min_t(u64, len, blocksize);
2172		int have_csum = 0;
2173
2174		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2175			/* push csums to sbio */
2176			have_csum = scrub_find_csum(sctx, logical, l, csum);
2177			if (have_csum == 0)
2178				++sctx->stat.no_csum;
2179			if (sctx->is_dev_replace && !have_csum) {
2180				ret = copy_nocow_pages(sctx, logical, l,
2181						       mirror_num,
2182						      physical_for_dev_replace);
2183				goto behind_scrub_pages;
2184			}
2185		}
2186		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2187				  mirror_num, have_csum ? csum : NULL, 0,
2188				  physical_for_dev_replace);
2189behind_scrub_pages:
2190		if (ret)
2191			return ret;
2192		len -= l;
2193		logical += l;
2194		physical += l;
2195		physical_for_dev_replace += l;
2196	}
2197	return 0;
2198}
2199
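/*
 * scrub_stripe() - scrub the part of a chunk that lives on one device
 * stripe.
 *
 * The per-RAID-profile stripe geometry determines where this device's
 * data starts inside the chunk (offset), how far to advance between
 * consecutive stripes of this device (increment) and which mirror is
 * read (mirror_num). Illustrative example for RAID10 with
 * num_stripes = 4, sub_stripes = 2, stripe_len = 64K and num = 3:
 *
 *	factor     = 4 / 2         = 2
 *	offset     = 64K * (3 / 2) = 64K
 *	increment  = 64K * 2       = 128K
 *	mirror_num = 3 % 2 + 1     = 2
 *
 * After triggering readahead on the extent and csum trees, the extent
 * items covering each stripe are looked up in the commit root, trimmed
 * to the stripe boundaries and handed to scrub_extent(). Pause and
 * cancel requests are honoured between stripes.
 */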
2200static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2201					   struct map_lookup *map,
2202					   struct btrfs_device *scrub_dev,
2203					   int num, u64 base, u64 length,
2204					   int is_dev_replace)
2205{
2206	struct btrfs_path *path;
2207	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2208	struct btrfs_root *root = fs_info->extent_root;
2209	struct btrfs_root *csum_root = fs_info->csum_root;
2210	struct btrfs_extent_item *extent;
2211	struct blk_plug plug;
2212	u64 flags;
2213	int ret;
2214	int slot;
2215	u64 nstripes;
2216	struct extent_buffer *l;
2217	struct btrfs_key key;
2218	u64 physical;
2219	u64 logical;
2220	u64 logic_end;
2221	u64 generation;
2222	int mirror_num;
2223	struct reada_control *reada1;
2224	struct reada_control *reada2;
2225	struct btrfs_key key_start;
2226	struct btrfs_key key_end;
2227	u64 increment = map->stripe_len;
2228	u64 offset;
2229	u64 extent_logical;
2230	u64 extent_physical;
2231	u64 extent_len;
2232	struct btrfs_device *extent_dev;
2233	int extent_mirror_num;
2234	int stop_loop;
2235
2236	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2237			 BTRFS_BLOCK_GROUP_RAID6)) {
2238		if (num >= nr_data_stripes(map)) {
2239			return 0;
2240		}
2241	}
2242
2243	nstripes = length;
2244	offset = 0;
2245	do_div(nstripes, map->stripe_len);
2246	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2247		offset = map->stripe_len * num;
2248		increment = map->stripe_len * map->num_stripes;
2249		mirror_num = 1;
2250	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2251		int factor = map->num_stripes / map->sub_stripes;
2252		offset = map->stripe_len * (num / map->sub_stripes);
2253		increment = map->stripe_len * factor;
2254		mirror_num = num % map->sub_stripes + 1;
2255	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2256		increment = map->stripe_len;
2257		mirror_num = num % map->num_stripes + 1;
2258	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2259		increment = map->stripe_len;
2260		mirror_num = num % map->num_stripes + 1;
2261	} else {
2262		increment = map->stripe_len;
2263		mirror_num = 1;
2264	}
2265
2266	path = btrfs_alloc_path();
2267	if (!path)
2268		return -ENOMEM;
2269
2270	/*
2271	 * work on the commit root. The related disk blocks are static as
2272	 * long as COW is applied. This means it is safe to rewrite
2273	 * them to repair disk errors without any race conditions.
2274	 */
2275	path->search_commit_root = 1;
2276	path->skip_locking = 1;
2277
2278	/*
2279	 * trigger the readahead for the extent tree and csum tree and wait
2280	 * for completion. During readahead, the scrub is officially paused
2281	 * to not hold off transaction commits.
2282	 */
2283	logical = base + offset;
2284
2285	wait_event(sctx->list_wait,
2286		   atomic_read(&sctx->bios_in_flight) == 0);
2287	atomic_inc(&fs_info->scrubs_paused);
2288	wake_up(&fs_info->scrub_pause_wait);
2289
2290	/* FIXME it might be better to start readahead at commit root */
2291	key_start.objectid = logical;
2292	key_start.type = BTRFS_EXTENT_ITEM_KEY;
2293	key_start.offset = (u64)0;
2294	key_end.objectid = base + offset + nstripes * increment;
2295	key_end.type = BTRFS_METADATA_ITEM_KEY;
2296	key_end.offset = (u64)-1;
2297	reada1 = btrfs_reada_add(root, &key_start, &key_end);
2298
2299	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2300	key_start.type = BTRFS_EXTENT_CSUM_KEY;
2301	key_start.offset = logical;
2302	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2303	key_end.type = BTRFS_EXTENT_CSUM_KEY;
2304	key_end.offset = base + offset + nstripes * increment;
2305	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2306
2307	if (!IS_ERR(reada1))
2308		btrfs_reada_wait(reada1);
2309	if (!IS_ERR(reada2))
2310		btrfs_reada_wait(reada2);
2311
2312	mutex_lock(&fs_info->scrub_lock);
2313	while (atomic_read(&fs_info->scrub_pause_req)) {
2314		mutex_unlock(&fs_info->scrub_lock);
2315		wait_event(fs_info->scrub_pause_wait,
2316		   atomic_read(&fs_info->scrub_pause_req) == 0);
2317		mutex_lock(&fs_info->scrub_lock);
2318	}
2319	atomic_dec(&fs_info->scrubs_paused);
2320	mutex_unlock(&fs_info->scrub_lock);
2321	wake_up(&fs_info->scrub_pause_wait);
2322
2323	/*
2324	 * collect all data csums for the stripe to avoid seeking during
2325	 * the scrub. This might currently (with crc32) end up being about 1MB
2326	 */
2327	blk_start_plug(&plug);
2328
2329	/*
2330	 * now find all extents for each stripe and scrub them
2331	 */
2332	logical = base + offset;
2333	physical = map->stripes[num].physical;
2334	logic_end = logical + increment * nstripes;
2335	ret = 0;
2336	while (logical < logic_end) {
2337		/*
2338		 * canceled?
2339		 */
2340		if (atomic_read(&fs_info->scrub_cancel_req) ||
2341		    atomic_read(&sctx->cancel_req)) {
2342			ret = -ECANCELED;
2343			goto out;
2344		}
2345		/*
2346		 * check to see if we have to pause
2347		 */
2348		if (atomic_read(&fs_info->scrub_pause_req)) {
2349			/* push queued extents */
2350			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2351			scrub_submit(sctx);
2352			mutex_lock(&sctx->wr_ctx.wr_lock);
2353			scrub_wr_submit(sctx);
2354			mutex_unlock(&sctx->wr_ctx.wr_lock);
2355			wait_event(sctx->list_wait,
2356				   atomic_read(&sctx->bios_in_flight) == 0);
2357			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2358			atomic_inc(&fs_info->scrubs_paused);
2359			wake_up(&fs_info->scrub_pause_wait);
2360			mutex_lock(&fs_info->scrub_lock);
2361			while (atomic_read(&fs_info->scrub_pause_req)) {
2362				mutex_unlock(&fs_info->scrub_lock);
2363				wait_event(fs_info->scrub_pause_wait,
2364				   atomic_read(&fs_info->scrub_pause_req) == 0);
2365				mutex_lock(&fs_info->scrub_lock);
2366			}
2367			atomic_dec(&fs_info->scrubs_paused);
2368			mutex_unlock(&fs_info->scrub_lock);
2369			wake_up(&fs_info->scrub_pause_wait);
2370		}
2371
2372		key.objectid = logical;
2373		key.type = BTRFS_EXTENT_ITEM_KEY;
2374		key.offset = (u64)-1;
2375
2376		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2377		if (ret < 0)
2378			goto out;
2379
2380		if (ret > 0) {
2381			ret = btrfs_previous_item(root, path, 0,
2382						  BTRFS_EXTENT_ITEM_KEY);
2383			if (ret < 0)
2384				goto out;
2385			if (ret > 0) {
2386				/* there's no smaller item, so stick with the
2387				 * larger one */
2388				btrfs_release_path(path);
2389				ret = btrfs_search_slot(NULL, root, &key,
2390							path, 0, 0);
2391				if (ret < 0)
2392					goto out;
2393			}
2394		}
2395
2396		stop_loop = 0;
2397		while (1) {
2398			u64 bytes;
2399
2400			l = path->nodes[0];
2401			slot = path->slots[0];
2402			if (slot >= btrfs_header_nritems(l)) {
2403				ret = btrfs_next_leaf(root, path);
2404				if (ret == 0)
2405					continue;
2406				if (ret < 0)
2407					goto out;
2408
2409				stop_loop = 1;
2410				break;
2411			}
2412			btrfs_item_key_to_cpu(l, &key, slot);
2413
2414			if (key.type == BTRFS_METADATA_ITEM_KEY)
2415				bytes = root->leafsize;
2416			else
2417				bytes = key.offset;
2418
2419			if (key.objectid + bytes <= logical)
2420				goto next;
2421
2422			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2423			    key.type != BTRFS_METADATA_ITEM_KEY)
2424				goto next;
2425
2426			if (key.objectid >= logical + map->stripe_len) {
2427				/* out of this device extent */
2428				if (key.objectid >= logic_end)
2429					stop_loop = 1;
2430				break;
2431			}
2432
2433			extent = btrfs_item_ptr(l, slot,
2434						struct btrfs_extent_item);
2435			flags = btrfs_extent_flags(l, extent);
2436			generation = btrfs_extent_generation(l, extent);
2437
2438			if (key.objectid < logical &&
2439			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2440				printk(KERN_ERR
2441				       "btrfs scrub: tree block %llu spanning "
2442				       "stripes, ignored. logical=%llu\n",
2443				       key.objectid, logical);
2444				goto next;
2445			}
2446
2447again:
2448			extent_logical = key.objectid;
2449			extent_len = bytes;
2450
2451			/*
2452			 * trim extent to this stripe
2453			 */
2454			if (extent_logical < logical) {
2455				extent_len -= logical - extent_logical;
2456				extent_logical = logical;
2457			}
2458			if (extent_logical + extent_len >
2459			    logical + map->stripe_len) {
2460				extent_len = logical + map->stripe_len -
2461					     extent_logical;
2462			}
2463
2464			extent_physical = extent_logical - logical + physical;
2465			extent_dev = scrub_dev;
2466			extent_mirror_num = mirror_num;
2467			if (is_dev_replace)
2468				scrub_remap_extent(fs_info, extent_logical,
2469						   extent_len, &extent_physical,
2470						   &extent_dev,
2471						   &extent_mirror_num);
2472
2473			ret = btrfs_lookup_csums_range(csum_root, logical,
2474						logical + map->stripe_len - 1,
2475						&sctx->csum_list, 1);
2476			if (ret)
2477				goto out;
2478
2479			ret = scrub_extent(sctx, extent_logical, extent_len,
2480					   extent_physical, extent_dev, flags,
2481					   generation, extent_mirror_num,
2482					   extent_logical - logical + physical);
2483			if (ret)
2484				goto out;
2485
2486			scrub_free_csums(sctx);
2487			if (extent_logical + extent_len <
2488			    key.objectid + bytes) {
2489				logical += increment;
2490				physical += map->stripe_len;
2491
2492				if (logical < key.objectid + bytes) {
2493					cond_resched();
2494					goto again;
2495				}
2496
2497				if (logical >= logic_end) {
2498					stop_loop = 1;
2499					break;
2500				}
2501			}
2502next:
2503			path->slots[0]++;
2504		}
2505		btrfs_release_path(path);
2506		logical += increment;
2507		physical += map->stripe_len;
2508		spin_lock(&sctx->stat_lock);
2509		if (stop_loop)
2510			sctx->stat.last_physical = map->stripes[num].physical +
2511						   length;
2512		else
2513			sctx->stat.last_physical = physical;
2514		spin_unlock(&sctx->stat_lock);
2515		if (stop_loop)
2516			break;
2517	}
2518out:
2519	/* push queued extents */
2520	scrub_submit(sctx);
2521	mutex_lock(&sctx->wr_ctx.wr_lock);
2522	scrub_wr_submit(sctx);
2523	mutex_unlock(&sctx->wr_ctx.wr_lock);
2524
2525	blk_finish_plug(&plug);
2526	btrfs_free_path(path);
2527	return ret < 0 ? ret : 0;
2528}
2529
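/*
 * scrub_chunk() - scrub the device extent of one chunk.
 *
 * Looks up the chunk mapping for chunk_offset and calls scrub_stripe()
 * for every stripe of the chunk that is backed by scrub_dev at
 * dev_offset.
 */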
2530static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2531					  struct btrfs_device *scrub_dev,
2532					  u64 chunk_tree, u64 chunk_objectid,
2533					  u64 chunk_offset, u64 length,
2534					  u64 dev_offset, int is_dev_replace)
2535{
2536	struct btrfs_mapping_tree *map_tree =
2537		&sctx->dev_root->fs_info->mapping_tree;
2538	struct map_lookup *map;
2539	struct extent_map *em;
2540	int i;
2541	int ret = 0;
2542
2543	read_lock(&map_tree->map_tree.lock);
2544	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2545	read_unlock(&map_tree->map_tree.lock);
2546
2547	if (!em)
2548		return -EINVAL;
2549
2550	map = (struct map_lookup *)em->bdev;
2551	if (em->start != chunk_offset)
2552		goto out;
2553
2554	if (em->len < length)
2555		goto out;
2556
2557	for (i = 0; i < map->num_stripes; ++i) {
2558		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2559		    map->stripes[i].physical == dev_offset) {
2560			ret = scrub_stripe(sctx, map, scrub_dev, i,
2561					   chunk_offset, length,
2562					   is_dev_replace);
2563			if (ret)
2564				goto out;
2565		}
2566	}
2567out:
2568	free_extent_map(em);
2569
2570	return ret;
2571}
2572
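/*
 * Walk all DEV_EXTENT items of scrub_dev in the range [start, end) and
 * scrub the corresponding chunks one by one. For each chunk a reference
 * on its block group is taken so it cannot go away, the dev-replace
 * cursor is advanced, and after scrub_chunk() all pending read and write
 * bios are flushed and waited for before honouring pause requests and
 * checking for write or allocation errors.
 */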
2573static noinline_for_stack
2574int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2575			   struct btrfs_device *scrub_dev, u64 start, u64 end,
2576			   int is_dev_replace)
2577{
2578	struct btrfs_dev_extent *dev_extent = NULL;
2579	struct btrfs_path *path;
2580	struct btrfs_root *root = sctx->dev_root;
2581	struct btrfs_fs_info *fs_info = root->fs_info;
2582	u64 length;
2583	u64 chunk_tree;
2584	u64 chunk_objectid;
2585	u64 chunk_offset;
2586	int ret;
2587	int slot;
2588	struct extent_buffer *l;
2589	struct btrfs_key key;
2590	struct btrfs_key found_key;
2591	struct btrfs_block_group_cache *cache;
2592	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2593
2594	path = btrfs_alloc_path();
2595	if (!path)
2596		return -ENOMEM;
2597
2598	path->reada = 2;
2599	path->search_commit_root = 1;
2600	path->skip_locking = 1;
2601
2602	key.objectid = scrub_dev->devid;
2603	key.offset = 0ull;
2604	key.type = BTRFS_DEV_EXTENT_KEY;
2605
2606	while (1) {
2607		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2608		if (ret < 0)
2609			break;
2610		if (ret > 0) {
2611			if (path->slots[0] >=
2612			    btrfs_header_nritems(path->nodes[0])) {
2613				ret = btrfs_next_leaf(root, path);
2614				if (ret)
2615					break;
2616			}
2617		}
2618
2619		l = path->nodes[0];
2620		slot = path->slots[0];
2621
2622		btrfs_item_key_to_cpu(l, &found_key, slot);
2623
2624		if (found_key.objectid != scrub_dev->devid)
2625			break;
2626
2627		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2628			break;
2629
2630		if (found_key.offset >= end)
2631			break;
2632
2633		if (found_key.offset < key.offset)
2634			break;
2635
2636		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2637		length = btrfs_dev_extent_length(l, dev_extent);
2638
2639		if (found_key.offset + length <= start) {
2640			key.offset = found_key.offset + length;
2641			btrfs_release_path(path);
2642			continue;
2643		}
2644
2645		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2646		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2647		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2648
2649		/*
2650		 * get a reference on the corresponding block group to prevent
2651		 * the chunk from going away while we scrub it
2652		 */
2653		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2654		if (!cache) {
2655			ret = -ENOENT;
2656			break;
2657		}
2658		dev_replace->cursor_right = found_key.offset + length;
2659		dev_replace->cursor_left = found_key.offset;
2660		dev_replace->item_needs_writeback = 1;
2661		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2662				  chunk_offset, length, found_key.offset,
2663				  is_dev_replace);
2664
2665		/*
2666		 * flush, submit all pending read and write bios, afterwards
2667		 * wait for them.
2668		 * Note that in the dev replace case, a read request causes
2669		 * write requests that are submitted in the read completion
2670		 * worker. Therefore in the current situation, it is required
2671		 * that all write requests are flushed, so that all read and
2672		 * write requests are really completed when bios_in_flight
2673		 * changes to 0.
2674		 */
2675		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2676		scrub_submit(sctx);
2677		mutex_lock(&sctx->wr_ctx.wr_lock);
2678		scrub_wr_submit(sctx);
2679		mutex_unlock(&sctx->wr_ctx.wr_lock);
2680
2681		wait_event(sctx->list_wait,
2682			   atomic_read(&sctx->bios_in_flight) == 0);
2683		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2684		atomic_inc(&fs_info->scrubs_paused);
2685		wake_up(&fs_info->scrub_pause_wait);
2686		wait_event(sctx->list_wait,
2687			   atomic_read(&sctx->workers_pending) == 0);
2688
2689		mutex_lock(&fs_info->scrub_lock);
2690		while (atomic_read(&fs_info->scrub_pause_req)) {
2691			mutex_unlock(&fs_info->scrub_lock);
2692			wait_event(fs_info->scrub_pause_wait,
2693			   atomic_read(&fs_info->scrub_pause_req) == 0);
2694			mutex_lock(&fs_info->scrub_lock);
2695		}
2696		atomic_dec(&fs_info->scrubs_paused);
2697		mutex_unlock(&fs_info->scrub_lock);
2698		wake_up(&fs_info->scrub_pause_wait);
2699
2700		btrfs_put_block_group(cache);
2701		if (ret)
2702			break;
2703		if (is_dev_replace &&
2704		    atomic64_read(&dev_replace->num_write_errors) > 0) {
2705			ret = -EIO;
2706			break;
2707		}
2708		if (sctx->stat.malloc_errors > 0) {
2709			ret = -ENOMEM;
2710			break;
2711		}
2712
2713		dev_replace->cursor_left = dev_replace->cursor_right;
2714		dev_replace->item_needs_writeback = 1;
2715
2716		key.offset = found_key.offset + length;
2717		btrfs_release_path(path);
2718	}
2719
2720	btrfs_free_path(path);
2721
2722	/*
2723	 * ret can still be 1 from search_slot or next_leaf,
2724	 * that's not an error
2725	 */
2726	return ret < 0 ? ret : 0;
2727}
2728
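/*
 * scrub_supers() - scrub all super block copies of one device.
 *
 * Every mirror at btrfs_sb_offset(i) that fits within the device is read
 * through scrub_pages() with force = 1 (submit immediately) and verified
 * against the last committed transaction generation. Errors are only
 * reported; super blocks are rewritten by later transaction commits.
 */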
2729static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2730					   struct btrfs_device *scrub_dev)
2731{
2732	int	i;
2733	u64	bytenr;
2734	u64	gen;
2735	int	ret;
2736	struct btrfs_root *root = sctx->dev_root;
2737
2738	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2739		return -EIO;
2740
2741	gen = root->fs_info->last_trans_committed;
2742
2743	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2744		bytenr = btrfs_sb_offset(i);
2745		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2746			break;
2747
2748		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2749				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2750				  NULL, 1, bytenr);
2751		if (ret)
2752			return ret;
2753	}
2754	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2755
2756	return 0;
2757}
2758
2759/*
2760 * get a reference count on fs_info->scrub_workers. start workers if necessary
2761 */
2762static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2763						int is_dev_replace)
2764{
2765	int ret = 0;
2766
2767	if (fs_info->scrub_workers_refcnt == 0) {
2768		if (is_dev_replace)
2769			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2770					&fs_info->generic_worker);
2771		else
2772			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2773					fs_info->thread_pool_size,
2774					&fs_info->generic_worker);
2775		fs_info->scrub_workers.idle_thresh = 4;
2776		ret = btrfs_start_workers(&fs_info->scrub_workers);
2777		if (ret)
2778			goto out;
2779		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2780				   "scrubwrc",
2781				   fs_info->thread_pool_size,
2782				   &fs_info->generic_worker);
2783		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2784		ret = btrfs_start_workers(
2785				&fs_info->scrub_wr_completion_workers);
2786		if (ret)
2787			goto out;
2788		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2789				   &fs_info->generic_worker);
2790		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2791		if (ret)
2792			goto out;
2793	}
2794	++fs_info->scrub_workers_refcnt;
2795out:
2796	return ret;
2797}
2798
2799static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2800{
2801	if (--fs_info->scrub_workers_refcnt == 0) {
2802		btrfs_stop_workers(&fs_info->scrub_workers);
2803		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2804		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2805	}
2806	WARN_ON(fs_info->scrub_workers_refcnt < 0);
2807}
2808
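/*
 * btrfs_scrub_dev() - main entry point for scrubbing one device.
 *
 * After validating the size assumptions (nodesize == leafsize,
 * nodesize <= BTRFS_STRIPE_LEN, sectorsize == PAGE_SIZE, and the
 * per-block page limits), the device is looked up and a scrub context
 * plus the worker threads are set up, refusing to run concurrently with
 * another scrub or a dev-replace on the same device. The super blocks
 * are scrubbed first (skipped in the dev-replace case), then all chunks
 * in [start, end); finally the progress is copied to *progress and the
 * context is torn down.
 */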
2809int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2810		    u64 end, struct btrfs_scrub_progress *progress,
2811		    int readonly, int is_dev_replace)
2812{
2813	struct scrub_ctx *sctx;
2814	int ret;
2815	struct btrfs_device *dev;
2816
2817	if (btrfs_fs_closing(fs_info))
2818		return -EINVAL;
2819
2820	/*
2821	 * check some assumptions
2822	 */
2823	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2824		printk(KERN_ERR
2825		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2826		       fs_info->chunk_root->nodesize,
2827		       fs_info->chunk_root->leafsize);
2828		return -EINVAL;
2829	}
2830
2831	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2832		/*
2833		 * in this case scrub cannot calculate the checksum the way
2834		 * it is implemented. Do not handle this situation at all
2835		 * because it won't ever happen.
2836		 */
2837		printk(KERN_ERR
2838		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2839		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2840		return -EINVAL;
2841	}
2842
2843	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2844		/* not supported for data w/o checksums */
2845		printk(KERN_ERR
2846		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
2847		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
2848		return -EINVAL;
2849	}
2850
2851	if (fs_info->chunk_root->nodesize >
2852	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2853	    fs_info->chunk_root->sectorsize >
2854	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2855		/*
2856		 * would exhaust the array bounds of pagev member in
2857		 * struct scrub_block
2858		 */
2859		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2860		       fs_info->chunk_root->nodesize,
2861		       SCRUB_MAX_PAGES_PER_BLOCK,
2862		       fs_info->chunk_root->sectorsize,
2863		       SCRUB_MAX_PAGES_PER_BLOCK);
2864		return -EINVAL;
2865	}
2866
2867
2868	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2869	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2870	if (!dev || (dev->missing && !is_dev_replace)) {
2871		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2872		return -ENODEV;
2873	}
2874
2875	mutex_lock(&fs_info->scrub_lock);
2876	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2877		mutex_unlock(&fs_info->scrub_lock);
2878		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2879		return -EIO;
2880	}
2881
2882	btrfs_dev_replace_lock(&fs_info->dev_replace);
2883	if (dev->scrub_device ||
2884	    (!is_dev_replace &&
2885	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2886		btrfs_dev_replace_unlock(&fs_info->dev_replace);
2887		mutex_unlock(&fs_info->scrub_lock);
2888		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2889		return -EINPROGRESS;
2890	}
2891	btrfs_dev_replace_unlock(&fs_info->dev_replace);
2892
2893	ret = scrub_workers_get(fs_info, is_dev_replace);
2894	if (ret) {
2895		mutex_unlock(&fs_info->scrub_lock);
2896		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2897		return ret;
2898	}
2899
2900	sctx = scrub_setup_ctx(dev, is_dev_replace);
2901	if (IS_ERR(sctx)) {
2902		mutex_unlock(&fs_info->scrub_lock);
2903		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2904		scrub_workers_put(fs_info);
2905		return PTR_ERR(sctx);
2906	}
2907	sctx->readonly = readonly;
2908	dev->scrub_device = sctx;
2909
2910	atomic_inc(&fs_info->scrubs_running);
2911	mutex_unlock(&fs_info->scrub_lock);
2912
2913	if (!is_dev_replace) {
2914		/*
2915		 * by holding device list mutex, we can
2916		 * kick off writing super in log tree sync.
2917		 */
2918		ret = scrub_supers(sctx, dev);
2919	}
2920	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2921
2922	if (!ret)
2923		ret = scrub_enumerate_chunks(sctx, dev, start, end,
2924					     is_dev_replace);
2925
2926	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2927	atomic_dec(&fs_info->scrubs_running);
2928	wake_up(&fs_info->scrub_pause_wait);
2929
2930	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2931
2932	if (progress)
2933		memcpy(progress, &sctx->stat, sizeof(*progress));
2934
2935	mutex_lock(&fs_info->scrub_lock);
2936	dev->scrub_device = NULL;
2937	scrub_workers_put(fs_info);
2938	mutex_unlock(&fs_info->scrub_lock);
2939
2940	scrub_free_ctx(sctx);
2941
2942	return ret;
2943}
2944
2945void btrfs_scrub_pause(struct btrfs_root *root)
2946{
2947	struct btrfs_fs_info *fs_info = root->fs_info;
2948
2949	mutex_lock(&fs_info->scrub_lock);
2950	atomic_inc(&fs_info->scrub_pause_req);
2951	while (atomic_read(&fs_info->scrubs_paused) !=
2952	       atomic_read(&fs_info->scrubs_running)) {
2953		mutex_unlock(&fs_info->scrub_lock);
2954		wait_event(fs_info->scrub_pause_wait,
2955			   atomic_read(&fs_info->scrubs_paused) ==
2956			   atomic_read(&fs_info->scrubs_running));
2957		mutex_lock(&fs_info->scrub_lock);
2958	}
2959	mutex_unlock(&fs_info->scrub_lock);
2960}
2961
2962void btrfs_scrub_continue(struct btrfs_root *root)
2963{
2964	struct btrfs_fs_info *fs_info = root->fs_info;
2965
2966	atomic_dec(&fs_info->scrub_pause_req);
2967	wake_up(&fs_info->scrub_pause_wait);
2968}
2969
2970int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2971{
2972	mutex_lock(&fs_info->scrub_lock);
2973	if (!atomic_read(&fs_info->scrubs_running)) {
2974		mutex_unlock(&fs_info->scrub_lock);
2975		return -ENOTCONN;
2976	}
2977
2978	atomic_inc(&fs_info->scrub_cancel_req);
2979	while (atomic_read(&fs_info->scrubs_running)) {
2980		mutex_unlock(&fs_info->scrub_lock);
2981		wait_event(fs_info->scrub_pause_wait,
2982			   atomic_read(&fs_info->scrubs_running) == 0);
2983		mutex_lock(&fs_info->scrub_lock);
2984	}
2985	atomic_dec(&fs_info->scrub_cancel_req);
2986	mutex_unlock(&fs_info->scrub_lock);
2987
2988	return 0;
2989}
2990
2991int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2992			   struct btrfs_device *dev)
2993{
2994	struct scrub_ctx *sctx;
2995
2996	mutex_lock(&fs_info->scrub_lock);
2997	sctx = dev->scrub_device;
2998	if (!sctx) {
2999		mutex_unlock(&fs_info->scrub_lock);
3000		return -ENOTCONN;
3001	}
3002	atomic_inc(&sctx->cancel_req);
3003	while (dev->scrub_device) {
3004		mutex_unlock(&fs_info->scrub_lock);
3005		wait_event(fs_info->scrub_pause_wait,
3006			   dev->scrub_device == NULL);
3007		mutex_lock(&fs_info->scrub_lock);
3008	}
3009	mutex_unlock(&fs_info->scrub_lock);
3010
3011	return 0;
3012}
3013
3014int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3015			 struct btrfs_scrub_progress *progress)
3016{
3017	struct btrfs_device *dev;
3018	struct scrub_ctx *sctx = NULL;
3019
3020	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3021	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3022	if (dev)
3023		sctx = dev->scrub_device;
3024	if (sctx)
3025		memcpy(progress, &sctx->stat, sizeof(*progress));
3026	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3027
3028	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3029}
3030
3031static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3032			       u64 extent_logical, u64 extent_len,
3033			       u64 *extent_physical,
3034			       struct btrfs_device **extent_dev,
3035			       int *extent_mirror_num)
3036{
3037	u64 mapped_length;
3038	struct btrfs_bio *bbio = NULL;
3039	int ret;
3040
3041	mapped_length = extent_len;
3042	ret = btrfs_map_block(fs_info, READ, extent_logical,
3043			      &mapped_length, &bbio, 0);
3044	if (ret || !bbio || mapped_length < extent_len ||
3045	    !bbio->stripes[0].dev->bdev) {
3046		kfree(bbio);
3047		return;
3048	}
3049
3050	*extent_physical = bbio->stripes[0].physical;
3051	*extent_mirror_num = bbio->mirror_num;
3052	*extent_dev = bbio->stripes[0].dev;
3053	kfree(bbio);
3054}
3055
3056static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3057			      struct scrub_wr_ctx *wr_ctx,
3058			      struct btrfs_fs_info *fs_info,
3059			      struct btrfs_device *dev,
3060			      int is_dev_replace)
3061{
3062	WARN_ON(wr_ctx->wr_curr_bio != NULL);
3063
3064	mutex_init(&wr_ctx->wr_lock);
3065	wr_ctx->wr_curr_bio = NULL;
3066	if (!is_dev_replace)
3067		return 0;
3068
3069	WARN_ON(!dev->bdev);
3070	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3071					 bio_get_nr_vecs(dev->bdev));
3072	wr_ctx->tgtdev = dev;
3073	atomic_set(&wr_ctx->flush_all_writes, 0);
3074	return 0;
3075}
3076
3077static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3078{
3079	mutex_lock(&wr_ctx->wr_lock);
3080	kfree(wr_ctx->wr_curr_bio);
3081	wr_ctx->wr_curr_bio = NULL;
3082	mutex_unlock(&wr_ctx->wr_lock);
3083}
3084
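/*
 * copy_nocow_pages() - dev-replace path for data without checksums
 * (e.g. nodatacow files).
 *
 * Such data extents are not copied through the normal scrub bios;
 * instead a worker is queued that copies the pages through the page
 * cache of the owning inodes to the target device (see
 * copy_nocow_pages_worker() below).
 */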
3085static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3086			    int mirror_num, u64 physical_for_dev_replace)
3087{
3088	struct scrub_copy_nocow_ctx *nocow_ctx;
3089	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3090
3091	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3092	if (!nocow_ctx) {
3093		spin_lock(&sctx->stat_lock);
3094		sctx->stat.malloc_errors++;
3095		spin_unlock(&sctx->stat_lock);
3096		return -ENOMEM;
3097	}
3098
3099	scrub_pending_trans_workers_inc(sctx);
3100
3101	nocow_ctx->sctx = sctx;
3102	nocow_ctx->logical = logical;
3103	nocow_ctx->len = len;
3104	nocow_ctx->mirror_num = mirror_num;
3105	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3106	nocow_ctx->work.func = copy_nocow_pages_worker;
3107	INIT_LIST_HEAD(&nocow_ctx->inodes);
3108	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3109			   &nocow_ctx->work);
3110
3111	return 0;
3112}
3113
3114static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3115{
3116	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3117	struct scrub_nocow_inode *nocow_inode;
3118
3119	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3120	if (!nocow_inode)
3121		return -ENOMEM;
3122	nocow_inode->inum = inum;
3123	nocow_inode->offset = offset;
3124	nocow_inode->root = root;
3125	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3126	return 0;
3127}
3128
3129#define COPY_COMPLETE 1
3130
3131static void copy_nocow_pages_worker(struct btrfs_work *work)
3132{
3133	struct scrub_copy_nocow_ctx *nocow_ctx =
3134		container_of(work, struct scrub_copy_nocow_ctx, work);
3135	struct scrub_ctx *sctx = nocow_ctx->sctx;
3136	u64 logical = nocow_ctx->logical;
3137	u64 len = nocow_ctx->len;
3138	int mirror_num = nocow_ctx->mirror_num;
3139	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3140	int ret;
3141	struct btrfs_trans_handle *trans = NULL;
3142	struct btrfs_fs_info *fs_info;
3143	struct btrfs_path *path;
3144	struct btrfs_root *root;
3145	int not_written = 0;
3146
3147	fs_info = sctx->dev_root->fs_info;
3148	root = fs_info->extent_root;
3149
3150	path = btrfs_alloc_path();
3151	if (!path) {
3152		spin_lock(&sctx->stat_lock);
3153		sctx->stat.malloc_errors++;
3154		spin_unlock(&sctx->stat_lock);
3155		not_written = 1;
3156		goto out;
3157	}
3158
3159	trans = btrfs_join_transaction(root);
3160	if (IS_ERR(trans)) {
3161		not_written = 1;
3162		goto out;
3163	}
3164
3165	ret = iterate_inodes_from_logical(logical, fs_info, path,
3166					  record_inode_for_nocow, nocow_ctx);
3167	if (ret != 0 && ret != -ENOENT) {
3168		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
3169			logical, physical_for_dev_replace, len, mirror_num,
3170			ret);
3171		not_written = 1;
3172		goto out;
3173	}
3174
3175	btrfs_end_transaction(trans, root);
3176	trans = NULL;
3177	while (!list_empty(&nocow_ctx->inodes)) {
3178		struct scrub_nocow_inode *entry;
3179		entry = list_first_entry(&nocow_ctx->inodes,
3180					 struct scrub_nocow_inode,
3181					 list);
3182		list_del_init(&entry->list);
3183		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3184						 entry->root, nocow_ctx);
3185		kfree(entry);
3186		if (ret == COPY_COMPLETE) {
3187			ret = 0;
3188			break;
3189		} else if (ret) {
3190			break;
3191		}
3192	}
3193out:
3194	while (!list_empty(&nocow_ctx->inodes)) {
3195		struct scrub_nocow_inode *entry;
3196		entry = list_first_entry(&nocow_ctx->inodes,
3197					 struct scrub_nocow_inode,
3198					 list);
3199		list_del_init(&entry->list);
3200		kfree(entry);
3201	}
3202	if (trans && !IS_ERR(trans))
3203		btrfs_end_transaction(trans, root);
3204	if (not_written)
3205		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3206					    num_uncorrectable_read_errors);
3207
3208	btrfs_free_path(path);
3209	kfree(nocow_ctx);
3210
3211	scrub_pending_trans_workers_dec(sctx);
3212}
3213
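/*
 * Copy the nocow range of one inode to the dev-replace target: the
 * extent range is locked, and the copy is skipped if an ordered extent
 * is pending or the extent mapping no longer covers the logical range
 * (the data has moved meanwhile). Otherwise each page is read through
 * the page cache, re-validated after locking, and written to
 * physical_for_dev_replace with write_page_nocow().
 */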
3214static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3215				      struct scrub_copy_nocow_ctx *nocow_ctx)
3216{
3217	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3218	struct btrfs_key key;
3219	struct inode *inode;
3220	struct page *page;
3221	struct btrfs_root *local_root;
3222	struct btrfs_ordered_extent *ordered;
3223	struct extent_map *em;
3224	struct extent_state *cached_state = NULL;
3225	struct extent_io_tree *io_tree;
3226	u64 physical_for_dev_replace;
3227	u64 len = nocow_ctx->len;
3228	u64 lockstart = offset, lockend = offset + len - 1;
3229	unsigned long index;
3230	int srcu_index;
3231	int ret = 0;
3232	int err = 0;
3233
3234	key.objectid = root;
3235	key.type = BTRFS_ROOT_ITEM_KEY;
3236	key.offset = (u64)-1;
3237
3238	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3239
3240	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3241	if (IS_ERR(local_root)) {
3242		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3243		return PTR_ERR(local_root);
3244	}
3245
3246	key.type = BTRFS_INODE_ITEM_KEY;
3247	key.objectid = inum;
3248	key.offset = 0;
3249	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3250	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3251	if (IS_ERR(inode))
3252		return PTR_ERR(inode);
3253
3254	/* Avoid truncate/dio/punch hole... */
3255	mutex_lock(&inode->i_mutex);
3256	inode_dio_wait(inode);
3257
3258	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3259	io_tree = &BTRFS_I(inode)->io_tree;
3260
3261	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
3262	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
3263	if (ordered) {
3264		btrfs_put_ordered_extent(ordered);
3265		goto out_unlock;
3266	}
3267
3268	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3269	if (IS_ERR(em)) {
3270		ret = PTR_ERR(em);
3271		goto out_unlock;
3272	}
3273
3274	/*
3275	 * This extent does not actually cover the logical extent anymore,
3276	 * move on to the next inode.
3277	 */
3278	if (em->block_start > nocow_ctx->logical ||
3279	    em->block_start + em->block_len < nocow_ctx->logical + len) {
3280		free_extent_map(em);
3281		goto out_unlock;
3282	}
3283	free_extent_map(em);
3284
3285	while (len >= PAGE_CACHE_SIZE) {
3286		index = offset >> PAGE_CACHE_SHIFT;
3287again:
3288		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3289		if (!page) {
3290			pr_err("find_or_create_page() failed\n");
3291			ret = -ENOMEM;
3292			goto out;
3293		}
3294
3295		if (PageUptodate(page)) {
3296			if (PageDirty(page))
3297				goto next_page;
3298		} else {
3299			ClearPageError(page);
3300			err = extent_read_full_page_nolock(io_tree, page,
3301							   btrfs_get_extent,
3302							   nocow_ctx->mirror_num);
3303			if (err) {
3304				ret = err;
3305				goto next_page;
3306			}
3307
3308			lock_page(page);
3309			/*
3310			 * If the page has been removed from the page cache,
3311			 * the data on it is meaningless, because it may be
3312			 * the old one; the new data may have been written into
3313			 * a new page in the page cache.
3314			 */
3315			if (page->mapping != inode->i_mapping) {
3316				unlock_page(page);
3317				page_cache_release(page);
3318				goto again;
3319			}
3320			if (!PageUptodate(page)) {
3321				ret = -EIO;
3322				goto next_page;
3323			}
3324		}
3325		err = write_page_nocow(nocow_ctx->sctx,
3326				       physical_for_dev_replace, page);
3327		if (err)
3328			ret = err;
3329next_page:
3330		unlock_page(page);
3331		page_cache_release(page);
3332
3333		if (ret)
3334			break;
3335
3336		offset += PAGE_CACHE_SIZE;
3337		physical_for_dev_replace += PAGE_CACHE_SIZE;
3338		len -= PAGE_CACHE_SIZE;
3339	}
3340	ret = COPY_COMPLETE;
3341out_unlock:
3342	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3343			     GFP_NOFS);
3344out:
3345	mutex_unlock(&inode->i_mutex);
3346	iput(inode);
3347	return ret;
3348}
3349
3350static int write_page_nocow(struct scrub_ctx *sctx,
3351			    u64 physical_for_dev_replace, struct page *page)
3352{
3353	struct bio *bio;
3354	struct btrfs_device *dev;
3355	int ret;
3356
3357	dev = sctx->wr_ctx.tgtdev;
3358	if (!dev)
3359		return -EIO;
3360	if (!dev->bdev) {
3361		printk_ratelimited(KERN_WARNING
3362			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3363		return -EIO;
3364	}
3365	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
3366	if (!bio) {
3367		spin_lock(&sctx->stat_lock);
3368		sctx->stat.malloc_errors++;
3369		spin_unlock(&sctx->stat_lock);
3370		return -ENOMEM;
3371	}
3372	bio->bi_size = 0;
3373	bio->bi_sector = physical_for_dev_replace >> 9;
3374	bio->bi_bdev = dev->bdev;
3375	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3376	if (ret != PAGE_CACHE_SIZE) {
3377leave_with_eio:
3378		bio_put(bio);
3379		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3380		return -EIO;
3381	}
3382
3383	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
3384		goto leave_with_eio;
3385
3386	bio_put(bio);
3387	return 0;
3388}
3389