scrub.c revision 3cae210fa529d69cb25c2a3c491f29dab687b245
1/*
2 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
21#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
28#include "dev-replace.h"
29#include "check-integrity.h"
30#include "rcu-string.h"
31#include "raid56.h"
32
33/*
34 * This is only the first step towards a full-featured scrub. It reads all
35 * extents and super blocks and verifies the checksums. In case a bad checksum
36 * is found or the extent cannot be read, good data will be written back if
37 * any can be found.
38 *
39 * Future enhancements:
40 *  - In case an unrepairable extent is encountered, track which files are
41 *    affected and report them
42 *  - track and record media errors, throw out bad devices
43 *  - add a mode to also read unallocated space
44 */
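/*
 * Note added for this annotated listing (not in the original source): from
 * user space a scrub is typically started with the btrfs-progs command
 * "btrfs scrub start <mountpoint>" and observed with "btrfs scrub status";
 * both end up driving the code in this file through the scrub ioctls.
 */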
45
46struct scrub_block;
47struct scrub_ctx;
48
49/*
50 * the following three values only influence the performance.
51 * The last one configures the number of parallel and outstanding I/O
52 * operations. The first two values configure an upper limit for the number
53 * of (dynamically allocated) pages that are added to a bio.
54 */
55#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
56#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
57#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
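/*
 * Worked example (added commentary, assuming PAGE_SIZE == 4 KiB):
 *   SCRUB_PAGES_PER_RD_BIO * PAGE_SIZE = 32 * 4 KiB   = 128 KiB per bio
 *   SCRUB_BIOS_PER_SCTX * 128 KiB      = 64 * 128 KiB = 8 MiB in flight
 * which is where the "128k per bio" and "8MB per device" figures above
 * come from.
 */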
58
59/*
60 * the following value times PAGE_SIZE needs to be large enough to match the
61 * largest node/leaf/sector size that shall be supported.
62 * Values larger than BTRFS_STRIPE_LEN are not supported.
63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
65
66struct scrub_page {
67	struct scrub_block	*sblock;
68	struct page		*page;
69	struct btrfs_device	*dev;
70	u64			flags;  /* extent flags */
71	u64			generation;
72	u64			logical;
73	u64			physical;
74	u64			physical_for_dev_replace;
75	atomic_t		ref_count;
76	struct {
77		unsigned int	mirror_num:8;
78		unsigned int	have_csum:1;
79		unsigned int	io_error:1;
80	};
81	u8			csum[BTRFS_CSUM_SIZE];
82};
83
84struct scrub_bio {
85	int			index;
86	struct scrub_ctx	*sctx;
87	struct btrfs_device	*dev;
88	struct bio		*bio;
89	int			err;
90	u64			logical;
91	u64			physical;
92#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
93	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
94#else
95	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
96#endif
97	int			page_count;
98	int			next_free;
99	struct btrfs_work	work;
100};
101
102struct scrub_block {
103	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
104	int			page_count;
105	atomic_t		outstanding_pages;
106	atomic_t		ref_count; /* free mem on transition to zero */
107	struct scrub_ctx	*sctx;
108	struct {
109		unsigned int	header_error:1;
110		unsigned int	checksum_error:1;
111		unsigned int	no_io_error_seen:1;
112		unsigned int	generation_error:1; /* also sets header_error */
113	};
114};
115
116struct scrub_wr_ctx {
117	struct scrub_bio *wr_curr_bio;
118	struct btrfs_device *tgtdev;
119	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
120	atomic_t flush_all_writes;
121	struct mutex wr_lock;
122};
123
124struct scrub_ctx {
125	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
126	struct btrfs_root	*dev_root;
127	int			first_free;
128	int			curr;
129	atomic_t		bios_in_flight;
130	atomic_t		workers_pending;
131	spinlock_t		list_lock;
132	wait_queue_head_t	list_wait;
133	u16			csum_size;
134	struct list_head	csum_list;
135	atomic_t		cancel_req;
136	int			readonly;
137	int			pages_per_rd_bio;
138	u32			sectorsize;
139	u32			nodesize;
140	u32			leafsize;
141
142	int			is_dev_replace;
143	struct scrub_wr_ctx	wr_ctx;
144
145	/*
146	 * statistics
147	 */
148	struct btrfs_scrub_progress stat;
149	spinlock_t		stat_lock;
150};
151
152struct scrub_fixup_nodatasum {
153	struct scrub_ctx	*sctx;
154	struct btrfs_device	*dev;
155	u64			logical;
156	struct btrfs_root	*root;
157	struct btrfs_work	work;
158	int			mirror_num;
159};
160
161struct scrub_copy_nocow_ctx {
162	struct scrub_ctx	*sctx;
163	u64			logical;
164	u64			len;
165	int			mirror_num;
166	u64			physical_for_dev_replace;
167	struct btrfs_work	work;
168};
169
170struct scrub_warning {
171	struct btrfs_path	*path;
172	u64			extent_item_size;
173	char			*scratch_buf;
174	char			*msg_buf;
175	const char		*errstr;
176	sector_t		sector;
177	u64			logical;
178	struct btrfs_device	*dev;
179	int			msg_bufsize;
180	int			scratch_bufsize;
181};
182
183
184static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
185static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
187static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
188static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
189static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
190				     struct btrfs_fs_info *fs_info,
191				     struct scrub_block *original_sblock,
192				     u64 length, u64 logical,
193				     struct scrub_block *sblocks_for_recheck);
194static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
195				struct scrub_block *sblock, int is_metadata,
196				int have_csum, u8 *csum, u64 generation,
197				u16 csum_size);
198static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
199					 struct scrub_block *sblock,
200					 int is_metadata, int have_csum,
201					 const u8 *csum, u64 generation,
202					 u16 csum_size);
203static void scrub_complete_bio_end_io(struct bio *bio, int err);
204static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
205					     struct scrub_block *sblock_good,
206					     int force_write);
207static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
208					    struct scrub_block *sblock_good,
209					    int page_num, int force_write);
210static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
211static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
212					   int page_num);
213static int scrub_checksum_data(struct scrub_block *sblock);
214static int scrub_checksum_tree_block(struct scrub_block *sblock);
215static int scrub_checksum_super(struct scrub_block *sblock);
216static void scrub_block_get(struct scrub_block *sblock);
217static void scrub_block_put(struct scrub_block *sblock);
218static void scrub_page_get(struct scrub_page *spage);
219static void scrub_page_put(struct scrub_page *spage);
220static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
221				    struct scrub_page *spage);
222static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
223		       u64 physical, struct btrfs_device *dev, u64 flags,
224		       u64 gen, int mirror_num, u8 *csum, int force,
225		       u64 physical_for_dev_replace);
226static void scrub_bio_end_io(struct bio *bio, int err);
227static void scrub_bio_end_io_worker(struct btrfs_work *work);
228static void scrub_block_complete(struct scrub_block *sblock);
229static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
230			       u64 extent_logical, u64 extent_len,
231			       u64 *extent_physical,
232			       struct btrfs_device **extent_dev,
233			       int *extent_mirror_num);
234static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
235			      struct scrub_wr_ctx *wr_ctx,
236			      struct btrfs_fs_info *fs_info,
237			      struct btrfs_device *dev,
238			      int is_dev_replace);
239static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
240static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
241				    struct scrub_page *spage);
242static void scrub_wr_submit(struct scrub_ctx *sctx);
243static void scrub_wr_bio_end_io(struct bio *bio, int err);
244static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
245static int write_page_nocow(struct scrub_ctx *sctx,
246			    u64 physical_for_dev_replace, struct page *page);
247static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
248				      void *ctx);
249static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
250			    int mirror_num, u64 physical_for_dev_replace);
251static void copy_nocow_pages_worker(struct btrfs_work *work);
252
253
254static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
255{
256	atomic_inc(&sctx->bios_in_flight);
257}
258
259static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
260{
261	atomic_dec(&sctx->bios_in_flight);
262	wake_up(&sctx->list_wait);
263}
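/*
 * Minimal sketch (added commentary, not part of the original file) of how
 * callers in this file pair the two helpers above: the counter goes up
 * before a bio is submitted, the end_io worker drops it again, and anyone
 * draining the context sleeps on list_wait.
 */
#if 0
	scrub_pending_bio_inc(sctx);
	btrfsic_submit_bio(READ, sbio->bio);
	/* ... later, from the bio completion worker ... */
	scrub_pending_bio_dec(sctx);
	/* ... and from whoever waits for the device to become idle ... */
	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
#endif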
264
265/*
266 * used for workers that require transaction commits (i.e., for the
267 * NOCOW case)
268 */
269static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
270{
271	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
272
273	/*
274	 * increment scrubs_running to prevent cancel requests from
275	 * completing as long as a worker is running. we must also
276	 * increment scrubs_paused to prevent deadlocking on pause
277	 * requests used for transaction commits (as the worker uses a
278	 * transaction context). it is safe to regard the worker
279	 * as paused for all practical matters. effectively, we only
280	 * prevent cancellation requests from completing.
281	 */
282	mutex_lock(&fs_info->scrub_lock);
283	atomic_inc(&fs_info->scrubs_running);
284	atomic_inc(&fs_info->scrubs_paused);
285	mutex_unlock(&fs_info->scrub_lock);
286	atomic_inc(&sctx->workers_pending);
287}
288
289/* used for workers that require transaction commits */
290static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
291{
292	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
293
294	/*
295	 * see scrub_pending_trans_workers_inc() for why we're pretending
296	 * to be paused in the scrub counters
297	 */
298	mutex_lock(&fs_info->scrub_lock);
299	atomic_dec(&fs_info->scrubs_running);
300	atomic_dec(&fs_info->scrubs_paused);
301	mutex_unlock(&fs_info->scrub_lock);
302	atomic_dec(&sctx->workers_pending);
303	wake_up(&fs_info->scrub_pause_wait);
304	wake_up(&sctx->list_wait);
305}
306
307static void scrub_free_csums(struct scrub_ctx *sctx)
308{
309	while (!list_empty(&sctx->csum_list)) {
310		struct btrfs_ordered_sum *sum;
311		sum = list_first_entry(&sctx->csum_list,
312				       struct btrfs_ordered_sum, list);
313		list_del(&sum->list);
314		kfree(sum);
315	}
316}
317
318static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
319{
320	int i;
321
322	if (!sctx)
323		return;
324
325	scrub_free_wr_ctx(&sctx->wr_ctx);
326
327	/* this can happen when scrub is cancelled */
328	if (sctx->curr != -1) {
329		struct scrub_bio *sbio = sctx->bios[sctx->curr];
330
331		for (i = 0; i < sbio->page_count; i++) {
332			WARN_ON(!sbio->pagev[i]->page);
333			scrub_block_put(sbio->pagev[i]->sblock);
334		}
335		bio_put(sbio->bio);
336	}
337
338	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
339		struct scrub_bio *sbio = sctx->bios[i];
340
341		if (!sbio)
342			break;
343		kfree(sbio);
344	}
345
346	scrub_free_csums(sctx);
347	kfree(sctx);
348}
349
350static noinline_for_stack
351struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
352{
353	struct scrub_ctx *sctx;
354	int		i;
355	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
356	int pages_per_rd_bio;
357	int ret;
358
359	/*
360	 * the setting of pages_per_rd_bio is correct for scrub but might
361	 * be wrong for the dev_replace code where we might read from
362	 * different devices in the initial huge bios. However, that
363	 * code is able to correctly handle the case when adding a page
364	 * to a bio fails.
365	 */
366	if (dev->bdev)
367		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
368					 bio_get_nr_vecs(dev->bdev));
369	else
370		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
371	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
372	if (!sctx)
373		goto nomem;
374	sctx->is_dev_replace = is_dev_replace;
375	sctx->pages_per_rd_bio = pages_per_rd_bio;
376	sctx->curr = -1;
377	sctx->dev_root = dev->dev_root;
378	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
379		struct scrub_bio *sbio;
380
381		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
382		if (!sbio)
383			goto nomem;
384		sctx->bios[i] = sbio;
385
386		sbio->index = i;
387		sbio->sctx = sctx;
388		sbio->page_count = 0;
389		sbio->work.func = scrub_bio_end_io_worker;
390
391		if (i != SCRUB_BIOS_PER_SCTX - 1)
392			sctx->bios[i]->next_free = i + 1;
393		else
394			sctx->bios[i]->next_free = -1;
395	}
396	sctx->first_free = 0;
397	sctx->nodesize = dev->dev_root->nodesize;
398	sctx->leafsize = dev->dev_root->leafsize;
399	sctx->sectorsize = dev->dev_root->sectorsize;
400	atomic_set(&sctx->bios_in_flight, 0);
401	atomic_set(&sctx->workers_pending, 0);
402	atomic_set(&sctx->cancel_req, 0);
403	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
404	INIT_LIST_HEAD(&sctx->csum_list);
405
406	spin_lock_init(&sctx->list_lock);
407	spin_lock_init(&sctx->stat_lock);
408	init_waitqueue_head(&sctx->list_wait);
409
410	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
411				 fs_info->dev_replace.tgtdev, is_dev_replace);
412	if (ret) {
413		scrub_free_ctx(sctx);
414		return ERR_PTR(ret);
415	}
416	return sctx;
417
418nomem:
419	scrub_free_ctx(sctx);
420	return ERR_PTR(-ENOMEM);
421}
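/*
 * Usage sketch (added commentary, not part of the original file): one
 * scrub_ctx is set up per scrubbed device and torn down when the run ends.
 */
#if 0
	struct scrub_ctx *sctx;

	sctx = scrub_setup_ctx(dev, 0 /* regular scrub, not dev-replace */);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);
	/* ... enumerate the device's chunks and queue read bios ... */
	scrub_free_ctx(sctx);
#endif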
422
423static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
424				     void *warn_ctx)
425{
426	u64 isize;
427	u32 nlink;
428	int ret;
429	int i;
430	struct extent_buffer *eb;
431	struct btrfs_inode_item *inode_item;
432	struct scrub_warning *swarn = warn_ctx;
433	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
434	struct inode_fs_paths *ipath = NULL;
435	struct btrfs_root *local_root;
436	struct btrfs_key root_key;
437
438	root_key.objectid = root;
439	root_key.type = BTRFS_ROOT_ITEM_KEY;
440	root_key.offset = (u64)-1;
441	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
442	if (IS_ERR(local_root)) {
443		ret = PTR_ERR(local_root);
444		goto err;
445	}
446
447	ret = inode_item_info(inum, 0, local_root, swarn->path);
448	if (ret) {
449		btrfs_release_path(swarn->path);
450		goto err;
451	}
452
453	eb = swarn->path->nodes[0];
454	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
455					struct btrfs_inode_item);
456	isize = btrfs_inode_size(eb, inode_item);
457	nlink = btrfs_inode_nlink(eb, inode_item);
458	btrfs_release_path(swarn->path);
459
460	ipath = init_ipath(4096, local_root, swarn->path);
461	if (IS_ERR(ipath)) {
462		ret = PTR_ERR(ipath);
463		ipath = NULL;
464		goto err;
465	}
466	ret = paths_from_inode(inum, ipath);
467
468	if (ret < 0)
469		goto err;
470
471	/*
472	 * we deliberately ignore the fact that ipath might have been too small to
473	 * hold all of the paths here
474	 */
475	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
476		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
477			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
478			"length %llu, links %u (path: %s)\n", swarn->errstr,
479			swarn->logical, rcu_str_deref(swarn->dev->name),
480			(unsigned long long)swarn->sector, root, inum, offset,
481			min(isize - offset, (u64)PAGE_SIZE), nlink,
482			(char *)(unsigned long)ipath->fspath->val[i]);
483
484	free_ipath(ipath);
485	return 0;
486
487err:
488	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
489		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
490		"resolving failed with ret=%d\n", swarn->errstr,
491		swarn->logical, rcu_str_deref(swarn->dev->name),
492		(unsigned long long)swarn->sector, root, inum, offset, ret);
493
494	free_ipath(ipath);
495	return 0;
496}
497
498static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
499{
500	struct btrfs_device *dev;
501	struct btrfs_fs_info *fs_info;
502	struct btrfs_path *path;
503	struct btrfs_key found_key;
504	struct extent_buffer *eb;
505	struct btrfs_extent_item *ei;
506	struct scrub_warning swarn;
507	unsigned long ptr = 0;
508	u64 extent_item_pos;
509	u64 flags = 0;
510	u64 ref_root;
511	u32 item_size;
512	u8 ref_level;
513	const int bufsize = 4096;
514	int ret;
515
516	WARN_ON(sblock->page_count < 1);
517	dev = sblock->pagev[0]->dev;
518	fs_info = sblock->sctx->dev_root->fs_info;
519
520	path = btrfs_alloc_path();
521
522	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
523	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
524	swarn.sector = (sblock->pagev[0]->physical) >> 9;
525	swarn.logical = sblock->pagev[0]->logical;
526	swarn.errstr = errstr;
527	swarn.dev = NULL;
528	swarn.msg_bufsize = bufsize;
529	swarn.scratch_bufsize = bufsize;
530
531	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
532		goto out;
533
534	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
535				  &flags);
536	if (ret < 0)
537		goto out;
538
539	extent_item_pos = swarn.logical - found_key.objectid;
540	swarn.extent_item_size = found_key.offset;
541
542	eb = path->nodes[0];
543	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
544	item_size = btrfs_item_size_nr(eb, path->slots[0]);
545
546	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
547		do {
548			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
549							&ref_root, &ref_level);
550			printk_in_rcu(KERN_WARNING
551				"btrfs: %s at logical %llu on dev %s, "
552				"sector %llu: metadata %s (level %d) in tree "
553				"%llu\n", errstr, swarn.logical,
554				rcu_str_deref(dev->name),
555				(unsigned long long)swarn.sector,
556				ref_level ? "node" : "leaf",
557				ret < 0 ? -1 : ref_level,
558				ret < 0 ? -1 : ref_root);
559		} while (ret != 1);
560		btrfs_release_path(path);
561	} else {
562		btrfs_release_path(path);
563		swarn.path = path;
564		swarn.dev = dev;
565		iterate_extent_inodes(fs_info, found_key.objectid,
566					extent_item_pos, 1,
567					scrub_print_warning_inode, &swarn);
568	}
569
570out:
571	btrfs_free_path(path);
572	kfree(swarn.scratch_buf);
573	kfree(swarn.msg_buf);
574}
575
576static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
577{
578	struct page *page = NULL;
579	unsigned long index;
580	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
581	int ret;
582	int corrected = 0;
583	struct btrfs_key key;
584	struct inode *inode = NULL;
585	struct btrfs_fs_info *fs_info;
586	u64 end = offset + PAGE_SIZE - 1;
587	struct btrfs_root *local_root;
588	int srcu_index;
589
590	key.objectid = root;
591	key.type = BTRFS_ROOT_ITEM_KEY;
592	key.offset = (u64)-1;
593
594	fs_info = fixup->root->fs_info;
595	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
596
597	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
598	if (IS_ERR(local_root)) {
599		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
600		return PTR_ERR(local_root);
601	}
602
603	key.type = BTRFS_INODE_ITEM_KEY;
604	key.objectid = inum;
605	key.offset = 0;
606	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
607	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
608	if (IS_ERR(inode))
609		return PTR_ERR(inode);
610
611	index = offset >> PAGE_CACHE_SHIFT;
612
613	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
614	if (!page) {
615		ret = -ENOMEM;
616		goto out;
617	}
618
619	if (PageUptodate(page)) {
620		if (PageDirty(page)) {
621			/*
622			 * we need to write the data to the defective sector. the
623			 * data that was in that sector is not in memory,
624			 * because the page was modified. we must not write the
625			 * modified page to that sector.
626			 *
627			 * TODO: what could be done here: wait for the delalloc
628			 *       runner to write out that page (might involve
629			 *       COW) and see whether the sector is still
630			 *       referenced afterwards.
631			 *
632			 * For the time being, we treat this error as
633			 * uncorrectable, although there is a chance that a
634			 * later scrub will find the bad sector again when
635			 * there is no dirty page in memory.
636			 */
637			ret = -EIO;
638			goto out;
639		}
640		fs_info = BTRFS_I(inode)->root->fs_info;
641		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
642					fixup->logical, page,
643					fixup->mirror_num);
644		unlock_page(page);
645		corrected = !ret;
646	} else {
647		/*
648		 * we need to get good data first. the general readpage path
649		 * will call repair_io_failure for us, we just have to make
650		 * sure we read the bad mirror.
651		 */
652		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
653					EXTENT_DAMAGED, GFP_NOFS);
654		if (ret) {
655			/* set_extent_bits should give proper error */
656			WARN_ON(ret > 0);
657			if (ret > 0)
658				ret = -EFAULT;
659			goto out;
660		}
661
662		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
663						btrfs_get_extent,
664						fixup->mirror_num);
665		wait_on_page_locked(page);
666
667		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
668						end, EXTENT_DAMAGED, 0, NULL);
669		if (!corrected)
670			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
671						EXTENT_DAMAGED, GFP_NOFS);
672	}
673
674out:
675	if (page)
676		put_page(page);
677	if (inode)
678		iput(inode);
679
680	if (ret < 0)
681		return ret;
682
683	if (ret == 0 && corrected) {
684		/*
685		 * we only need to call readpage for one of the inodes belonging
686		 * to this extent. so make iterate_extent_inodes stop
687		 */
688		return 1;
689	}
690
691	return -EIO;
692}
693
694static void scrub_fixup_nodatasum(struct btrfs_work *work)
695{
696	int ret;
697	struct scrub_fixup_nodatasum *fixup;
698	struct scrub_ctx *sctx;
699	struct btrfs_trans_handle *trans = NULL;
700	struct btrfs_fs_info *fs_info;
701	struct btrfs_path *path;
702	int uncorrectable = 0;
703
704	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
705	sctx = fixup->sctx;
706	fs_info = fixup->root->fs_info;
707
708	path = btrfs_alloc_path();
709	if (!path) {
710		spin_lock(&sctx->stat_lock);
711		++sctx->stat.malloc_errors;
712		spin_unlock(&sctx->stat_lock);
713		uncorrectable = 1;
714		goto out;
715	}
716
717	trans = btrfs_join_transaction(fixup->root);
718	if (IS_ERR(trans)) {
719		uncorrectable = 1;
720		goto out;
721	}
722
723	/*
724	 * the idea is to trigger a regular read through the standard path. we
725	 * read a page from the (failed) logical address by specifying the
726	 * corresponding copy number of the failed sector. thus, that readpage
727	 * is expected to fail.
728	 * that is the point where on-the-fly error correction kicks in
729	 * (once the read has finished) and rewrites the failed sector if a
730	 * good copy can be found.
731	 */
732	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
733						path, scrub_fixup_readpage,
734						fixup);
735	if (ret < 0) {
736		uncorrectable = 1;
737		goto out;
738	}
739	WARN_ON(ret != 1);
740
741	spin_lock(&sctx->stat_lock);
742	++sctx->stat.corrected_errors;
743	spin_unlock(&sctx->stat_lock);
744
745out:
746	if (trans && !IS_ERR(trans))
747		btrfs_end_transaction(trans, fixup->root);
748	if (uncorrectable) {
749		spin_lock(&sctx->stat_lock);
750		++sctx->stat.uncorrectable_errors;
751		spin_unlock(&sctx->stat_lock);
752		btrfs_dev_replace_stats_inc(
753			&sctx->dev_root->fs_info->dev_replace.
754			num_uncorrectable_read_errors);
755		printk_ratelimited_in_rcu(KERN_ERR
756			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
757			(unsigned long long)fixup->logical,
758			rcu_str_deref(fixup->dev->name));
759	}
760
761	btrfs_free_path(path);
762	kfree(fixup);
763
764	scrub_pending_trans_workers_dec(sctx);
765}
766
767/*
768 * scrub_handle_errored_block gets called when either verification of the
769 * pages failed or the bio failed to read, e.g. with EIO. In the latter
770 * case, this function handles all pages in the bio, even though only one
771 * may be bad.
772 * The goal of this function is to repair the errored block by using the
773 * contents of one of the mirrors.
774 */
775static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
776{
777	struct scrub_ctx *sctx = sblock_to_check->sctx;
778	struct btrfs_device *dev;
779	struct btrfs_fs_info *fs_info;
780	u64 length;
781	u64 logical;
782	u64 generation;
783	unsigned int failed_mirror_index;
784	unsigned int is_metadata;
785	unsigned int have_csum;
786	u8 *csum;
787	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
788	struct scrub_block *sblock_bad;
789	int ret;
790	int mirror_index;
791	int page_num;
792	int success;
793	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
794				      DEFAULT_RATELIMIT_BURST);
795
796	BUG_ON(sblock_to_check->page_count < 1);
797	fs_info = sctx->dev_root->fs_info;
798	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
799		/*
800		 * if we find an error in a super block, we just report it.
801		 * Super blocks are rewritten with the next transaction commit
802		 * anyway.
803		 */
804		spin_lock(&sctx->stat_lock);
805		++sctx->stat.super_errors;
806		spin_unlock(&sctx->stat_lock);
807		return 0;
808	}
809	length = sblock_to_check->page_count * PAGE_SIZE;
810	logical = sblock_to_check->pagev[0]->logical;
811	generation = sblock_to_check->pagev[0]->generation;
812	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
813	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
814	is_metadata = !(sblock_to_check->pagev[0]->flags &
815			BTRFS_EXTENT_FLAG_DATA);
816	have_csum = sblock_to_check->pagev[0]->have_csum;
817	csum = sblock_to_check->pagev[0]->csum;
818	dev = sblock_to_check->pagev[0]->dev;
819
820	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
821		sblocks_for_recheck = NULL;
822		goto nodatasum_case;
823	}
824
825	/*
826	 * read all mirrors one after the other. This includes re-reading
827	 * the extent or metadata block that failed (which is the reason
828	 * this fixup code is called), this time page by page, in order
829	 * to know which pages caused I/O errors and which ones are good
830	 * (for all mirrors).
831	 * The goal is to handle the situation when more than one
832	 * mirror contains I/O errors, but the errors do not
833	 * overlap, i.e. the data can be repaired by selecting the
834	 * pages from those mirrors without I/O error on the
835	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
836	 * would be that mirror #1 has an I/O error on the first page,
837	 * the second page is good, and mirror #2 has an I/O error on
838	 * the second page, but the first page is good.
839	 * Then the first page of the first mirror can be repaired by
840	 * taking the first page of the second mirror, and the
841	 * second page of the second mirror can be repaired by
842	 * copying the contents of the 2nd page of the 1st mirror.
843	 * One more note: if the pages of one mirror contain I/O
844	 * errors, the checksum cannot be verified. In order to get
845	 * the best data for repairing, the first attempt is to find
846	 * a mirror without I/O errors and with a validated checksum.
847	 * Only if this is not possible are the pages picked from
848	 * mirrors with I/O errors without considering the checksum.
849	 * If the latter is the case, at the end, the checksum of the
850	 * repaired area is verified in order to correctly maintain
851	 * the statistics.
852	 */
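	/*
	 * Illustrative example (added commentary, not in the original) of the
	 * non-overlapping error case described above, for a block of 2 pages:
	 *
	 *                  page 0       page 1
	 *   mirror #1      I/O error    good
	 *   mirror #2      good         I/O error
	 *
	 * The repaired block takes page 0 from mirror #2 and page 1 from
	 * mirror #1.
	 */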
853
854	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
855				     sizeof(*sblocks_for_recheck),
856				     GFP_NOFS);
857	if (!sblocks_for_recheck) {
858		spin_lock(&sctx->stat_lock);
859		sctx->stat.malloc_errors++;
860		sctx->stat.read_errors++;
861		sctx->stat.uncorrectable_errors++;
862		spin_unlock(&sctx->stat_lock);
863		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
864		goto out;
865	}
866
867	/* setup the context, map the logical blocks and alloc the pages */
868	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
869					logical, sblocks_for_recheck);
870	if (ret) {
871		spin_lock(&sctx->stat_lock);
872		sctx->stat.read_errors++;
873		sctx->stat.uncorrectable_errors++;
874		spin_unlock(&sctx->stat_lock);
875		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
876		goto out;
877	}
878	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
879	sblock_bad = sblocks_for_recheck + failed_mirror_index;
880
881	/* build and submit the bios for the failed mirror, check checksums */
882	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
883			    csum, generation, sctx->csum_size);
884
885	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
886	    sblock_bad->no_io_error_seen) {
887		/*
888		 * the error disappeared after reading page by page, or
889		 * the area was part of a huge bio and other parts of the
890		 * bio caused I/O errors, or the block layer merged several
891		 * read requests into one and the error is caused by a
892		 * different bio (usually one of the two latter cases is
893		 * the cause)
894		 */
895		spin_lock(&sctx->stat_lock);
896		sctx->stat.unverified_errors++;
897		spin_unlock(&sctx->stat_lock);
898
899		if (sctx->is_dev_replace)
900			scrub_write_block_to_dev_replace(sblock_bad);
901		goto out;
902	}
903
904	if (!sblock_bad->no_io_error_seen) {
905		spin_lock(&sctx->stat_lock);
906		sctx->stat.read_errors++;
907		spin_unlock(&sctx->stat_lock);
908		if (__ratelimit(&_rs))
909			scrub_print_warning("i/o error", sblock_to_check);
910		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
911	} else if (sblock_bad->checksum_error) {
912		spin_lock(&sctx->stat_lock);
913		sctx->stat.csum_errors++;
914		spin_unlock(&sctx->stat_lock);
915		if (__ratelimit(&_rs))
916			scrub_print_warning("checksum error", sblock_to_check);
917		btrfs_dev_stat_inc_and_print(dev,
918					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
919	} else if (sblock_bad->header_error) {
920		spin_lock(&sctx->stat_lock);
921		sctx->stat.verify_errors++;
922		spin_unlock(&sctx->stat_lock);
923		if (__ratelimit(&_rs))
924			scrub_print_warning("checksum/header error",
925					    sblock_to_check);
926		if (sblock_bad->generation_error)
927			btrfs_dev_stat_inc_and_print(dev,
928				BTRFS_DEV_STAT_GENERATION_ERRS);
929		else
930			btrfs_dev_stat_inc_and_print(dev,
931				BTRFS_DEV_STAT_CORRUPTION_ERRS);
932	}
933
934	if (sctx->readonly && !sctx->is_dev_replace)
935		goto did_not_correct_error;
936
937	if (!is_metadata && !have_csum) {
938		struct scrub_fixup_nodatasum *fixup_nodatasum;
939
940nodatasum_case:
941		WARN_ON(sctx->is_dev_replace);
942
943		/*
944		 * !is_metadata and !have_csum, this means that the data
945		 * might not be COW'ed, that it might be modified
946		 * concurrently. The general strategy of working on the
947		 * commit root does not help when COW is not
948		 * used.
949		 */
950		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
951		if (!fixup_nodatasum)
952			goto did_not_correct_error;
953		fixup_nodatasum->sctx = sctx;
954		fixup_nodatasum->dev = dev;
955		fixup_nodatasum->logical = logical;
956		fixup_nodatasum->root = fs_info->extent_root;
957		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
958		scrub_pending_trans_workers_inc(sctx);
959		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
960		btrfs_queue_worker(&fs_info->scrub_workers,
961				   &fixup_nodatasum->work);
962		goto out;
963	}
964
965	/*
966	 * now build and submit the bios for the other mirrors, check
967	 * checksums.
968	 * First try to pick the mirror which is completely without I/O
969	 * errors and also does not have a checksum error.
970	 * If one is found, and if a checksum is present, the full block
971	 * that is known to contain an error is rewritten. Afterwards
972	 * the block is known to be corrected.
973	 * If a mirror is found which is completely correct, and no
974	 * checksum is present, only those pages are rewritten that had
975	 * an I/O error in the block to be repaired, since it cannot be
976	 * determined which copy of the other pages is better (and it
977	 * could happen otherwise that a correct page would be
978	 * overwritten by a bad one).
979	 */
980	for (mirror_index = 0;
981	     mirror_index < BTRFS_MAX_MIRRORS &&
982	     sblocks_for_recheck[mirror_index].page_count > 0;
983	     mirror_index++) {
984		struct scrub_block *sblock_other;
985
986		if (mirror_index == failed_mirror_index)
987			continue;
988		sblock_other = sblocks_for_recheck + mirror_index;
989
990		/* build and submit the bios, check checksums */
991		scrub_recheck_block(fs_info, sblock_other, is_metadata,
992				    have_csum, csum, generation,
993				    sctx->csum_size);
994
995		if (!sblock_other->header_error &&
996		    !sblock_other->checksum_error &&
997		    sblock_other->no_io_error_seen) {
998			if (sctx->is_dev_replace) {
999				scrub_write_block_to_dev_replace(sblock_other);
1000			} else {
1001				int force_write = is_metadata || have_csum;
1002
1003				ret = scrub_repair_block_from_good_copy(
1004						sblock_bad, sblock_other,
1005						force_write);
1006			}
1007			if (0 == ret)
1008				goto corrected_error;
1009		}
1010	}
1011
1012	/*
1013	 * for dev_replace, pick good pages and write to the target device.
1014	 */
1015	if (sctx->is_dev_replace) {
1016		success = 1;
1017		for (page_num = 0; page_num < sblock_bad->page_count;
1018		     page_num++) {
1019			int sub_success;
1020
1021			sub_success = 0;
1022			for (mirror_index = 0;
1023			     mirror_index < BTRFS_MAX_MIRRORS &&
1024			     sblocks_for_recheck[mirror_index].page_count > 0;
1025			     mirror_index++) {
1026				struct scrub_block *sblock_other =
1027					sblocks_for_recheck + mirror_index;
1028				struct scrub_page *page_other =
1029					sblock_other->pagev[page_num];
1030
1031				if (!page_other->io_error) {
1032					ret = scrub_write_page_to_dev_replace(
1033							sblock_other, page_num);
1034					if (ret == 0) {
1035						/* succeeded for this page */
1036						sub_success = 1;
1037						break;
1038					} else {
1039						btrfs_dev_replace_stats_inc(
1040							&sctx->dev_root->
1041							fs_info->dev_replace.
1042							num_write_errors);
1043					}
1044				}
1045			}
1046
1047			if (!sub_success) {
1048				/*
1049				 * did not find a mirror to fetch the page
1050				 * from. scrub_write_page_to_dev_replace()
1051				 * handles this case (page->io_error) by
1052				 * filling the block with zeros before
1053				 * submitting the write request
1054				 */
1055				success = 0;
1056				ret = scrub_write_page_to_dev_replace(
1057						sblock_bad, page_num);
1058				if (ret)
1059					btrfs_dev_replace_stats_inc(
1060						&sctx->dev_root->fs_info->
1061						dev_replace.num_write_errors);
1062			}
1063		}
1064
1065		goto out;
1066	}
1067
1068	/*
1069	 * for regular scrub, repair those pages that are errored.
1070	 * In case of I/O errors in the area that is supposed to be
1071	 * repaired, continue by picking good copies of those pages.
1072	 * Select the good pages from mirrors to rewrite bad pages from
1073	 * the area to fix. Afterwards verify the checksum of the block
1074	 * that is supposed to be repaired. This verification step is
1075	 * only done for the purpose of statistics counting and for the
1076	 * final scrub report on whether errors remain.
1077	 * A perfect algorithm could make use of the checksum and try
1078	 * all possible combinations of pages from the different mirrors
1079	 * until the checksum verification succeeds. For example, when
1080	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1081	 * of mirror #2 is readable but the final checksum test fails,
1082	 * then the 2nd page of mirror #3 could be tried, to see whether
1083	 * the final checksum then succeeds. But this would be a rare
1084	 * exception and is therefore not implemented. At least this
1085	 * avoids overwriting a good copy.
1086	 * A more useful improvement would be to pick the sectors
1087	 * without I/O error based on sector sizes (512 bytes on legacy
1088	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1089	 * mirror could be repaired by taking 512 bytes of a different
1090	 * mirror, even if other 512-byte sectors in the same PAGE_SIZE
1091	 * area are unreadable.
1092	 */
1093
1094	/* can only fix I/O errors from here on */
1095	if (sblock_bad->no_io_error_seen)
1096		goto did_not_correct_error;
1097
1098	success = 1;
1099	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1100		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1101
1102		if (!page_bad->io_error)
1103			continue;
1104
1105		for (mirror_index = 0;
1106		     mirror_index < BTRFS_MAX_MIRRORS &&
1107		     sblocks_for_recheck[mirror_index].page_count > 0;
1108		     mirror_index++) {
1109			struct scrub_block *sblock_other = sblocks_for_recheck +
1110							   mirror_index;
1111			struct scrub_page *page_other = sblock_other->pagev[
1112							page_num];
1113
1114			if (!page_other->io_error) {
1115				ret = scrub_repair_page_from_good_copy(
1116					sblock_bad, sblock_other, page_num, 0);
1117				if (0 == ret) {
1118					page_bad->io_error = 0;
1119					break; /* succeeded for this page */
1120				}
1121			}
1122		}
1123
1124		if (page_bad->io_error) {
1125			/* did not find a mirror to copy the page from */
1126			success = 0;
1127		}
1128	}
1129
1130	if (success) {
1131		if (is_metadata || have_csum) {
1132			/*
1133			 * need to verify the checksum now that all
1134			 * sectors on disk are repaired (the write
1135			 * request for data to be repaired is on its way).
1136			 * Just be lazy and use scrub_recheck_block()
1137			 * which re-reads the data before the checksum
1138			 * is verified, but most likely the data comes out
1139			 * of the page cache.
1140			 */
1141			scrub_recheck_block(fs_info, sblock_bad,
1142					    is_metadata, have_csum, csum,
1143					    generation, sctx->csum_size);
1144			if (!sblock_bad->header_error &&
1145			    !sblock_bad->checksum_error &&
1146			    sblock_bad->no_io_error_seen)
1147				goto corrected_error;
1148			else
1149				goto did_not_correct_error;
1150		} else {
1151corrected_error:
1152			spin_lock(&sctx->stat_lock);
1153			sctx->stat.corrected_errors++;
1154			spin_unlock(&sctx->stat_lock);
1155			printk_ratelimited_in_rcu(KERN_ERR
1156				"btrfs: fixed up error at logical %llu on dev %s\n",
1157				(unsigned long long)logical,
1158				rcu_str_deref(dev->name));
1159		}
1160	} else {
1161did_not_correct_error:
1162		spin_lock(&sctx->stat_lock);
1163		sctx->stat.uncorrectable_errors++;
1164		spin_unlock(&sctx->stat_lock);
1165		printk_ratelimited_in_rcu(KERN_ERR
1166			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
1167			(unsigned long long)logical,
1168			rcu_str_deref(dev->name));
1169	}
1170
1171out:
1172	if (sblocks_for_recheck) {
1173		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1174		     mirror_index++) {
1175			struct scrub_block *sblock = sblocks_for_recheck +
1176						     mirror_index;
1177			int page_index;
1178
1179			for (page_index = 0; page_index < sblock->page_count;
1180			     page_index++) {
1181				sblock->pagev[page_index]->sblock = NULL;
1182				scrub_page_put(sblock->pagev[page_index]);
1183			}
1184		}
1185		kfree(sblocks_for_recheck);
1186	}
1187
1188	return 0;
1189}
1190
1191static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1192				     struct btrfs_fs_info *fs_info,
1193				     struct scrub_block *original_sblock,
1194				     u64 length, u64 logical,
1195				     struct scrub_block *sblocks_for_recheck)
1196{
1197	int page_index;
1198	int mirror_index;
1199	int ret;
1200
1201	/*
1202	 * note: the two members ref_count and outstanding_pages
1203	 * are not used (and not set) in the blocks that are used for
1204	 * the recheck procedure
1205	 */
1206
1207	page_index = 0;
1208	while (length > 0) {
1209		u64 sublen = min_t(u64, length, PAGE_SIZE);
1210		u64 mapped_length = sublen;
1211		struct btrfs_bio *bbio = NULL;
1212
1213		/*
1214		 * with a length of PAGE_SIZE, each returned stripe
1215		 * represents one mirror
1216		 */
1217		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1218				      &mapped_length, &bbio, 0);
1219		if (ret || !bbio || mapped_length < sublen) {
1220			kfree(bbio);
1221			return -EIO;
1222		}
1223
1224		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1225		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1226		     mirror_index++) {
1227			struct scrub_block *sblock;
1228			struct scrub_page *page;
1229
1230			if (mirror_index >= BTRFS_MAX_MIRRORS)
1231				continue;
1232
1233			sblock = sblocks_for_recheck + mirror_index;
1234			sblock->sctx = sctx;
1235			page = kzalloc(sizeof(*page), GFP_NOFS);
1236			if (!page) {
1237leave_nomem:
1238				spin_lock(&sctx->stat_lock);
1239				sctx->stat.malloc_errors++;
1240				spin_unlock(&sctx->stat_lock);
1241				kfree(bbio);
1242				return -ENOMEM;
1243			}
1244			scrub_page_get(page);
1245			sblock->pagev[page_index] = page;
1246			page->logical = logical;
1247			page->physical = bbio->stripes[mirror_index].physical;
1248			BUG_ON(page_index >= original_sblock->page_count);
1249			page->physical_for_dev_replace =
1250				original_sblock->pagev[page_index]->
1251				physical_for_dev_replace;
1252			/* for missing devices, dev->bdev is NULL */
1253			page->dev = bbio->stripes[mirror_index].dev;
1254			page->mirror_num = mirror_index + 1;
1255			sblock->page_count++;
1256			page->page = alloc_page(GFP_NOFS);
1257			if (!page->page)
1258				goto leave_nomem;
1259		}
1260		kfree(bbio);
1261		length -= sublen;
1262		logical += sublen;
1263		page_index++;
1264	}
1265
1266	return 0;
1267}
1268
1269/*
1270 * this function will check the on disk data for checksum errors, header
1271 * errors and read I/O errors. If any I/O errors happen, the exact pages
1272 * which are errored are marked as being bad. The goal is to enable scrub
1273 * to take those pages that are not errored from all the mirrors so that
1274 * the pages that are errored in the just handled mirror can be repaired.
1275 */
1276static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1277				struct scrub_block *sblock, int is_metadata,
1278				int have_csum, u8 *csum, u64 generation,
1279				u16 csum_size)
1280{
1281	int page_num;
1282
1283	sblock->no_io_error_seen = 1;
1284	sblock->header_error = 0;
1285	sblock->checksum_error = 0;
1286
1287	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1288		struct bio *bio;
1289		struct scrub_page *page = sblock->pagev[page_num];
1290		DECLARE_COMPLETION_ONSTACK(complete);
1291
1292		if (page->dev->bdev == NULL) {
1293			page->io_error = 1;
1294			sblock->no_io_error_seen = 0;
1295			continue;
1296		}
1297
1298		WARN_ON(!page->page);
1299		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1300		if (!bio) {
1301			page->io_error = 1;
1302			sblock->no_io_error_seen = 0;
1303			continue;
1304		}
1305		bio->bi_bdev = page->dev->bdev;
1306		bio->bi_sector = page->physical >> 9;
1307		bio->bi_end_io = scrub_complete_bio_end_io;
1308		bio->bi_private = &complete;
1309
1310		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1311		btrfsic_submit_bio(READ, bio);
1312
1313		/* this will also unplug the queue */
1314		wait_for_completion(&complete);
1315
1316		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1317		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1318			sblock->no_io_error_seen = 0;
1319		bio_put(bio);
1320	}
1321
1322	if (sblock->no_io_error_seen)
1323		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1324					     have_csum, csum, generation,
1325					     csum_size);
1326
1327	return;
1328}
1329
1330static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1331					 struct scrub_block *sblock,
1332					 int is_metadata, int have_csum,
1333					 const u8 *csum, u64 generation,
1334					 u16 csum_size)
1335{
1336	int page_num;
1337	u8 calculated_csum[BTRFS_CSUM_SIZE];
1338	u32 crc = ~(u32)0;
1339	void *mapped_buffer;
1340
1341	WARN_ON(!sblock->pagev[0]->page);
1342	if (is_metadata) {
1343		struct btrfs_header *h;
1344
1345		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1346		h = (struct btrfs_header *)mapped_buffer;
1347
1348		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1349		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1350		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1351			   BTRFS_UUID_SIZE)) {
1352			sblock->header_error = 1;
1353		} else if (generation != btrfs_stack_header_generation(h)) {
1354			sblock->header_error = 1;
1355			sblock->generation_error = 1;
1356		}
1357		csum = h->csum;
1358	} else {
1359		if (!have_csum)
1360			return;
1361
1362		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1363	}
1364
1365	for (page_num = 0;;) {
1366		if (page_num == 0 && is_metadata)
1367			crc = btrfs_csum_data(
1368				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1369				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1370		else
1371			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1372
1373		kunmap_atomic(mapped_buffer);
1374		page_num++;
1375		if (page_num >= sblock->page_count)
1376			break;
1377		WARN_ON(!sblock->pagev[page_num]->page);
1378
1379		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1380	}
1381
1382	btrfs_csum_final(crc, calculated_csum);
1383	if (memcmp(calculated_csum, csum, csum_size))
1384		sblock->checksum_error = 1;
1385}
1386
1387static void scrub_complete_bio_end_io(struct bio *bio, int err)
1388{
1389	complete((struct completion *)bio->bi_private);
1390}
1391
1392static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1393					     struct scrub_block *sblock_good,
1394					     int force_write)
1395{
1396	int page_num;
1397	int ret = 0;
1398
1399	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1400		int ret_sub;
1401
1402		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1403							   sblock_good,
1404							   page_num,
1405							   force_write);
1406		if (ret_sub)
1407			ret = ret_sub;
1408	}
1409
1410	return ret;
1411}
1412
1413static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1414					    struct scrub_block *sblock_good,
1415					    int page_num, int force_write)
1416{
1417	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1418	struct scrub_page *page_good = sblock_good->pagev[page_num];
1419
1420	BUG_ON(page_bad->page == NULL);
1421	BUG_ON(page_good->page == NULL);
1422	if (force_write || sblock_bad->header_error ||
1423	    sblock_bad->checksum_error || page_bad->io_error) {
1424		struct bio *bio;
1425		int ret;
1426		DECLARE_COMPLETION_ONSTACK(complete);
1427
1428		if (!page_bad->dev->bdev) {
1429			printk_ratelimited(KERN_WARNING
1430				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1431			return -EIO;
1432		}
1433
1434		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1435		if (!bio)
1436			return -EIO;
1437		bio->bi_bdev = page_bad->dev->bdev;
1438		bio->bi_sector = page_bad->physical >> 9;
1439		bio->bi_end_io = scrub_complete_bio_end_io;
1440		bio->bi_private = &complete;
1441
1442		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1443		if (PAGE_SIZE != ret) {
1444			bio_put(bio);
1445			return -EIO;
1446		}
1447		btrfsic_submit_bio(WRITE, bio);
1448
1449		/* this will also unplug the queue */
1450		wait_for_completion(&complete);
1451		if (!bio_flagged(bio, BIO_UPTODATE)) {
1452			btrfs_dev_stat_inc_and_print(page_bad->dev,
1453				BTRFS_DEV_STAT_WRITE_ERRS);
1454			btrfs_dev_replace_stats_inc(
1455				&sblock_bad->sctx->dev_root->fs_info->
1456				dev_replace.num_write_errors);
1457			bio_put(bio);
1458			return -EIO;
1459		}
1460		bio_put(bio);
1461	}
1462
1463	return 0;
1464}
1465
1466static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1467{
1468	int page_num;
1469
1470	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1471		int ret;
1472
1473		ret = scrub_write_page_to_dev_replace(sblock, page_num);
1474		if (ret)
1475			btrfs_dev_replace_stats_inc(
1476				&sblock->sctx->dev_root->fs_info->dev_replace.
1477				num_write_errors);
1478	}
1479}
1480
1481static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1482					   int page_num)
1483{
1484	struct scrub_page *spage = sblock->pagev[page_num];
1485
1486	BUG_ON(spage->page == NULL);
1487	if (spage->io_error) {
1488		void *mapped_buffer = kmap_atomic(spage->page);
1489
1490		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1491		flush_dcache_page(spage->page);
1492		kunmap_atomic(mapped_buffer);
1493	}
1494	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1495}
1496
1497static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1498				    struct scrub_page *spage)
1499{
1500	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1501	struct scrub_bio *sbio;
1502	int ret;
1503
1504	mutex_lock(&wr_ctx->wr_lock);
1505again:
1506	if (!wr_ctx->wr_curr_bio) {
1507		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1508					      GFP_NOFS);
1509		if (!wr_ctx->wr_curr_bio) {
1510			mutex_unlock(&wr_ctx->wr_lock);
1511			return -ENOMEM;
1512		}
1513		wr_ctx->wr_curr_bio->sctx = sctx;
1514		wr_ctx->wr_curr_bio->page_count = 0;
1515	}
1516	sbio = wr_ctx->wr_curr_bio;
1517	if (sbio->page_count == 0) {
1518		struct bio *bio;
1519
1520		sbio->physical = spage->physical_for_dev_replace;
1521		sbio->logical = spage->logical;
1522		sbio->dev = wr_ctx->tgtdev;
1523		bio = sbio->bio;
1524		if (!bio) {
1525			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1526			if (!bio) {
1527				mutex_unlock(&wr_ctx->wr_lock);
1528				return -ENOMEM;
1529			}
1530			sbio->bio = bio;
1531		}
1532
1533		bio->bi_private = sbio;
1534		bio->bi_end_io = scrub_wr_bio_end_io;
1535		bio->bi_bdev = sbio->dev->bdev;
1536		bio->bi_sector = sbio->physical >> 9;
1537		sbio->err = 0;
1538	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1539		   spage->physical_for_dev_replace ||
1540		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1541		   spage->logical) {
1542		scrub_wr_submit(sctx);
1543		goto again;
1544	}
1545
1546	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1547	if (ret != PAGE_SIZE) {
1548		if (sbio->page_count < 1) {
1549			bio_put(sbio->bio);
1550			sbio->bio = NULL;
1551			mutex_unlock(&wr_ctx->wr_lock);
1552			return -EIO;
1553		}
1554		scrub_wr_submit(sctx);
1555		goto again;
1556	}
1557
1558	sbio->pagev[sbio->page_count] = spage;
1559	scrub_page_get(spage);
1560	sbio->page_count++;
1561	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1562		scrub_wr_submit(sctx);
1563	mutex_unlock(&wr_ctx->wr_lock);
1564
1565	return 0;
1566}
1567
1568static void scrub_wr_submit(struct scrub_ctx *sctx)
1569{
1570	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1571	struct scrub_bio *sbio;
1572
1573	if (!wr_ctx->wr_curr_bio)
1574		return;
1575
1576	sbio = wr_ctx->wr_curr_bio;
1577	wr_ctx->wr_curr_bio = NULL;
1578	WARN_ON(!sbio->bio->bi_bdev);
1579	scrub_pending_bio_inc(sctx);
1580	/* process all writes in a single worker thread. Then the block layer
1581	 * orders the requests before sending them to the driver, which
1582	 * doubled the write performance on spinning disks when measured
1583	 * with Linux 3.5 */
1584	btrfsic_submit_bio(WRITE, sbio->bio);
1585}
1586
1587static void scrub_wr_bio_end_io(struct bio *bio, int err)
1588{
1589	struct scrub_bio *sbio = bio->bi_private;
1590	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1591
1592	sbio->err = err;
1593	sbio->bio = bio;
1594
1595	sbio->work.func = scrub_wr_bio_end_io_worker;
1596	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1597}
1598
1599static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1600{
1601	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1602	struct scrub_ctx *sctx = sbio->sctx;
1603	int i;
1604
1605	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1606	if (sbio->err) {
1607		struct btrfs_dev_replace *dev_replace =
1608			&sbio->sctx->dev_root->fs_info->dev_replace;
1609
1610		for (i = 0; i < sbio->page_count; i++) {
1611			struct scrub_page *spage = sbio->pagev[i];
1612
1613			spage->io_error = 1;
1614			btrfs_dev_replace_stats_inc(&dev_replace->
1615						    num_write_errors);
1616		}
1617	}
1618
1619	for (i = 0; i < sbio->page_count; i++)
1620		scrub_page_put(sbio->pagev[i]);
1621
1622	bio_put(sbio->bio);
1623	kfree(sbio);
1624	scrub_pending_bio_dec(sctx);
1625}
1626
1627static int scrub_checksum(struct scrub_block *sblock)
1628{
1629	u64 flags;
1630	int ret;
1631
1632	WARN_ON(sblock->page_count < 1);
1633	flags = sblock->pagev[0]->flags;
1634	ret = 0;
1635	if (flags & BTRFS_EXTENT_FLAG_DATA)
1636		ret = scrub_checksum_data(sblock);
1637	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1638		ret = scrub_checksum_tree_block(sblock);
1639	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1640		(void)scrub_checksum_super(sblock);
1641	else
1642		WARN_ON(1);
1643	if (ret)
1644		scrub_handle_errored_block(sblock);
1645
1646	return ret;
1647}
1648
1649static int scrub_checksum_data(struct scrub_block *sblock)
1650{
1651	struct scrub_ctx *sctx = sblock->sctx;
1652	u8 csum[BTRFS_CSUM_SIZE];
1653	u8 *on_disk_csum;
1654	struct page *page;
1655	void *buffer;
1656	u32 crc = ~(u32)0;
1657	int fail = 0;
1658	u64 len;
1659	int index;
1660
1661	BUG_ON(sblock->page_count < 1);
1662	if (!sblock->pagev[0]->have_csum)
1663		return 0;
1664
1665	on_disk_csum = sblock->pagev[0]->csum;
1666	page = sblock->pagev[0]->page;
1667	buffer = kmap_atomic(page);
1668
1669	len = sctx->sectorsize;
1670	index = 0;
1671	for (;;) {
1672		u64 l = min_t(u64, len, PAGE_SIZE);
1673
1674		crc = btrfs_csum_data(buffer, crc, l);
1675		kunmap_atomic(buffer);
1676		len -= l;
1677		if (len == 0)
1678			break;
1679		index++;
1680		BUG_ON(index >= sblock->page_count);
1681		BUG_ON(!sblock->pagev[index]->page);
1682		page = sblock->pagev[index]->page;
1683		buffer = kmap_atomic(page);
1684	}
1685
1686	btrfs_csum_final(crc, csum);
1687	if (memcmp(csum, on_disk_csum, sctx->csum_size))
1688		fail = 1;
1689
1690	return fail;
1691}
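/*
 * Condensed sketch (added commentary, not part of the original file) of the
 * data checksum verification above for the common case where the sector
 * fits into a single page:
 */
#if 0
	u8 result[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	void *buf = kmap_atomic(spage->page);

	crc = btrfs_csum_data(buf, crc, sctx->sectorsize);
	kunmap_atomic(buf);
	btrfs_csum_final(crc, result);
	/* a non-zero memcmp means the sector does not match its stored csum */
	fail = memcmp(result, spage->csum, sctx->csum_size) != 0;
#endif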
1692
1693static int scrub_checksum_tree_block(struct scrub_block *sblock)
1694{
1695	struct scrub_ctx *sctx = sblock->sctx;
1696	struct btrfs_header *h;
1697	struct btrfs_root *root = sctx->dev_root;
1698	struct btrfs_fs_info *fs_info = root->fs_info;
1699	u8 calculated_csum[BTRFS_CSUM_SIZE];
1700	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1701	struct page *page;
1702	void *mapped_buffer;
1703	u64 mapped_size;
1704	void *p;
1705	u32 crc = ~(u32)0;
1706	int fail = 0;
1707	int crc_fail = 0;
1708	u64 len;
1709	int index;
1710
1711	BUG_ON(sblock->page_count < 1);
1712	page = sblock->pagev[0]->page;
1713	mapped_buffer = kmap_atomic(page);
1714	h = (struct btrfs_header *)mapped_buffer;
1715	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1716
1717	/*
1718	 * we don't use the getter functions here, as we
1719	 * a) don't have an extent buffer and
1720	 * b) the page is already kmapped
1721	 */
1722
1723	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1724		++fail;
1725
1726	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1727		++fail;
1728
1729	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1730		++fail;
1731
1732	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1733		   BTRFS_UUID_SIZE))
1734		++fail;
1735
1736	WARN_ON(sctx->nodesize != sctx->leafsize);
1737	len = sctx->nodesize - BTRFS_CSUM_SIZE;
1738	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1739	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1740	index = 0;
1741	for (;;) {
1742		u64 l = min_t(u64, len, mapped_size);
1743
1744		crc = btrfs_csum_data(p, crc, l);
1745		kunmap_atomic(mapped_buffer);
1746		len -= l;
1747		if (len == 0)
1748			break;
1749		index++;
1750		BUG_ON(index >= sblock->page_count);
1751		BUG_ON(!sblock->pagev[index]->page);
1752		page = sblock->pagev[index]->page;
1753		mapped_buffer = kmap_atomic(page);
1754		mapped_size = PAGE_SIZE;
1755		p = mapped_buffer;
1756	}
1757
1758	btrfs_csum_final(crc, calculated_csum);
1759	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1760		++crc_fail;
1761
1762	return fail || crc_fail;
1763}
1764
1765static int scrub_checksum_super(struct scrub_block *sblock)
1766{
1767	struct btrfs_super_block *s;
1768	struct scrub_ctx *sctx = sblock->sctx;
1769	struct btrfs_root *root = sctx->dev_root;
1770	struct btrfs_fs_info *fs_info = root->fs_info;
1771	u8 calculated_csum[BTRFS_CSUM_SIZE];
1772	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1773	struct page *page;
1774	void *mapped_buffer;
1775	u64 mapped_size;
1776	void *p;
1777	u32 crc = ~(u32)0;
1778	int fail_gen = 0;
1779	int fail_cor = 0;
1780	u64 len;
1781	int index;
1782
1783	BUG_ON(sblock->page_count < 1);
1784	page = sblock->pagev[0]->page;
1785	mapped_buffer = kmap_atomic(page);
1786	s = (struct btrfs_super_block *)mapped_buffer;
1787	memcpy(on_disk_csum, s->csum, sctx->csum_size);
1788
1789	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1790		++fail_cor;
1791
1792	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1793		++fail_gen;
1794
1795	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1796		++fail_cor;
1797
1798	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1799	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1800	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1801	index = 0;
1802	for (;;) {
1803		u64 l = min_t(u64, len, mapped_size);
1804
1805		crc = btrfs_csum_data(p, crc, l);
1806		kunmap_atomic(mapped_buffer);
1807		len -= l;
1808		if (len == 0)
1809			break;
1810		index++;
1811		BUG_ON(index >= sblock->page_count);
1812		BUG_ON(!sblock->pagev[index]->page);
1813		page = sblock->pagev[index]->page;
1814		mapped_buffer = kmap_atomic(page);
1815		mapped_size = PAGE_SIZE;
1816		p = mapped_buffer;
1817	}
1818
1819	btrfs_csum_final(crc, calculated_csum);
1820	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1821		++fail_cor;
1822
1823	if (fail_cor + fail_gen) {
1824		/*
1825		 * if we find an error in a super block, we just report it;
1826		 * the super blocks get rewritten with the next transaction
1827		 * commit anyway
1828		 */
1829		spin_lock(&sctx->stat_lock);
1830		++sctx->stat.super_errors;
1831		spin_unlock(&sctx->stat_lock);
1832		if (fail_cor)
1833			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1834				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1835		else
1836			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1837				BTRFS_DEV_STAT_GENERATION_ERRS);
1838	}
1839
1840	return fail_cor + fail_gen;
1841}
1842
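/*
 * scrub_block and scrub_page are reference counted; dropping the last
 * reference frees the structure (and, for a scrub_page, its backing
 * page).
 */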
1843static void scrub_block_get(struct scrub_block *sblock)
1844{
1845	atomic_inc(&sblock->ref_count);
1846}
1847
1848static void scrub_block_put(struct scrub_block *sblock)
1849{
1850	if (atomic_dec_and_test(&sblock->ref_count)) {
1851		int i;
1852
1853		for (i = 0; i < sblock->page_count; i++)
1854			scrub_page_put(sblock->pagev[i]);
1855		kfree(sblock);
1856	}
1857}
1858
1859static void scrub_page_get(struct scrub_page *spage)
1860{
1861	atomic_inc(&spage->ref_count);
1862}
1863
1864static void scrub_page_put(struct scrub_page *spage)
1865{
1866	if (atomic_dec_and_test(&spage->ref_count)) {
1867		if (spage->page)
1868			__free_page(spage->page);
1869		kfree(spage);
1870	}
1871}
1872
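/*
 * Submit the currently assembled read bio (if any) and account for it
 * as pending via scrub_pending_bio_inc(). Completion is handled by
 * scrub_bio_end_io().
 */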
1873static void scrub_submit(struct scrub_ctx *sctx)
1874{
1875	struct scrub_bio *sbio;
1876
1877	if (sctx->curr == -1)
1878		return;
1879
1880	sbio = sctx->bios[sctx->curr];
1881	sctx->curr = -1;
1882	scrub_pending_bio_inc(sctx);
1883
1884	if (!sbio->bio->bi_bdev) {
1885		/*
1886		 * this case should not happen. If btrfs_map_block() is
1887		 * wrong, it could happen for dev-replace operations on
1888		 * missing devices when no mirrors are available, but in
1889		 * that case the mount should already have failed.
1890		 * This case is handled correctly (but _very_ slowly).
1891		 */
1892		printk_ratelimited(KERN_WARNING
1893			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1894		bio_endio(sbio->bio, -EIO);
1895	} else {
1896		btrfsic_submit_bio(READ, sbio->bio);
1897	}
1898}
1899
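/*
 * Queue one page for reading: append it to the current read bio,
 * grabbing a free scrub_bio first if necessary. The bio is submitted
 * early whenever the next page is not physically/logically contiguous,
 * belongs to a different device, or the bio is full.
 */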
1900static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1901				    struct scrub_page *spage)
1902{
1903	struct scrub_block *sblock = spage->sblock;
1904	struct scrub_bio *sbio;
1905	int ret;
1906
1907again:
1908	/*
1909	 * grab a fresh bio or wait for one to become available
1910	 */
1911	while (sctx->curr == -1) {
1912		spin_lock(&sctx->list_lock);
1913		sctx->curr = sctx->first_free;
1914		if (sctx->curr != -1) {
1915			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1916			sctx->bios[sctx->curr]->next_free = -1;
1917			sctx->bios[sctx->curr]->page_count = 0;
1918			spin_unlock(&sctx->list_lock);
1919		} else {
1920			spin_unlock(&sctx->list_lock);
1921			wait_event(sctx->list_wait, sctx->first_free != -1);
1922		}
1923	}
1924	sbio = sctx->bios[sctx->curr];
1925	if (sbio->page_count == 0) {
1926		struct bio *bio;
1927
1928		sbio->physical = spage->physical;
1929		sbio->logical = spage->logical;
1930		sbio->dev = spage->dev;
1931		bio = sbio->bio;
1932		if (!bio) {
1933			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1934			if (!bio)
1935				return -ENOMEM;
1936			sbio->bio = bio;
1937		}
1938
1939		bio->bi_private = sbio;
1940		bio->bi_end_io = scrub_bio_end_io;
1941		bio->bi_bdev = sbio->dev->bdev;
1942		bio->bi_sector = sbio->physical >> 9;
1943		sbio->err = 0;
1944	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1945		   spage->physical ||
1946		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1947		   spage->logical ||
1948		   sbio->dev != spage->dev) {
1949		scrub_submit(sctx);
1950		goto again;
1951	}
1952
1953	sbio->pagev[sbio->page_count] = spage;
1954	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1955	if (ret != PAGE_SIZE) {
1956		if (sbio->page_count < 1) {
1957			bio_put(sbio->bio);
1958			sbio->bio = NULL;
1959			return -EIO;
1960		}
1961		scrub_submit(sctx);
1962		goto again;
1963	}
1964
1965	scrub_block_get(sblock); /* one for the page added to the bio */
1966	atomic_inc(&sblock->outstanding_pages);
1967	sbio->page_count++;
1968	if (sbio->page_count == sctx->pages_per_rd_bio)
1969		scrub_submit(sctx);
1970
1971	return 0;
1972}
1973
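/*
 * Split the range [logical, logical + len) into PAGE_SIZE pieces, wrap
 * them in a scrub_block and queue all pages for reading. When 'force'
 * is set, the assembled bio is submitted immediately.
 */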
1974static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1975		       u64 physical, struct btrfs_device *dev, u64 flags,
1976		       u64 gen, int mirror_num, u8 *csum, int force,
1977		       u64 physical_for_dev_replace)
1978{
1979	struct scrub_block *sblock;
1980	int index;
1981
1982	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1983	if (!sblock) {
1984		spin_lock(&sctx->stat_lock);
1985		sctx->stat.malloc_errors++;
1986		spin_unlock(&sctx->stat_lock);
1987		return -ENOMEM;
1988	}
1989
1990	/* one ref inside this function, plus one for each page added to
1991	 * a bio later on */
1992	atomic_set(&sblock->ref_count, 1);
1993	sblock->sctx = sctx;
1994	sblock->no_io_error_seen = 1;
1995
1996	for (index = 0; len > 0; index++) {
1997		struct scrub_page *spage;
1998		u64 l = min_t(u64, len, PAGE_SIZE);
1999
2000		spage = kzalloc(sizeof(*spage), GFP_NOFS);
2001		if (!spage) {
2002leave_nomem:
2003			spin_lock(&sctx->stat_lock);
2004			sctx->stat.malloc_errors++;
2005			spin_unlock(&sctx->stat_lock);
2006			scrub_block_put(sblock);
2007			return -ENOMEM;
2008		}
2009		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2010		scrub_page_get(spage);
2011		sblock->pagev[index] = spage;
2012		spage->sblock = sblock;
2013		spage->dev = dev;
2014		spage->flags = flags;
2015		spage->generation = gen;
2016		spage->logical = logical;
2017		spage->physical = physical;
2018		spage->physical_for_dev_replace = physical_for_dev_replace;
2019		spage->mirror_num = mirror_num;
2020		if (csum) {
2021			spage->have_csum = 1;
2022			memcpy(spage->csum, csum, sctx->csum_size);
2023		} else {
2024			spage->have_csum = 0;
2025		}
2026		sblock->page_count++;
2027		spage->page = alloc_page(GFP_NOFS);
2028		if (!spage->page)
2029			goto leave_nomem;
2030		len -= l;
2031		logical += l;
2032		physical += l;
2033		physical_for_dev_replace += l;
2034	}
2035
2036	WARN_ON(sblock->page_count == 0);
2037	for (index = 0; index < sblock->page_count; index++) {
2038		struct scrub_page *spage = sblock->pagev[index];
2039		int ret;
2040
2041		ret = scrub_add_page_to_rd_bio(sctx, spage);
2042		if (ret) {
2043			scrub_block_put(sblock);
2044			return ret;
2045		}
2046	}
2047
2048	if (force)
2049		scrub_submit(sctx);
2050
2051	/* last one frees, either here or in bio completion for last page */
2052	scrub_block_put(sblock);
2053	return 0;
2054}
2055
2056static void scrub_bio_end_io(struct bio *bio, int err)
2057{
2058	struct scrub_bio *sbio = bio->bi_private;
2059	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2060
2061	sbio->err = err;
2062	sbio->bio = bio;
2063
2064	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2065}
2066
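/*
 * Read completion work: mark the pages of a failed bio with io_error,
 * complete every scrub_block whose last outstanding page just arrived,
 * and return the scrub_bio to the free list.
 */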
2067static void scrub_bio_end_io_worker(struct btrfs_work *work)
2068{
2069	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2070	struct scrub_ctx *sctx = sbio->sctx;
2071	int i;
2072
2073	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2074	if (sbio->err) {
2075		for (i = 0; i < sbio->page_count; i++) {
2076			struct scrub_page *spage = sbio->pagev[i];
2077
2078			spage->io_error = 1;
2079			spage->sblock->no_io_error_seen = 0;
2080		}
2081	}
2082
2083	/* now complete the scrub_block items that have all pages completed */
2084	for (i = 0; i < sbio->page_count; i++) {
2085		struct scrub_page *spage = sbio->pagev[i];
2086		struct scrub_block *sblock = spage->sblock;
2087
2088		if (atomic_dec_and_test(&sblock->outstanding_pages))
2089			scrub_block_complete(sblock);
2090		scrub_block_put(sblock);
2091	}
2092
2093	bio_put(sbio->bio);
2094	sbio->bio = NULL;
2095	spin_lock(&sctx->list_lock);
2096	sbio->next_free = sctx->first_free;
2097	sctx->first_free = sbio->index;
2098	spin_unlock(&sctx->list_lock);
2099
2100	if (sctx->is_dev_replace &&
2101	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2102		mutex_lock(&sctx->wr_ctx.wr_lock);
2103		scrub_wr_submit(sctx);
2104		mutex_unlock(&sctx->wr_ctx.wr_lock);
2105	}
2106
2107	scrub_pending_bio_dec(sctx);
2108}
2109
2110static void scrub_block_complete(struct scrub_block *sblock)
2111{
2112	if (!sblock->no_io_error_seen) {
2113		scrub_handle_errored_block(sblock);
2114	} else {
2115		/*
2116		 * if has checksum error, write via repair mechanism in
2117		 * In the dev-replace case: if the block has a checksum error,
2118		 * it is handled by the repair code called from scrub_checksum();
2119		 * otherwise it is written to the replace target here.
2120		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2121			scrub_write_block_to_dev_replace(sblock);
2122	}
2123}
2124
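/*
 * Look up the data checksum for 'logical' in the csum list that was
 * pre-filled from the csum tree; entries that lie entirely before
 * 'logical' are discarded. Returns 1 and copies the checksum into
 * 'csum' when found, 0 otherwise.
 */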
2125static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126			   u8 *csum)
2127{
2128	struct btrfs_ordered_sum *sum = NULL;
2129	unsigned long index;
2130	unsigned long num_sectors;
2131
2132	while (!list_empty(&sctx->csum_list)) {
2133		sum = list_first_entry(&sctx->csum_list,
2134				       struct btrfs_ordered_sum, list);
2135		if (sum->bytenr > logical)
2136			return 0;
2137		if (sum->bytenr + sum->len > logical)
2138			break;
2139
2140		++sctx->stat.csum_discards;
2141		list_del(&sum->list);
2142		kfree(sum);
2143		sum = NULL;
2144	}
2145	if (!sum)
2146		return 0;
2147
2148	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149	num_sectors = sum->len / sctx->sectorsize;
2150	memcpy(csum, sum->sums + index, sctx->csum_size);
2151	if (index == num_sectors - 1) {
2152		list_del(&sum->list);
2153		kfree(sum);
2154	}
2155	return 1;
2156}
2157
2158/* scrub_extent() tries to collect up to 64 kB for each bio */
2159static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2160			u64 physical, struct btrfs_device *dev, u64 flags,
2161			u64 gen, int mirror_num, u64 physical_for_dev_replace)
2162{
2163	int ret;
2164	u8 csum[BTRFS_CSUM_SIZE];
2165	u32 blocksize;
2166
2167	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2168		blocksize = sctx->sectorsize;
2169		spin_lock(&sctx->stat_lock);
2170		sctx->stat.data_extents_scrubbed++;
2171		sctx->stat.data_bytes_scrubbed += len;
2172		spin_unlock(&sctx->stat_lock);
2173	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2174		WARN_ON(sctx->nodesize != sctx->leafsize);
2175		blocksize = sctx->nodesize;
2176		spin_lock(&sctx->stat_lock);
2177		sctx->stat.tree_extents_scrubbed++;
2178		sctx->stat.tree_bytes_scrubbed += len;
2179		spin_unlock(&sctx->stat_lock);
2180	} else {
2181		blocksize = sctx->sectorsize;
2182		WARN_ON(1);
2183	}
2184
2185	while (len) {
2186		u64 l = min_t(u64, len, blocksize);
2187		int have_csum = 0;
2188
2189		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2190			/* push csums to sbio */
2191			have_csum = scrub_find_csum(sctx, logical, l, csum);
2192			if (have_csum == 0)
2193				++sctx->stat.no_csum;
2194			if (sctx->is_dev_replace && !have_csum) {
2195				ret = copy_nocow_pages(sctx, logical, l,
2196						       mirror_num,
2197						      physical_for_dev_replace);
2198				goto behind_scrub_pages;
2199			}
2200		}
2201		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2202				  mirror_num, have_csum ? csum : NULL, 0,
2203				  physical_for_dev_replace);
2204behind_scrub_pages:
2205		if (ret)
2206			return ret;
2207		len -= l;
2208		logical += l;
2209		physical += l;
2210		physical_for_dev_replace += l;
2211	}
2212	return 0;
2213}
2214
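/*
 * Scrub one stripe of a chunk on the given device: walk the extent
 * tree (on the commit root) for the logical range covered by the
 * stripe, look up the data checksums, and hand every extent to
 * scrub_extent(). Pause and cancel requests are honored between
 * stripe increments.
 */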
2215static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2216					   struct map_lookup *map,
2217					   struct btrfs_device *scrub_dev,
2218					   int num, u64 base, u64 length,
2219					   int is_dev_replace)
2220{
2221	struct btrfs_path *path;
2222	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2223	struct btrfs_root *root = fs_info->extent_root;
2224	struct btrfs_root *csum_root = fs_info->csum_root;
2225	struct btrfs_extent_item *extent;
2226	struct blk_plug plug;
2227	u64 flags;
2228	int ret;
2229	int slot;
2230	u64 nstripes;
2231	struct extent_buffer *l;
2232	struct btrfs_key key;
2233	u64 physical;
2234	u64 logical;
2235	u64 logic_end;
2236	u64 generation;
2237	int mirror_num;
2238	struct reada_control *reada1;
2239	struct reada_control *reada2;
2240	struct btrfs_key key_start;
2241	struct btrfs_key key_end;
2242	u64 increment = map->stripe_len;
2243	u64 offset;
2244	u64 extent_logical;
2245	u64 extent_physical;
2246	u64 extent_len;
2247	struct btrfs_device *extent_dev;
2248	int extent_mirror_num;
2249	int stop_loop;
2250
2251	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2252			 BTRFS_BLOCK_GROUP_RAID6)) {
2253		if (num >= nr_data_stripes(map)) {
2254			return 0;
2255		}
2256	}
2257
2258	nstripes = length;
2259	offset = 0;
2260	do_div(nstripes, map->stripe_len);
2261	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2262		offset = map->stripe_len * num;
2263		increment = map->stripe_len * map->num_stripes;
2264		mirror_num = 1;
2265	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2266		int factor = map->num_stripes / map->sub_stripes;
2267		offset = map->stripe_len * (num / map->sub_stripes);
2268		increment = map->stripe_len * factor;
2269		mirror_num = num % map->sub_stripes + 1;
2270	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2271		increment = map->stripe_len;
2272		mirror_num = num % map->num_stripes + 1;
2273	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2274		increment = map->stripe_len;
2275		mirror_num = num % map->num_stripes + 1;
2276	} else {
2277		increment = map->stripe_len;
2278		mirror_num = 1;
2279	}
2280
2281	path = btrfs_alloc_path();
2282	if (!path)
2283		return -ENOMEM;
2284
2285	/*
2286	 * work on the commit root. The related disk blocks are static as
2287	 * long as COW is applied. This means it is safe to rewrite them
2288	 * to repair disk errors without any race conditions.
2289	 */
2290	path->search_commit_root = 1;
2291	path->skip_locking = 1;
2292
2293	/*
2294	 * trigger readahead for the extent tree and the csum tree and wait
2295	 * for completion. During readahead, the scrub is officially paused
2296	 * so that it does not hold off transaction commits.
2297	 */
2298	logical = base + offset;
2299
2300	wait_event(sctx->list_wait,
2301		   atomic_read(&sctx->bios_in_flight) == 0);
2302	atomic_inc(&fs_info->scrubs_paused);
2303	wake_up(&fs_info->scrub_pause_wait);
2304
2305	/* FIXME it might be better to start readahead at commit root */
2306	key_start.objectid = logical;
2307	key_start.type = BTRFS_EXTENT_ITEM_KEY;
2308	key_start.offset = (u64)0;
2309	key_end.objectid = base + offset + nstripes * increment;
2310	key_end.type = BTRFS_METADATA_ITEM_KEY;
2311	key_end.offset = (u64)-1;
2312	reada1 = btrfs_reada_add(root, &key_start, &key_end);
2313
2314	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2315	key_start.type = BTRFS_EXTENT_CSUM_KEY;
2316	key_start.offset = logical;
2317	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2318	key_end.type = BTRFS_EXTENT_CSUM_KEY;
2319	key_end.offset = base + offset + nstripes * increment;
2320	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2321
2322	if (!IS_ERR(reada1))
2323		btrfs_reada_wait(reada1);
2324	if (!IS_ERR(reada2))
2325		btrfs_reada_wait(reada2);
2326
2327	mutex_lock(&fs_info->scrub_lock);
2328	while (atomic_read(&fs_info->scrub_pause_req)) {
2329		mutex_unlock(&fs_info->scrub_lock);
2330		wait_event(fs_info->scrub_pause_wait,
2331		   atomic_read(&fs_info->scrub_pause_req) == 0);
2332		mutex_lock(&fs_info->scrub_lock);
2333	}
2334	atomic_dec(&fs_info->scrubs_paused);
2335	mutex_unlock(&fs_info->scrub_lock);
2336	wake_up(&fs_info->scrub_pause_wait);
2337
2338	/*
2339	 * collect all data csums for the stripe to avoid seeking during
2340	 * the scrub. With crc32, this currently ends up being about 1MB
2341	 */
2342	blk_start_plug(&plug);
2343
2344	/*
2345	 * now find all extents for each stripe and scrub them
2346	 */
2347	logical = base + offset;
2348	physical = map->stripes[num].physical;
2349	logic_end = logical + increment * nstripes;
2350	ret = 0;
2351	while (logical < logic_end) {
2352		/*
2353		 * canceled?
2354		 */
2355		if (atomic_read(&fs_info->scrub_cancel_req) ||
2356		    atomic_read(&sctx->cancel_req)) {
2357			ret = -ECANCELED;
2358			goto out;
2359		}
2360		/*
2361		 * check to see if we have to pause
2362		 */
2363		if (atomic_read(&fs_info->scrub_pause_req)) {
2364			/* push queued extents */
2365			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2366			scrub_submit(sctx);
2367			mutex_lock(&sctx->wr_ctx.wr_lock);
2368			scrub_wr_submit(sctx);
2369			mutex_unlock(&sctx->wr_ctx.wr_lock);
2370			wait_event(sctx->list_wait,
2371				   atomic_read(&sctx->bios_in_flight) == 0);
2372			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2373			atomic_inc(&fs_info->scrubs_paused);
2374			wake_up(&fs_info->scrub_pause_wait);
2375			mutex_lock(&fs_info->scrub_lock);
2376			while (atomic_read(&fs_info->scrub_pause_req)) {
2377				mutex_unlock(&fs_info->scrub_lock);
2378				wait_event(fs_info->scrub_pause_wait,
2379				   atomic_read(&fs_info->scrub_pause_req) == 0);
2380				mutex_lock(&fs_info->scrub_lock);
2381			}
2382			atomic_dec(&fs_info->scrubs_paused);
2383			mutex_unlock(&fs_info->scrub_lock);
2384			wake_up(&fs_info->scrub_pause_wait);
2385		}
2386
2387		key.objectid = logical;
2388		key.type = BTRFS_EXTENT_ITEM_KEY;
2389		key.offset = (u64)-1;
2390
2391		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2392		if (ret < 0)
2393			goto out;
2394
2395		if (ret > 0) {
2396			ret = btrfs_previous_item(root, path, 0,
2397						  BTRFS_EXTENT_ITEM_KEY);
2398			if (ret < 0)
2399				goto out;
2400			if (ret > 0) {
2401				/* there's no smaller item, so stick with the
2402				 * larger one */
2403				btrfs_release_path(path);
2404				ret = btrfs_search_slot(NULL, root, &key,
2405							path, 0, 0);
2406				if (ret < 0)
2407					goto out;
2408			}
2409		}
2410
2411		stop_loop = 0;
2412		while (1) {
2413			u64 bytes;
2414
2415			l = path->nodes[0];
2416			slot = path->slots[0];
2417			if (slot >= btrfs_header_nritems(l)) {
2418				ret = btrfs_next_leaf(root, path);
2419				if (ret == 0)
2420					continue;
2421				if (ret < 0)
2422					goto out;
2423
2424				stop_loop = 1;
2425				break;
2426			}
2427			btrfs_item_key_to_cpu(l, &key, slot);
2428
2429			if (key.type == BTRFS_METADATA_ITEM_KEY)
2430				bytes = root->leafsize;
2431			else
2432				bytes = key.offset;
2433
2434			if (key.objectid + bytes <= logical)
2435				goto next;
2436
2437			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2438			    key.type != BTRFS_METADATA_ITEM_KEY)
2439				goto next;
2440
2441			if (key.objectid >= logical + map->stripe_len) {
2442				/* out of this device extent */
2443				if (key.objectid >= logic_end)
2444					stop_loop = 1;
2445				break;
2446			}
2447
2448			extent = btrfs_item_ptr(l, slot,
2449						struct btrfs_extent_item);
2450			flags = btrfs_extent_flags(l, extent);
2451			generation = btrfs_extent_generation(l, extent);
2452
2453			if (key.objectid < logical &&
2454			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2455				printk(KERN_ERR
2456				       "btrfs scrub: tree block %llu spanning "
2457				       "stripes, ignored. logical=%llu\n",
2458				       (unsigned long long)key.objectid,
2459				       (unsigned long long)logical);
2460				goto next;
2461			}
2462
2463again:
2464			extent_logical = key.objectid;
2465			extent_len = bytes;
2466
2467			/*
2468			 * trim extent to this stripe
2469			 */
2470			if (extent_logical < logical) {
2471				extent_len -= logical - extent_logical;
2472				extent_logical = logical;
2473			}
2474			if (extent_logical + extent_len >
2475			    logical + map->stripe_len) {
2476				extent_len = logical + map->stripe_len -
2477					     extent_logical;
2478			}
2479
2480			extent_physical = extent_logical - logical + physical;
2481			extent_dev = scrub_dev;
2482			extent_mirror_num = mirror_num;
2483			if (is_dev_replace)
2484				scrub_remap_extent(fs_info, extent_logical,
2485						   extent_len, &extent_physical,
2486						   &extent_dev,
2487						   &extent_mirror_num);
2488
2489			ret = btrfs_lookup_csums_range(csum_root, logical,
2490						logical + map->stripe_len - 1,
2491						&sctx->csum_list, 1);
2492			if (ret)
2493				goto out;
2494
2495			ret = scrub_extent(sctx, extent_logical, extent_len,
2496					   extent_physical, extent_dev, flags,
2497					   generation, extent_mirror_num,
2498					   extent_logical - logical + physical);
2499			if (ret)
2500				goto out;
2501
2502			scrub_free_csums(sctx);
2503			if (extent_logical + extent_len <
2504			    key.objectid + bytes) {
2505				logical += increment;
2506				physical += map->stripe_len;
2507
2508				if (logical < key.objectid + bytes) {
2509					cond_resched();
2510					goto again;
2511				}
2512
2513				if (logical >= logic_end) {
2514					stop_loop = 1;
2515					break;
2516				}
2517			}
2518next:
2519			path->slots[0]++;
2520		}
2521		btrfs_release_path(path);
2522		logical += increment;
2523		physical += map->stripe_len;
2524		spin_lock(&sctx->stat_lock);
2525		if (stop_loop)
2526			sctx->stat.last_physical = map->stripes[num].physical +
2527						   length;
2528		else
2529			sctx->stat.last_physical = physical;
2530		spin_unlock(&sctx->stat_lock);
2531		if (stop_loop)
2532			break;
2533	}
2534out:
2535	/* push queued extents */
2536	scrub_submit(sctx);
2537	mutex_lock(&sctx->wr_ctx.wr_lock);
2538	scrub_wr_submit(sctx);
2539	mutex_unlock(&sctx->wr_ctx.wr_lock);
2540
2541	blk_finish_plug(&plug);
2542	btrfs_free_path(path);
2543	return ret < 0 ? ret : 0;
2544}
2545
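/*
 * Scrub a whole chunk: look up its mapping and scrub every stripe
 * that is located on the given device at the given device offset.
 */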
2546static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2547					  struct btrfs_device *scrub_dev,
2548					  u64 chunk_tree, u64 chunk_objectid,
2549					  u64 chunk_offset, u64 length,
2550					  u64 dev_offset, int is_dev_replace)
2551{
2552	struct btrfs_mapping_tree *map_tree =
2553		&sctx->dev_root->fs_info->mapping_tree;
2554	struct map_lookup *map;
2555	struct extent_map *em;
2556	int i;
2557	int ret = 0;
2558
2559	read_lock(&map_tree->map_tree.lock);
2560	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2561	read_unlock(&map_tree->map_tree.lock);
2562
2563	if (!em)
2564		return -EINVAL;
2565
2566	map = (struct map_lookup *)em->bdev;
2567	if (em->start != chunk_offset)
2568		goto out;
2569
2570	if (em->len < length)
2571		goto out;
2572
2573	for (i = 0; i < map->num_stripes; ++i) {
2574		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2575		    map->stripes[i].physical == dev_offset) {
2576			ret = scrub_stripe(sctx, map, scrub_dev, i,
2577					   chunk_offset, length,
2578					   is_dev_replace);
2579			if (ret)
2580				goto out;
2581		}
2582	}
2583out:
2584	free_extent_map(em);
2585
2586	return ret;
2587}
2588
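/*
 * Walk the device tree for dev extents of scrub_dev within
 * [start, end) and scrub the corresponding chunks one by one. After
 * each chunk, all pending read and write bios are flushed and waited
 * for, and scrub pause requests are honored.
 */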
2589static noinline_for_stack
2590int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2591			   struct btrfs_device *scrub_dev, u64 start, u64 end,
2592			   int is_dev_replace)
2593{
2594	struct btrfs_dev_extent *dev_extent = NULL;
2595	struct btrfs_path *path;
2596	struct btrfs_root *root = sctx->dev_root;
2597	struct btrfs_fs_info *fs_info = root->fs_info;
2598	u64 length;
2599	u64 chunk_tree;
2600	u64 chunk_objectid;
2601	u64 chunk_offset;
2602	int ret;
2603	int slot;
2604	struct extent_buffer *l;
2605	struct btrfs_key key;
2606	struct btrfs_key found_key;
2607	struct btrfs_block_group_cache *cache;
2608	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2609
2610	path = btrfs_alloc_path();
2611	if (!path)
2612		return -ENOMEM;
2613
2614	path->reada = 2;
2615	path->search_commit_root = 1;
2616	path->skip_locking = 1;
2617
2618	key.objectid = scrub_dev->devid;
2619	key.offset = 0ull;
2620	key.type = BTRFS_DEV_EXTENT_KEY;
2621
2622	while (1) {
2623		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2624		if (ret < 0)
2625			break;
2626		if (ret > 0) {
2627			if (path->slots[0] >=
2628			    btrfs_header_nritems(path->nodes[0])) {
2629				ret = btrfs_next_leaf(root, path);
2630				if (ret)
2631					break;
2632			}
2633		}
2634
2635		l = path->nodes[0];
2636		slot = path->slots[0];
2637
2638		btrfs_item_key_to_cpu(l, &found_key, slot);
2639
2640		if (found_key.objectid != scrub_dev->devid)
2641			break;
2642
2643		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2644			break;
2645
2646		if (found_key.offset >= end)
2647			break;
2648
2649		if (found_key.offset < key.offset)
2650			break;
2651
2652		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2653		length = btrfs_dev_extent_length(l, dev_extent);
2654
2655		if (found_key.offset + length <= start) {
2656			key.offset = found_key.offset + length;
2657			btrfs_release_path(path);
2658			continue;
2659		}
2660
2661		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2662		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2663		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2664
2665		/*
2666		 * get a reference on the corresponding block group to prevent
2667		 * the chunk from going away while we scrub it
2668		 */
2669		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2670		if (!cache) {
2671			ret = -ENOENT;
2672			break;
2673		}
2674		dev_replace->cursor_right = found_key.offset + length;
2675		dev_replace->cursor_left = found_key.offset;
2676		dev_replace->item_needs_writeback = 1;
2677		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2678				  chunk_offset, length, found_key.offset,
2679				  is_dev_replace);
2680
2681		/*
2682		 * flush: submit all pending read and write bios and wait
2683		 * for them afterwards.
2684		 * Note that in the dev replace case, a read request causes
2685		 * write requests that are submitted in the read completion
2686		 * worker. Therefore in the current situation, it is required
2687		 * that all write requests are flushed, so that all read and
2688		 * write requests are really completed when bios_in_flight
2689		 * changes to 0.
2690		 */
2691		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2692		scrub_submit(sctx);
2693		mutex_lock(&sctx->wr_ctx.wr_lock);
2694		scrub_wr_submit(sctx);
2695		mutex_unlock(&sctx->wr_ctx.wr_lock);
2696
2697		wait_event(sctx->list_wait,
2698			   atomic_read(&sctx->bios_in_flight) == 0);
2699		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2700		atomic_inc(&fs_info->scrubs_paused);
2701		wake_up(&fs_info->scrub_pause_wait);
2702		wait_event(sctx->list_wait,
2703			   atomic_read(&sctx->workers_pending) == 0);
2704
2705		mutex_lock(&fs_info->scrub_lock);
2706		while (atomic_read(&fs_info->scrub_pause_req)) {
2707			mutex_unlock(&fs_info->scrub_lock);
2708			wait_event(fs_info->scrub_pause_wait,
2709			   atomic_read(&fs_info->scrub_pause_req) == 0);
2710			mutex_lock(&fs_info->scrub_lock);
2711		}
2712		atomic_dec(&fs_info->scrubs_paused);
2713		mutex_unlock(&fs_info->scrub_lock);
2714		wake_up(&fs_info->scrub_pause_wait);
2715
2716		dev_replace->cursor_left = dev_replace->cursor_right;
2717		dev_replace->item_needs_writeback = 1;
2718		btrfs_put_block_group(cache);
2719		if (ret)
2720			break;
2721		if (is_dev_replace &&
2722		    atomic64_read(&dev_replace->num_write_errors) > 0) {
2723			ret = -EIO;
2724			break;
2725		}
2726		if (sctx->stat.malloc_errors > 0) {
2727			ret = -ENOMEM;
2728			break;
2729		}
2730
2731		key.offset = found_key.offset + length;
2732		btrfs_release_path(path);
2733	}
2734
2735	btrfs_free_path(path);
2736
2737	/*
2738	 * ret can still be 1 from search_slot or next_leaf;
2739	 * that's not an error
2740	 */
2741	return ret < 0 ? ret : 0;
2742}
2743
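/*
 * Read and verify all super block copies that fit on the device.
 * Checksum failures are only reported, since the super blocks get
 * rewritten with the next transaction commit anyway.
 */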
2744static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2745					   struct btrfs_device *scrub_dev)
2746{
2747	int	i;
2748	u64	bytenr;
2749	u64	gen;
2750	int	ret;
2751	struct btrfs_root *root = sctx->dev_root;
2752
2753	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2754		return -EIO;
2755
2756	gen = root->fs_info->last_trans_committed;
2757
2758	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2759		bytenr = btrfs_sb_offset(i);
2760		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2761			break;
2762
2763		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2764				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2765				  NULL, 1, bytenr);
2766		if (ret)
2767			return ret;
2768	}
2769	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2770
2771	return 0;
2772}
2773
2774/*
2775 * get a reference on fs_info->scrub_workers; start the workers if necessary
2776 */
2777static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2778						int is_dev_replace)
2779{
2780	int ret = 0;
2781
2782	mutex_lock(&fs_info->scrub_lock);
2783	if (fs_info->scrub_workers_refcnt == 0) {
2784		if (is_dev_replace)
2785			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2786					&fs_info->generic_worker);
2787		else
2788			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2789					fs_info->thread_pool_size,
2790					&fs_info->generic_worker);
2791		fs_info->scrub_workers.idle_thresh = 4;
2792		ret = btrfs_start_workers(&fs_info->scrub_workers);
2793		if (ret)
2794			goto out;
2795		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2796				   "scrubwrc",
2797				   fs_info->thread_pool_size,
2798				   &fs_info->generic_worker);
2799		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2800		ret = btrfs_start_workers(
2801				&fs_info->scrub_wr_completion_workers);
2802		if (ret)
2803			goto out;
2804		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2805				   &fs_info->generic_worker);
2806		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2807		if (ret)
2808			goto out;
2809	}
2810	++fs_info->scrub_workers_refcnt;
2811out:
2812	mutex_unlock(&fs_info->scrub_lock);
2813
2814	return ret;
2815}
2816
2817static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2818{
2819	mutex_lock(&fs_info->scrub_lock);
2820	if (--fs_info->scrub_workers_refcnt == 0) {
2821		btrfs_stop_workers(&fs_info->scrub_workers);
2822		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2823		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2824	}
2825	WARN_ON(fs_info->scrub_workers_refcnt < 0);
2826	mutex_unlock(&fs_info->scrub_lock);
2827}
2828
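/*
 * Main scrub entry point (also used for dev-replace): validate the
 * size assumptions, set up the workers and a scrub context for the
 * device, then scrub the super blocks (unless this is a dev-replace
 * run) and all chunks, and report the accumulated statistics via
 * 'progress'.
 */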
2829int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2830		    u64 end, struct btrfs_scrub_progress *progress,
2831		    int readonly, int is_dev_replace)
2832{
2833	struct scrub_ctx *sctx;
2834	int ret;
2835	struct btrfs_device *dev;
2836
2837	if (btrfs_fs_closing(fs_info))
2838		return -EINVAL;
2839
2840	/*
2841	 * check some assumptions
2842	 */
2843	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2844		printk(KERN_ERR
2845		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2846		       fs_info->chunk_root->nodesize,
2847		       fs_info->chunk_root->leafsize);
2848		return -EINVAL;
2849	}
2850
2851	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2852		/*
2853		 * the way scrub is implemented, it is unable to calculate
2854		 * checksums for node sizes larger than BTRFS_STRIPE_LEN.
2855		 * Do not handle this situation at all because it won't ever happen.
2856		 */
2857		printk(KERN_ERR
2858		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2859		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2860		return -EINVAL;
2861	}
2862
2863	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2864		/* not supported for data w/o checksums */
2865		printk(KERN_ERR
2866		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2867		       fs_info->chunk_root->sectorsize,
2868		       (unsigned long long)PAGE_SIZE);
2869		return -EINVAL;
2870	}
2871
2872	if (fs_info->chunk_root->nodesize >
2873	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2874	    fs_info->chunk_root->sectorsize >
2875	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2876		/*
2877		 * would exhaust the array bounds of the pagev member in
2878		 * struct scrub_block
2879		 */
2880		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2881		       fs_info->chunk_root->nodesize,
2882		       SCRUB_MAX_PAGES_PER_BLOCK,
2883		       fs_info->chunk_root->sectorsize,
2884		       SCRUB_MAX_PAGES_PER_BLOCK);
2885		return -EINVAL;
2886	}
2887
2888	ret = scrub_workers_get(fs_info, is_dev_replace);
2889	if (ret)
2890		return ret;
2891
2892	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2893	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2894	if (!dev || (dev->missing && !is_dev_replace)) {
2895		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2896		scrub_workers_put(fs_info);
2897		return -ENODEV;
2898	}
2899	mutex_lock(&fs_info->scrub_lock);
2900
2901	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2902		mutex_unlock(&fs_info->scrub_lock);
2903		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2904		scrub_workers_put(fs_info);
2905		return -EIO;
2906	}
2907
2908	btrfs_dev_replace_lock(&fs_info->dev_replace);
2909	if (dev->scrub_device ||
2910	    (!is_dev_replace &&
2911	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2912		btrfs_dev_replace_unlock(&fs_info->dev_replace);
2913		mutex_unlock(&fs_info->scrub_lock);
2914		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2915		scrub_workers_put(fs_info);
2916		return -EINPROGRESS;
2917	}
2918	btrfs_dev_replace_unlock(&fs_info->dev_replace);
2919	sctx = scrub_setup_ctx(dev, is_dev_replace);
2920	if (IS_ERR(sctx)) {
2921		mutex_unlock(&fs_info->scrub_lock);
2922		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2923		scrub_workers_put(fs_info);
2924		return PTR_ERR(sctx);
2925	}
2926	sctx->readonly = readonly;
2927	dev->scrub_device = sctx;
2928
2929	atomic_inc(&fs_info->scrubs_running);
2930	mutex_unlock(&fs_info->scrub_lock);
2931	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2932
2933	if (!is_dev_replace) {
2934		down_read(&fs_info->scrub_super_lock);
2935		ret = scrub_supers(sctx, dev);
2936		up_read(&fs_info->scrub_super_lock);
2937	}
2938
2939	if (!ret)
2940		ret = scrub_enumerate_chunks(sctx, dev, start, end,
2941					     is_dev_replace);
2942
2943	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2944	atomic_dec(&fs_info->scrubs_running);
2945	wake_up(&fs_info->scrub_pause_wait);
2946
2947	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2948
2949	if (progress)
2950		memcpy(progress, &sctx->stat, sizeof(*progress));
2951
2952	mutex_lock(&fs_info->scrub_lock);
2953	dev->scrub_device = NULL;
2954	mutex_unlock(&fs_info->scrub_lock);
2955
2956	scrub_free_ctx(sctx);
2957	scrub_workers_put(fs_info);
2958
2959	return ret;
2960}
2961
2962void btrfs_scrub_pause(struct btrfs_root *root)
2963{
2964	struct btrfs_fs_info *fs_info = root->fs_info;
2965
2966	mutex_lock(&fs_info->scrub_lock);
2967	atomic_inc(&fs_info->scrub_pause_req);
2968	while (atomic_read(&fs_info->scrubs_paused) !=
2969	       atomic_read(&fs_info->scrubs_running)) {
2970		mutex_unlock(&fs_info->scrub_lock);
2971		wait_event(fs_info->scrub_pause_wait,
2972			   atomic_read(&fs_info->scrubs_paused) ==
2973			   atomic_read(&fs_info->scrubs_running));
2974		mutex_lock(&fs_info->scrub_lock);
2975	}
2976	mutex_unlock(&fs_info->scrub_lock);
2977}
2978
2979void btrfs_scrub_continue(struct btrfs_root *root)
2980{
2981	struct btrfs_fs_info *fs_info = root->fs_info;
2982
2983	atomic_dec(&fs_info->scrub_pause_req);
2984	wake_up(&fs_info->scrub_pause_wait);
2985}
2986
2987void btrfs_scrub_pause_super(struct btrfs_root *root)
2988{
2989	down_write(&root->fs_info->scrub_super_lock);
2990}
2991
2992void btrfs_scrub_continue_super(struct btrfs_root *root)
2993{
2994	up_write(&root->fs_info->scrub_super_lock);
2995}
2996
2997int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2998{
2999	mutex_lock(&fs_info->scrub_lock);
3000	if (!atomic_read(&fs_info->scrubs_running)) {
3001		mutex_unlock(&fs_info->scrub_lock);
3002		return -ENOTCONN;
3003	}
3004
3005	atomic_inc(&fs_info->scrub_cancel_req);
3006	while (atomic_read(&fs_info->scrubs_running)) {
3007		mutex_unlock(&fs_info->scrub_lock);
3008		wait_event(fs_info->scrub_pause_wait,
3009			   atomic_read(&fs_info->scrubs_running) == 0);
3010		mutex_lock(&fs_info->scrub_lock);
3011	}
3012	atomic_dec(&fs_info->scrub_cancel_req);
3013	mutex_unlock(&fs_info->scrub_lock);
3014
3015	return 0;
3016}
3017
3018int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3019			   struct btrfs_device *dev)
3020{
3021	struct scrub_ctx *sctx;
3022
3023	mutex_lock(&fs_info->scrub_lock);
3024	sctx = dev->scrub_device;
3025	if (!sctx) {
3026		mutex_unlock(&fs_info->scrub_lock);
3027		return -ENOTCONN;
3028	}
3029	atomic_inc(&sctx->cancel_req);
3030	while (dev->scrub_device) {
3031		mutex_unlock(&fs_info->scrub_lock);
3032		wait_event(fs_info->scrub_pause_wait,
3033			   dev->scrub_device == NULL);
3034		mutex_lock(&fs_info->scrub_lock);
3035	}
3036	mutex_unlock(&fs_info->scrub_lock);
3037
3038	return 0;
3039}
3040
3041int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3042			 struct btrfs_scrub_progress *progress)
3043{
3044	struct btrfs_device *dev;
3045	struct scrub_ctx *sctx = NULL;
3046
3047	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3048	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3049	if (dev)
3050		sctx = dev->scrub_device;
3051	if (sctx)
3052		memcpy(progress, &sctx->stat, sizeof(*progress));
3053	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3054
3055	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3056}
3057
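/*
 * For the dev-replace case, translate the logical extent address into
 * the physical address, device and mirror number of the first stripe
 * returned by btrfs_map_block(); on any lookup failure the caller's
 * values are left untouched.
 */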
3058static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3059			       u64 extent_logical, u64 extent_len,
3060			       u64 *extent_physical,
3061			       struct btrfs_device **extent_dev,
3062			       int *extent_mirror_num)
3063{
3064	u64 mapped_length;
3065	struct btrfs_bio *bbio = NULL;
3066	int ret;
3067
3068	mapped_length = extent_len;
3069	ret = btrfs_map_block(fs_info, READ, extent_logical,
3070			      &mapped_length, &bbio, 0);
3071	if (ret || !bbio || mapped_length < extent_len ||
3072	    !bbio->stripes[0].dev->bdev) {
3073		kfree(bbio);
3074		return;
3075	}
3076
3077	*extent_physical = bbio->stripes[0].physical;
3078	*extent_mirror_num = bbio->mirror_num;
3079	*extent_dev = bbio->stripes[0].dev;
3080	kfree(bbio);
3081}
3082
3083static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3084			      struct scrub_wr_ctx *wr_ctx,
3085			      struct btrfs_fs_info *fs_info,
3086			      struct btrfs_device *dev,
3087			      int is_dev_replace)
3088{
3089	WARN_ON(wr_ctx->wr_curr_bio != NULL);
3090
3091	mutex_init(&wr_ctx->wr_lock);
3092	wr_ctx->wr_curr_bio = NULL;
3093	if (!is_dev_replace)
3094		return 0;
3095
3096	WARN_ON(!dev->bdev);
3097	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3098					 bio_get_nr_vecs(dev->bdev));
3099	wr_ctx->tgtdev = dev;
3100	atomic_set(&wr_ctx->flush_all_writes, 0);
3101	return 0;
3102}
3103
3104static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3105{
3106	mutex_lock(&wr_ctx->wr_lock);
3107	kfree(wr_ctx->wr_curr_bio);
3108	wr_ctx->wr_curr_bio = NULL;
3109	mutex_unlock(&wr_ctx->wr_lock);
3110}
3111
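/*
 * Dev-replace path for data without checksums (nocow/nodatasum): the
 * copy is deferred to the scrub_nocow_workers, where the pages are
 * read through the page cache of the owning inodes and written to the
 * target device.
 */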
3112static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3113			    int mirror_num, u64 physical_for_dev_replace)
3114{
3115	struct scrub_copy_nocow_ctx *nocow_ctx;
3116	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3117
3118	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3119	if (!nocow_ctx) {
3120		spin_lock(&sctx->stat_lock);
3121		sctx->stat.malloc_errors++;
3122		spin_unlock(&sctx->stat_lock);
3123		return -ENOMEM;
3124	}
3125
3126	scrub_pending_trans_workers_inc(sctx);
3127
3128	nocow_ctx->sctx = sctx;
3129	nocow_ctx->logical = logical;
3130	nocow_ctx->len = len;
3131	nocow_ctx->mirror_num = mirror_num;
3132	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3133	nocow_ctx->work.func = copy_nocow_pages_worker;
3134	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3135			   &nocow_ctx->work);
3136
3137	return 0;
3138}
3139
3140static void copy_nocow_pages_worker(struct btrfs_work *work)
3141{
3142	struct scrub_copy_nocow_ctx *nocow_ctx =
3143		container_of(work, struct scrub_copy_nocow_ctx, work);
3144	struct scrub_ctx *sctx = nocow_ctx->sctx;
3145	u64 logical = nocow_ctx->logical;
3146	u64 len = nocow_ctx->len;
3147	int mirror_num = nocow_ctx->mirror_num;
3148	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3149	int ret;
3150	struct btrfs_trans_handle *trans = NULL;
3151	struct btrfs_fs_info *fs_info;
3152	struct btrfs_path *path;
3153	struct btrfs_root *root;
3154	int not_written = 0;
3155
3156	fs_info = sctx->dev_root->fs_info;
3157	root = fs_info->extent_root;
3158
3159	path = btrfs_alloc_path();
3160	if (!path) {
3161		spin_lock(&sctx->stat_lock);
3162		sctx->stat.malloc_errors++;
3163		spin_unlock(&sctx->stat_lock);
3164		not_written = 1;
3165		goto out;
3166	}
3167
3168	trans = btrfs_join_transaction(root);
3169	if (IS_ERR(trans)) {
3170		not_written = 1;
3171		goto out;
3172	}
3173
3174	ret = iterate_inodes_from_logical(logical, fs_info, path,
3175					  copy_nocow_pages_for_inode,
3176					  nocow_ctx);
3177	if (ret != 0 && ret != -ENOENT) {
3178		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3179			(unsigned long long)logical,
3180			(unsigned long long)physical_for_dev_replace,
3181			(unsigned long long)len,
3182			(unsigned long long)mirror_num, ret);
3183		not_written = 1;
3184		goto out;
3185	}
3186
3187out:
3188	if (trans && !IS_ERR(trans))
3189		btrfs_end_transaction(trans, root);
3190	if (not_written)
3191		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3192					    num_uncorrectable_read_errors);
3193
3194	btrfs_free_path(path);
3195	kfree(nocow_ctx);
3196
3197	scrub_pending_trans_workers_dec(sctx);
3198}
3199
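/*
 * Called for every inode/offset that references the logical address:
 * read the pages through the inode's page cache (retrying if a page is
 * invalidated underneath us) and write them to the dev-replace target
 * via write_page_nocow().
 */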
3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3201{
3202	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3203	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3204	struct btrfs_key key;
3205	struct inode *inode;
3206	struct page *page;
3207	struct btrfs_root *local_root;
3208	u64 physical_for_dev_replace;
3209	u64 len;
3210	unsigned long index;
3211	int srcu_index;
3212	int ret;
3213	int err;
3214
3215	key.objectid = root;
3216	key.type = BTRFS_ROOT_ITEM_KEY;
3217	key.offset = (u64)-1;
3218
3219	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3220
3221	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3222	if (IS_ERR(local_root)) {
3223		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3224		return PTR_ERR(local_root);
3225	}
3226
3227	if (btrfs_root_refs(&local_root->root_item) == 0) {
3228		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229		return -ENOENT;
3230	}
3231
3232	key.type = BTRFS_INODE_ITEM_KEY;
3233	key.objectid = inum;
3234	key.offset = 0;
3235	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3236	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3237	if (IS_ERR(inode))
3238		return PTR_ERR(inode);
3239
3240	/* Avoid truncate/dio/punch hole. */
3241	mutex_lock(&inode->i_mutex);
3242	inode_dio_wait(inode);
3243
3244	ret = 0;
3245	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3246	len = nocow_ctx->len;
3247	while (len >= PAGE_CACHE_SIZE) {
3248		index = offset >> PAGE_CACHE_SHIFT;
3249again:
3250		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3251		if (!page) {
3252			pr_err("find_or_create_page() failed\n");
3253			ret = -ENOMEM;
3254			goto out;
3255		}
3256
3257		if (PageUptodate(page)) {
3258			if (PageDirty(page))
3259				goto next_page;
3260		} else {
3261			ClearPageError(page);
3262			err = extent_read_full_page(&BTRFS_I(inode)->
3263							 io_tree,
3264							page, btrfs_get_extent,
3265							nocow_ctx->mirror_num);
3266			if (err) {
3267				ret = err;
3268				goto next_page;
3269			}
3270
3271			lock_page(page);
3272			/*
3273			 * If the page has been removed from the page cache,
3274			 * the data on it is meaningless, because it may be
3275			 * stale; the new data may have been written into a
3276			 * new page in the page cache.
3277			 */
3278			if (page->mapping != inode->i_mapping) {
3279				page_cache_release(page);
3280				goto again;
3281			}
3282			if (!PageUptodate(page)) {
3283				ret = -EIO;
3284				goto next_page;
3285			}
3286		}
3287		err = write_page_nocow(nocow_ctx->sctx,
3288				       physical_for_dev_replace, page);
3289		if (err)
3290			ret = err;
3291next_page:
3292		unlock_page(page);
3293		page_cache_release(page);
3294
3295		if (ret)
3296			break;
3297
3298		offset += PAGE_CACHE_SIZE;
3299		physical_for_dev_replace += PAGE_CACHE_SIZE;
3300		len -= PAGE_CACHE_SIZE;
3301	}
3302out:
3303	mutex_unlock(&inode->i_mutex);
3304	iput(inode);
3305	return ret;
3306}
3307
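/*
 * Synchronously write a single page to the dev-replace target device
 * at the given physical offset.
 */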
3308static int write_page_nocow(struct scrub_ctx *sctx,
3309			    u64 physical_for_dev_replace, struct page *page)
3310{
3311	struct bio *bio;
3312	struct btrfs_device *dev;
3313	int ret;
3314	DECLARE_COMPLETION_ONSTACK(compl);
3315
3316	dev = sctx->wr_ctx.tgtdev;
3317	if (!dev)
3318		return -EIO;
3319	if (!dev->bdev) {
3320		printk_ratelimited(KERN_WARNING
3321			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3322		return -EIO;
3323	}
3324	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
3325	if (!bio) {
3326		spin_lock(&sctx->stat_lock);
3327		sctx->stat.malloc_errors++;
3328		spin_unlock(&sctx->stat_lock);
3329		return -ENOMEM;
3330	}
3331	bio->bi_private = &compl;
3332	bio->bi_end_io = scrub_complete_bio_end_io;
3333	bio->bi_size = 0;
3334	bio->bi_sector = physical_for_dev_replace >> 9;
3335	bio->bi_bdev = dev->bdev;
3336	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3337	if (ret != PAGE_CACHE_SIZE) {
3338leave_with_eio:
3339		bio_put(bio);
3340		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3341		return -EIO;
3342	}
3343	btrfsic_submit_bio(WRITE_SYNC, bio);
3344	wait_for_completion(&compl);
3345
3346	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3347		goto leave_with_eio;
3348
3349	bio_put(bio);
3350	return 0;
3351}
3352