scrub.c revision ff023aac31198e88507d626825379b28ea481d4d
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		ref_count;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	atomic_t		ref_count; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */
	};
};

struct scrub_wr_ctx {
	struct scrub_bio	*wr_curr_bio;
	struct btrfs_device	*tgtdev;
	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	atomic_t		flush_all_writes;
	struct mutex		wr_lock;
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_root	*dev_root;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;
	u32			sectorsize;
	u32			nodesize;
	u32			leafsize;

	int			is_dev_replace;
	struct scrub_wr_ctx	wr_ctx;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	char			*scratch_buf;
	char			*msg_buf;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
	int			msg_bufsize;
	int			scratch_bufsize;
};


static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
				     struct btrfs_fs_info *fs_info,
				     struct scrub_block *original_sblock,
				     u64 length, u64 logical,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size);
static void scrub_complete_bio_end_io(struct bio *bio, int err);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good,
					     int force_write);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			      struct scrub_wr_ctx *wr_ctx,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *dev,
			      int is_dev_replace);
static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio, int err);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      void *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);


static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
}

/*
 * used for workers that require transaction commits (i.e., for the
 * NOCOW case)
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all practical matters. effectively, we only
	 * avoid cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_inc(&sctx->workers_pending);
}

/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	scrub_free_wr_ctx(&sctx->wr_ctx);

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	scrub_free_csums(sctx);
	kfree(sctx);
}

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	int pages_per_rd_bio;
	int ret;

	/*
	 * the setting of pages_per_rd_bio is correct for scrub but might
	 * be wrong for the dev_replace code where we might read from
	 * different devices in the initial huge bios. However, that
	 * code is able to correctly handle the case when adding a page
	 * to a bio fails.
	 */
	if (dev->bdev)
		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
					 bio_get_nr_vecs(dev->bdev));
	else
		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
	if (!sctx)
		goto nomem;
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = pages_per_rd_bio;
	sctx->curr = -1;
	sctx->dev_root = dev->dev_root;
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		sbio->work.func = scrub_bio_end_io_worker;

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	sctx->nodesize = dev->dev_root->nodesize;
	sctx->leafsize = dev->dev_root->leafsize;
	sctx->sectorsize = dev->dev_root->sectorsize;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
				 fs_info->dev_replace.tgtdev, is_dev_replace);
	if (ret) {
		scrub_free_ctx(sctx);
		return ERR_PTR(ret);
	}
	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	ret = inode_item_info(inum, 0, local_root, swarn->path);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}
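/*
 * note: the helper above is used as the iterate_extent_inodes() callback
 * in scrub_print_warning() below, so that one warning line is printed for
 * each file path that references the errored extent.
 */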
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level;
	const int bufsize = 4096;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->dev_root->fs_info;

	path = btrfs_alloc_path();

	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.sector = (sblock->pagev[0]->physical) >> 9;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;
	swarn.msg_bufsize = bufsize;
	swarn.scratch_bufsize = bufsize;

	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
		goto out;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);
	btrfs_release_path(path);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
							&ref_root, &ref_level);
			printk_in_rcu(KERN_WARNING
				"btrfs: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
	} else {
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
					extent_item_pos, 1,
					scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
	kfree(swarn.scratch_buf);
	kfree(swarn.msg_buf);
}

static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
	if (IS_ERR(local_root))
		return PTR_ERR(local_root);

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		struct btrfs_fs_info *fs_info;
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * In the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory then.
			 */
			ret = -EIO;
			goto out;
		}
		fs_info = BTRFS_I(inode)->root->fs_info;
		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
					fixup->logical, page,
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
						EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);
	if (inode)
		iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes
		 * belonging to this extent. so make iterate_extent_inodes
		 * stop
		 */
		return 1;
	}

	return -EIO;
}

static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_ctx *sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sctx = fixup->sctx;
	fs_info = fixup->root->fs_info;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
						path, scrub_fixup_readpage,
						fixup);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
	if (uncorrectable) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&sctx->dev_root->fs_info->dev_replace.
			num_uncorrectable_read_errors);
		printk_ratelimited_in_rcu(KERN_ERR
			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			(unsigned long long)fixup->logical,
			rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 length;
	u64 logical;
	u64 generation;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	u8 *csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->dev_root->fs_info;
	length = sblock_to_check->page_count * PAGE_SIZE;
	logical = sblock_to_check->pagev[0]->logical;
	generation = sblock_to_check->pagev[0]->generation;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	csum = sblock_to_check->pagev[0]->csum;
	dev = sblock_to_check->pagev[0]->dev;

	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
		sblocks_for_recheck = NULL;
		goto nodatasum_case;
	}

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was the cause
	 * that this fixup code is called) another time, page by page this
	 * time in order to know which pages caused I/O errors and which
	 * ones are good (for all mirrors).
	 * The goal is to handle the situation when more than one mirror
	 * contains I/O errors, but the errors do not overlap, i.e. the
	 * data can be repaired by selecting the pages from those mirrors
	 * without I/O error on the particular pages. One example (with
	 * blocks >= 2 * PAGE_SIZE) would be that mirror #1 has an I/O
	 * error on the first page, the second page is good, and mirror #2
	 * has an I/O error on the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the second page
	 * of the second mirror can be repaired by copying the contents of
	 * the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O errors,
	 * the checksum cannot be verified. In order to get the best data
	 * for repairing, the first attempt is to find a mirror without
	 * I/O errors and with a validated checksum. Only if this is not
	 * possible, the pages are picked from mirrors with I/O errors
	 * without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
				     sizeof(*sblocks_for_recheck),
				     GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
					logical, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			    csum, generation, sctx->csum_size);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly && !sctx->is_dev_replace)
		goto did_not_correct_error;

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

nodatasum_case:
		WARN_ON(sctx->is_dev_replace);

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COW'ed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sctx = sctx;
		fixup_nodatasum->dev = dev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		scrub_pending_trans_workers_inc(sctx);
		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
		btrfs_queue_worker(&fs_info->scrub_workers,
				   &fixup_nodatasum->work);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0;
	     mirror_index < BTRFS_MAX_MIRRORS &&
	     sblocks_for_recheck[mirror_index].page_count > 0;
	     mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;
		sblock_other = sblocks_for_recheck + mirror_index;

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, is_metadata,
				    have_csum, csum, generation,
				    sctx->csum_size);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
			} else {
				int force_write = is_metadata || have_csum;

				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other,
						force_write);
			}
			if (0 == ret)
				goto corrected_error;
		}
	}

	/*
	 * for dev_replace, pick good pages and write to the target device.
	 */
	if (sctx->is_dev_replace) {
		success = 1;
		for (page_num = 0; page_num < sblock_bad->page_count;
		     page_num++) {
			int sub_success;

			sub_success = 0;
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
			     sblocks_for_recheck[mirror_index].page_count > 0;
			     mirror_index++) {
				struct scrub_block *sblock_other =
					sblocks_for_recheck + mirror_index;
				struct scrub_page *page_other =
					sblock_other->pagev[page_num];

				if (!page_other->io_error) {
					ret = scrub_write_page_to_dev_replace(
							sblock_other, page_num);
					if (ret == 0) {
						/* succeeded for this page */
						sub_success = 1;
						break;
					} else {
						btrfs_dev_replace_stats_inc(
							&sctx->dev_root->
							fs_info->dev_replace.
							num_write_errors);
					}
				}
			}

			if (!sub_success) {
				/*
				 * did not find a mirror to fetch the page
				 * from. scrub_write_page_to_dev_replace()
				 * handles this case (page->io_error), by
				 * filling the block with zeros before
				 * submitting the write request
				 */
				success = 0;
				ret = scrub_write_page_to_dev_replace(
						sblock_bad, page_num);
				if (ret)
					btrfs_dev_replace_stats_inc(
						&sctx->dev_root->fs_info->
						dev_replace.num_write_errors);
			}
		}

		goto out;
	}

	/*
	 * for regular scrub, repair those pages that are errored.
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report on whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, to see whether
	 * the final checksum then succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
	 * mirror could be repaired by taking 512 bytes of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */

	/* can only fix I/O errors from here on */
	if (sblock_bad->no_io_error_seen)
		goto did_not_correct_error;

	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		struct scrub_page *page_bad = sblock_bad->pagev[page_num];

		if (!page_bad->io_error)
			continue;

		for (mirror_index = 0;
		     mirror_index < BTRFS_MAX_MIRRORS &&
		     sblocks_for_recheck[mirror_index].page_count > 0;
		     mirror_index++) {
			struct scrub_block *sblock_other = sblocks_for_recheck +
							   mirror_index;
			struct scrub_page *page_other = sblock_other->pagev[
							page_num];

			if (!page_other->io_error) {
				ret = scrub_repair_page_from_good_copy(
					sblock_bad, sblock_other, page_num, 0);
				if (0 == ret) {
					page_bad->io_error = 0;
					break; /* succeeded for this page */
				}
			}
		}

		if (page_bad->io_error) {
			/* did not find a mirror to copy the page from */
			success = 0;
		}
	}

	if (success) {
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			scrub_recheck_block(fs_info, sblock_bad,
					    is_metadata, have_csum, csum,
					    generation, sctx->csum_size);
			if (!sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
			spin_unlock(&sctx->stat_lock);
			printk_ratelimited_in_rcu(KERN_ERR
				"btrfs: fixed up error at logical %llu on dev %s\n",
				(unsigned long long)logical,
				rcu_str_deref(dev->name));
		}
	} else {
did_not_correct_error:
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		printk_ratelimited_in_rcu(KERN_ERR
			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
			(unsigned long long)logical,
			rcu_str_deref(dev->name));
	}

out:
	if (sblocks_for_recheck) {
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			int page_index;

			for (page_index = 0; page_index < sblock->page_count;
			     page_index++) {
				sblock->pagev[page_index]->sblock = NULL;
				scrub_page_put(sblock->pagev[page_index]);
			}
		}
		kfree(sblocks_for_recheck);
	}

	return 0;
}

static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
				     struct btrfs_fs_info *fs_info,
				     struct scrub_block *original_sblock,
				     u64 length, u64 logical,
				     struct scrub_block *sblocks_for_recheck)
{
	int page_index;
	int mirror_index;
	int ret;

	/*
	 * note: the two members ref_count and outstanding_pages
	 * are not used (and not set) in the blocks that are used for
	 * the recheck procedure
	 */

	page_index = 0;
	while (length > 0) {
		u64 sublen = min_t(u64, length, PAGE_SIZE);
		u64 mapped_length = sublen;
		struct btrfs_bio *bbio = NULL;

		/*
		 * with a length of PAGE_SIZE, each returned stripe
		 * represents one mirror
		 */
		ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length,
				      &bbio, 0);
		if (ret || !bbio || mapped_length < sublen) {
			kfree(bbio);
			return -EIO;
		}

		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_page *page;

			if (mirror_index >= BTRFS_MAX_MIRRORS)
				continue;

			sblock = sblocks_for_recheck + mirror_index;
			sblock->sctx = sctx;
			page = kzalloc(sizeof(*page), GFP_NOFS);
			if (!page) {
leave_nomem:
				spin_lock(&sctx->stat_lock);
				sctx->stat.malloc_errors++;
				spin_unlock(&sctx->stat_lock);
				kfree(bbio);
				return -ENOMEM;
			}
			scrub_page_get(page);
			sblock->pagev[page_index] = page;
			page->logical = logical;
			page->physical = bbio->stripes[mirror_index].physical;
			BUG_ON(page_index >= original_sblock->page_count);
			page->physical_for_dev_replace =
				original_sblock->pagev[page_index]->
				physical_for_dev_replace;
			/* for missing devices, dev->bdev is NULL */
			page->dev = bbio->stripes[mirror_index].dev;
			page->mirror_num = mirror_index + 1;
			sblock->page_count++;
			page->page = alloc_page(GFP_NOFS);
			if (!page->page)
				goto leave_nomem;
		}
		kfree(bbio);
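		/*
		 * note: bbio was only needed to learn the per-mirror
		 * physical addresses of this PAGE_SIZE chunk; advance
		 * to the next chunk of the block.
		 */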
		length -= sublen;
		logical += sublen;
		page_index++;
	}

	return 0;
}

/*
 * this function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad. The goal is to enable scrub
 * to take those pages that are not errored from all the mirrors so that
 * the pages that are errored in the just handled mirror can be repaired.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size)
{
	int page_num;

	sblock->no_io_error_seen = 1;
	sblock->header_error = 0;
	sblock->checksum_error = 0;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct bio *bio;
		struct scrub_page *page = sblock->pagev[page_num];
		DECLARE_COMPLETION_ONSTACK(complete);

		if (page->dev->bdev == NULL) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		WARN_ON(!page->page);
		bio = bio_alloc(GFP_NOFS, 1);
		if (!bio) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}
		bio->bi_bdev = page->dev->bdev;
		bio->bi_sector = page->physical >> 9;
		bio->bi_end_io = scrub_complete_bio_end_io;
		bio->bi_private = &complete;

		bio_add_page(bio, page->page, PAGE_SIZE, 0);
		btrfsic_submit_bio(READ, bio);

		/* this will also unplug the queue */
		wait_for_completion(&complete);

		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			sblock->no_io_error_seen = 0;
		bio_put(bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
					     have_csum, csum, generation,
					     csum_size);

	return;
}

static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size)
{
	int page_num;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	struct btrfs_root *root = fs_info->extent_root;
	void *mapped_buffer;

	WARN_ON(!sblock->pagev[0]->page);
	if (is_metadata) {
		struct btrfs_header *h;

		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
		h = (struct btrfs_header *)mapped_buffer;

		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
			   BTRFS_UUID_SIZE)) {
			sblock->header_error = 1;
		} else if (generation != le64_to_cpu(h->generation)) {
			sblock->header_error = 1;
			sblock->generation_error = 1;
		}
		csum = h->csum;
	} else {
		if (!have_csum)
			return;

		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
	}

	for (page_num = 0;;) {
		if (page_num == 0 && is_metadata)
			crc = btrfs_csum_data(root,
				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
		else
			crc = btrfs_csum_data(root, mapped_buffer, crc,
					      PAGE_SIZE);

		kunmap_atomic(mapped_buffer);
		page_num++;
		if (page_num >= sblock->page_count)
			break;
		WARN_ON(!sblock->pagev[page_num]->page);

		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, csum, csum_size))
		sblock->checksum_error = 1;
}

static void scrub_complete_bio_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good,
					     int force_write)
{
	int page_num;
	int ret = 0;

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int ret_sub;

		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
							   sblock_good,
							   page_num,
							   force_write);
		if (ret_sub)
			ret = ret_sub;
	}

	return ret;
}

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write)
{
	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
	struct scrub_page *page_good = sblock_good->pagev[page_num];

	BUG_ON(page_bad->page == NULL);
	BUG_ON(page_good->page == NULL);
	if (force_write || sblock_bad->header_error ||
	    sblock_bad->checksum_error || page_bad->io_error) {
		struct bio *bio;
		int ret;
		DECLARE_COMPLETION_ONSTACK(complete);

		if (!page_bad->dev->bdev) {
			printk_ratelimited(KERN_WARNING
				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
			return -EIO;
		}

		bio = bio_alloc(GFP_NOFS, 1);
		if (!bio)
			return -EIO;
		bio->bi_bdev = page_bad->dev->bdev;
		bio->bi_sector = page_bad->physical >> 9;
		bio->bi_end_io = scrub_complete_bio_end_io;
		bio->bi_private = &complete;

		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
		if (PAGE_SIZE != ret) {
			bio_put(bio);
			return -EIO;
		}
		btrfsic_submit_bio(WRITE, bio);

		/* this will also unplug the queue */
		wait_for_completion(&complete);
		if (!bio_flagged(bio, BIO_UPTODATE)) {
			btrfs_dev_stat_inc_and_print(page_bad->dev,
				BTRFS_DEV_STAT_WRITE_ERRS);
			btrfs_dev_replace_stats_inc(
				&sblock_bad->sctx->dev_root->fs_info->
				dev_replace.num_write_errors);
			bio_put(bio);
			return -EIO;
		}
		bio_put(bio);
	}

	return 0;
}

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
	int page_num;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		int ret;

		ret = scrub_write_page_to_dev_replace(sblock, page_num);
		if (ret)
			btrfs_dev_replace_stats_inc(
				&sblock->sctx->dev_root->fs_info->dev_replace.
				num_write_errors);
	}
}
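/*
 * note: if the page to be copied had a read error, its content is first
 * replaced with zeros below, so that no stale data ends up on the
 * dev-replace target device.
 */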
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num)
{
	struct scrub_page *spage = sblock->pagev[page_num];

	BUG_ON(spage->page == NULL);
	if (spage->io_error) {
		void *mapped_buffer = kmap_atomic(spage->page);

		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
		flush_dcache_page(spage->page);
		kunmap_atomic(mapped_buffer);
	}
	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage)
{
	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
	struct scrub_bio *sbio;
	int ret;

	mutex_lock(&wr_ctx->wr_lock);
again:
	if (!wr_ctx->wr_curr_bio) {
		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
					      GFP_NOFS);
		if (!wr_ctx->wr_curr_bio) {
			mutex_unlock(&wr_ctx->wr_lock);
			return -ENOMEM;
		}
		wr_ctx->wr_curr_bio->sctx = sctx;
		wr_ctx->wr_curr_bio->page_count = 0;
	}
	sbio = wr_ctx->wr_curr_bio;
	if (sbio->page_count == 0) {
		struct bio *bio;

		sbio->physical = spage->physical_for_dev_replace;
		sbio->logical = spage->logical;
		sbio->dev = wr_ctx->tgtdev;
		bio = sbio->bio;
		if (!bio) {
			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
			if (!bio) {
				mutex_unlock(&wr_ctx->wr_lock);
				return -ENOMEM;
			}
			sbio->bio = bio;
		}

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_wr_bio_end_io;
		bio->bi_bdev = sbio->dev->bdev;
		bio->bi_sector = sbio->physical >> 9;
		sbio->err = 0;
	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
		   spage->physical_for_dev_replace ||
		   sbio->logical + sbio->page_count * PAGE_SIZE !=
		   spage->logical) {
		scrub_wr_submit(sctx);
		goto again;
	}

	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
	if (ret != PAGE_SIZE) {
		if (sbio->page_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
			mutex_unlock(&wr_ctx->wr_lock);
			return -EIO;
		}
		scrub_wr_submit(sctx);
		goto again;
	}

	sbio->pagev[sbio->page_count] = spage;
	scrub_page_get(spage);
	sbio->page_count++;
	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
		scrub_wr_submit(sctx);
	mutex_unlock(&wr_ctx->wr_lock);

	return 0;
}

static void scrub_wr_submit(struct scrub_ctx *sctx)
{
	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
	struct scrub_bio *sbio;

	if (!wr_ctx->wr_curr_bio)
		return;

	sbio = wr_ctx->wr_curr_bio;
	wr_ctx->wr_curr_bio = NULL;
	WARN_ON(!sbio->bio->bi_bdev);
	scrub_pending_bio_inc(sctx);
	/*
	 * process all writes in a single worker thread. Then the block
	 * layer orders the requests before sending them to the driver
	 * which doubled the write performance on spinning disks when
	 * measured with Linux 3.5
	 */
	btrfsic_submit_bio(WRITE, sbio->bio);
}

static void scrub_wr_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	sbio->work.func = scrub_wr_bio_end_io_worker;
	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
}

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
	if (sbio->err) {
		struct btrfs_dev_replace *dev_replace =
			&sbio->sctx->dev_root->fs_info->dev_replace;

		for (i = 0; i < sbio->page_count; i++) {
			struct scrub_page *spage = sbio->pagev[i];

			spage->io_error = 1;
			btrfs_dev_replace_stats_inc(&dev_replace->
						    num_write_errors);
		}
	}

	for (i = 0; i < sbio->page_count; i++)
		scrub_page_put(sbio->pagev[i]);

	bio_put(sbio->bio);
	kfree(sbio);
	scrub_pending_bio_dec(sctx);
}

static int scrub_checksum(struct scrub_block *sblock)
{
	u64 flags;
	int ret;

	WARN_ON(sblock->page_count < 1);
	flags = sblock->pagev[0]->flags;
	ret = 0;
	if (flags & BTRFS_EXTENT_FLAG_DATA)
		ret = scrub_checksum_data(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		ret = scrub_checksum_tree_block(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
		(void)scrub_checksum_super(sblock);
	else
		WARN_ON(1);
	if (ret)
		scrub_handle_errored_block(sblock);

	return ret;
}

static int scrub_checksum_data(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	u8 csum[BTRFS_CSUM_SIZE];
	u8 *on_disk_csum;
	struct page *page;
	void *buffer;
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sctx->dev_root;
	u64 len;
	int index;

	BUG_ON(sblock->page_count < 1);
	if (!sblock->pagev[0]->have_csum)
		return 0;

	on_disk_csum = sblock->pagev[0]->csum;
	page = sblock->pagev[0]->page;
	buffer = kmap_atomic(page);

	len = sctx->sectorsize;
	index = 0;
	for (;;) {
		u64 l = min_t(u64, len, PAGE_SIZE);

		crc = btrfs_csum_data(root, buffer, crc, l);
		kunmap_atomic(buffer);
		len -= l;
		if (len == 0)
			break;
		index++;
		BUG_ON(index >= sblock->page_count);
		BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		buffer = kmap_atomic(page);
	}

	btrfs_csum_final(crc, csum);
	if (memcmp(csum, on_disk_csum, sctx->csum_size))
		fail = 1;

	return fail;
}
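/*
 * note: like scrub_checksum_data() above, the tree block and super block
 * helpers below accumulate the crc page by page, mapping one page at a
 * time with kmap_atomic(), because a block may span several pages in
 * pagev[].
 */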
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_header *h;
	struct btrfs_root *root = sctx->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	struct page *page;
	void *mapped_buffer;
	u64 mapped_size;
	void *p;
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;
	u64 len;
	int index;

	BUG_ON(sblock->page_count < 1);
	page = sblock->pagev[0]->page;
	mapped_buffer = kmap_atomic(page);
	h = (struct btrfs_header *)mapped_buffer;
	memcpy(on_disk_csum, h->csum, sctx->csum_size);

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */

	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
		++fail;

	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	WARN_ON(sctx->nodesize != sctx->leafsize);
	len = sctx->nodesize - BTRFS_CSUM_SIZE;
	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
	index = 0;
	for (;;) {
		u64 l = min_t(u64, len, mapped_size);

		crc = btrfs_csum_data(root, p, crc, l);
		kunmap_atomic(mapped_buffer);
		len -= l;
		if (len == 0)
			break;
		index++;
		BUG_ON(index >= sblock->page_count);
		BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		mapped_buffer = kmap_atomic(page);
		mapped_size = PAGE_SIZE;
		p = mapped_buffer;
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
		++crc_fail;

	return fail || crc_fail;
}

static int scrub_checksum_super(struct scrub_block *sblock)
{
	struct btrfs_super_block *s;
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_root *root = sctx->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	struct page *page;
	void *mapped_buffer;
	u64 mapped_size;
	void *p;
	u32 crc = ~(u32)0;
	int fail_gen = 0;
	int fail_cor = 0;
	u64 len;
	int index;

	BUG_ON(sblock->page_count < 1);
	page = sblock->pagev[0]->page;
	mapped_buffer = kmap_atomic(page);
	s = (struct btrfs_super_block *)mapped_buffer;
	memcpy(on_disk_csum, s->csum, sctx->csum_size);

	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
		++fail_cor;

	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
		++fail_gen;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail_cor;

	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
	index = 0;
	for (;;) {
		u64 l = min_t(u64, len, mapped_size);

		crc = btrfs_csum_data(root, p, crc, l);
		kunmap_atomic(mapped_buffer);
		len -= l;
		if (len == 0)
			break;
		index++;
		BUG_ON(index >= sblock->page_count);
		BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		mapped_buffer = kmap_atomic(page);
		mapped_size = PAGE_SIZE;
		p = mapped_buffer;
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
		++fail_cor;

	if (fail_cor + fail_gen) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		if (fail_cor)
			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
	}

	return fail_cor + fail_gen;
}

static void scrub_block_get(struct scrub_block *sblock)
{
	atomic_inc(&sblock->ref_count);
}

static void scrub_block_put(struct scrub_block *sblock)
{
	if (atomic_dec_and_test(&sblock->ref_count)) {
		int i;

		for (i = 0; i < sblock->page_count; i++)
			scrub_page_put(sblock->pagev[i]);
		kfree(sblock);
	}
}

static void scrub_page_get(struct scrub_page *spage)
{
	atomic_inc(&spage->ref_count);
}

static void scrub_page_put(struct scrub_page *spage)
{
	if (atomic_dec_and_test(&spage->ref_count)) {
		if (spage->page)
			__free_page(spage->page);
		kfree(spage);
	}
}

static void scrub_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (sctx->curr == -1)
		return;

	sbio = sctx->bios[sctx->curr];
	sctx->curr = -1;
	scrub_pending_bio_inc(sctx);

	if (!sbio->bio->bi_bdev) {
		/*
		 * this case should not happen. If btrfs_map_block() is
		 * wrong, it could happen for dev-replace operations on
		 * missing devices when no mirrors are available, but in
		 * this case it should already fail the mount.
		 * This case is handled correctly (but _very_ slowly).
		 */
		printk_ratelimited(KERN_WARNING
			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
		bio_endio(sbio->bio, -EIO);
	} else {
		btrfsic_submit_bio(READ, sbio->bio);
	}
}
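/*
 * note: the function below batches pages into the current read bio as long
 * as they are physically and logically contiguous and belong to the same
 * device; otherwise, or when the bio is full, the bio is submitted and a
 * fresh one is started.
 */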
(ret != PAGE_SIZE) { 1938 if (sbio->page_count < 1) { 1939 bio_put(sbio->bio); 1940 sbio->bio = NULL; 1941 return -EIO; 1942 } 1943 scrub_submit(sctx); 1944 goto again; 1945 } 1946 1947 scrub_block_get(sblock); /* one for the page added to the bio */ 1948 atomic_inc(&sblock->outstanding_pages); 1949 sbio->page_count++; 1950 if (sbio->page_count == sctx->pages_per_rd_bio) 1951 scrub_submit(sctx); 1952 1953 return 0; 1954} 1955 1956static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 1957 u64 physical, struct btrfs_device *dev, u64 flags, 1958 u64 gen, int mirror_num, u8 *csum, int force, 1959 u64 physical_for_dev_replace) 1960{ 1961 struct scrub_block *sblock; 1962 int index; 1963 1964 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1965 if (!sblock) { 1966 spin_lock(&sctx->stat_lock); 1967 sctx->stat.malloc_errors++; 1968 spin_unlock(&sctx->stat_lock); 1969 return -ENOMEM; 1970 } 1971 1972 /* one ref inside this function, plus one for each page added to 1973 * a bio later on */ 1974 atomic_set(&sblock->ref_count, 1); 1975 sblock->sctx = sctx; 1976 sblock->no_io_error_seen = 1; 1977 1978 for (index = 0; len > 0; index++) { 1979 struct scrub_page *spage; 1980 u64 l = min_t(u64, len, PAGE_SIZE); 1981 1982 spage = kzalloc(sizeof(*spage), GFP_NOFS); 1983 if (!spage) { 1984leave_nomem: 1985 spin_lock(&sctx->stat_lock); 1986 sctx->stat.malloc_errors++; 1987 spin_unlock(&sctx->stat_lock); 1988 scrub_block_put(sblock); 1989 return -ENOMEM; 1990 } 1991 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1992 scrub_page_get(spage); 1993 sblock->pagev[index] = spage; 1994 spage->sblock = sblock; 1995 spage->dev = dev; 1996 spage->flags = flags; 1997 spage->generation = gen; 1998 spage->logical = logical; 1999 spage->physical = physical; 2000 spage->physical_for_dev_replace = physical_for_dev_replace; 2001 spage->mirror_num = mirror_num; 2002 if (csum) { 2003 spage->have_csum = 1; 2004 memcpy(spage->csum, csum, sctx->csum_size); 2005 } else { 2006 spage->have_csum = 0; 2007 } 2008 sblock->page_count++; 2009 spage->page = alloc_page(GFP_NOFS); 2010 if (!spage->page) 2011 goto leave_nomem; 2012 len -= l; 2013 logical += l; 2014 physical += l; 2015 physical_for_dev_replace += l; 2016 } 2017 2018 WARN_ON(sblock->page_count == 0); 2019 for (index = 0; index < sblock->page_count; index++) { 2020 struct scrub_page *spage = sblock->pagev[index]; 2021 int ret; 2022 2023 ret = scrub_add_page_to_rd_bio(sctx, spage); 2024 if (ret) { 2025 scrub_block_put(sblock); 2026 return ret; 2027 } 2028 } 2029 2030 if (force) 2031 scrub_submit(sctx); 2032 2033 /* last one frees, either here or in bio completion for last page */ 2034 scrub_block_put(sblock); 2035 return 0; 2036} 2037 2038static void scrub_bio_end_io(struct bio *bio, int err) 2039{ 2040 struct scrub_bio *sbio = bio->bi_private; 2041 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 2042 2043 sbio->err = err; 2044 sbio->bio = bio; 2045 2046 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2047} 2048 2049static void scrub_bio_end_io_worker(struct btrfs_work *work) 2050{ 2051 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2052 struct scrub_ctx *sctx = sbio->sctx; 2053 int i; 2054 2055 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); 2056 if (sbio->err) { 2057 for (i = 0; i < sbio->page_count; i++) { 2058 struct scrub_page *spage = sbio->pagev[i]; 2059 2060 spage->io_error = 1; 2061 spage->sblock->no_io_error_seen = 0; 2062 } 2063 } 2064 2065 /* now complete the scrub_block items that have all pages completed */ 2066 
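	/* each page holds a reference on its scrub_block, taken in scrub_add_page_to_rd_bio(); the scrub_block_put() below drops it */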
for (i = 0; i < sbio->page_count; i++) { 2067 struct scrub_page *spage = sbio->pagev[i]; 2068 struct scrub_block *sblock = spage->sblock; 2069 2070 if (atomic_dec_and_test(&sblock->outstanding_pages)) 2071 scrub_block_complete(sblock); 2072 scrub_block_put(sblock); 2073 } 2074 2075 bio_put(sbio->bio); 2076 sbio->bio = NULL; 2077 spin_lock(&sctx->list_lock); 2078 sbio->next_free = sctx->first_free; 2079 sctx->first_free = sbio->index; 2080 spin_unlock(&sctx->list_lock); 2081 2082 if (sctx->is_dev_replace && 2083 atomic_read(&sctx->wr_ctx.flush_all_writes)) { 2084 mutex_lock(&sctx->wr_ctx.wr_lock); 2085 scrub_wr_submit(sctx); 2086 mutex_unlock(&sctx->wr_ctx.wr_lock); 2087 } 2088 2089 scrub_pending_bio_dec(sctx); 2090} 2091 2092static void scrub_block_complete(struct scrub_block *sblock) 2093{ 2094 if (!sblock->no_io_error_seen) { 2095 scrub_handle_errored_block(sblock); 2096 } else { 2097 /* 2098 * In the dev-replace case every block must end up on the 2099 * target device: if it has a checksum error it is written 2100 * there via the repair mechanism, otherwise it is written here. 2101 */ 2102 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 2103 scrub_write_block_to_dev_replace(sblock); 2104 } 2105} 2106 2107static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2108 u8 *csum) 2109{ 2110 struct btrfs_ordered_sum *sum = NULL; 2111 int ret = 0; 2112 unsigned long i; 2113 unsigned long num_sectors; 2114 2115 while (!list_empty(&sctx->csum_list)) { 2116 sum = list_first_entry(&sctx->csum_list, 2117 struct btrfs_ordered_sum, list); 2118 if (sum->bytenr > logical) 2119 return 0; 2120 if (sum->bytenr + sum->len > logical) 2121 break; 2122 2123 ++sctx->stat.csum_discards; 2124 list_del(&sum->list); 2125 kfree(sum); 2126 sum = NULL; 2127 } 2128 if (!sum) 2129 return 0; 2130 2131 num_sectors = sum->len / sctx->sectorsize; 2132 for (i = 0; i < num_sectors; ++i) { 2133 if (sum->sums[i].bytenr == logical) { 2134 memcpy(csum, &sum->sums[i].sum, sctx->csum_size); 2135 ret = 1; 2136 break; 2137 } 2138 } 2139 if (ret && i == num_sectors - 1) { 2140 list_del(&sum->list); 2141 kfree(sum); 2142 } 2143 return ret; 2144} 2145 2146/* scrub extent tries to collect up to 64 kB for each bio */ 2147static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, 2148 u64 physical, struct btrfs_device *dev, u64 flags, 2149 u64 gen, int mirror_num, u64 physical_for_dev_replace) 2150{ 2151 int ret; 2152 u8 csum[BTRFS_CSUM_SIZE]; 2153 u32 blocksize; 2154 2155 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2156 blocksize = sctx->sectorsize; 2157 spin_lock(&sctx->stat_lock); 2158 sctx->stat.data_extents_scrubbed++; 2159 sctx->stat.data_bytes_scrubbed += len; 2160 spin_unlock(&sctx->stat_lock); 2161 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2162 WARN_ON(sctx->nodesize != sctx->leafsize); 2163 blocksize = sctx->nodesize; 2164 spin_lock(&sctx->stat_lock); 2165 sctx->stat.tree_extents_scrubbed++; 2166 sctx->stat.tree_bytes_scrubbed += len; 2167 spin_unlock(&sctx->stat_lock); 2168 } else { 2169 blocksize = sctx->sectorsize; 2170 WARN_ON(1); 2171 } 2172 2173 while (len) { 2174 u64 l = min_t(u64, len, blocksize); 2175 int have_csum = 0; 2176 2177 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2178 /* push csums to sbio */ 2179 have_csum = scrub_find_csum(sctx, logical, l, csum); 2180 if (have_csum == 0) 2181 ++sctx->stat.no_csum; 2182 if (sctx->is_dev_replace && !have_csum) { 2183 ret = copy_nocow_pages(sctx, logical, l, 2184 mirror_num, 2185 physical_for_dev_replace); 2186 goto behind_scrub_pages; 2187 } 2188 } 2189 ret = scrub_pages(sctx, logical, l, physical,
dev, flags, gen, 2190 mirror_num, have_csum ? csum : NULL, 0, 2191 physical_for_dev_replace); 2192behind_scrub_pages: 2193 if (ret) 2194 return ret; 2195 len -= l; 2196 logical += l; 2197 physical += l; 2198 physical_for_dev_replace += l; 2199 } 2200 return 0; 2201} 2202 2203static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2204 struct map_lookup *map, 2205 struct btrfs_device *scrub_dev, 2206 int num, u64 base, u64 length, 2207 int is_dev_replace) 2208{ 2209 struct btrfs_path *path; 2210 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2211 struct btrfs_root *root = fs_info->extent_root; 2212 struct btrfs_root *csum_root = fs_info->csum_root; 2213 struct btrfs_extent_item *extent; 2214 struct blk_plug plug; 2215 u64 flags; 2216 int ret; 2217 int slot; 2218 int i; 2219 u64 nstripes; 2220 struct extent_buffer *l; 2221 struct btrfs_key key; 2222 u64 physical; 2223 u64 logical; 2224 u64 generation; 2225 int mirror_num; 2226 struct reada_control *reada1; 2227 struct reada_control *reada2; 2228 struct btrfs_key key_start; 2229 struct btrfs_key key_end; 2230 u64 increment = map->stripe_len; 2231 u64 offset; 2232 u64 extent_logical; 2233 u64 extent_physical; 2234 u64 extent_len; 2235 struct btrfs_device *extent_dev; 2236 int extent_mirror_num; 2237 2238 nstripes = length; 2239 offset = 0; 2240 do_div(nstripes, map->stripe_len); 2241 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2242 offset = map->stripe_len * num; 2243 increment = map->stripe_len * map->num_stripes; 2244 mirror_num = 1; 2245 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2246 int factor = map->num_stripes / map->sub_stripes; 2247 offset = map->stripe_len * (num / map->sub_stripes); 2248 increment = map->stripe_len * factor; 2249 mirror_num = num % map->sub_stripes + 1; 2250 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2251 increment = map->stripe_len; 2252 mirror_num = num % map->num_stripes + 1; 2253 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2254 increment = map->stripe_len; 2255 mirror_num = num % map->num_stripes + 1; 2256 } else { 2257 increment = map->stripe_len; 2258 mirror_num = 1; 2259 } 2260 2261 path = btrfs_alloc_path(); 2262 if (!path) 2263 return -ENOMEM; 2264 2265 /* 2266 * work on commit root. The related disk blocks are static as 2267 * long as COW is applied. This means it is safe to rewrite 2268 * them to repair disk errors without any race conditions 2269 */ 2270 path->search_commit_root = 1; 2271 path->skip_locking = 1; 2272 2273 /* 2274 * trigger the readahead for the extent tree and csum tree and wait 2275 * for completion.
During readahead, the scrub is officially paused 2276 * to not hold off transaction commits 2277 */ 2278 logical = base + offset; 2279 2280 wait_event(sctx->list_wait, 2281 atomic_read(&sctx->bios_in_flight) == 0); 2282 atomic_inc(&fs_info->scrubs_paused); 2283 wake_up(&fs_info->scrub_pause_wait); 2284 2285 /* FIXME it might be better to start readahead at commit root */ 2286 key_start.objectid = logical; 2287 key_start.type = BTRFS_EXTENT_ITEM_KEY; 2288 key_start.offset = (u64)0; 2289 key_end.objectid = base + offset + nstripes * increment; 2290 key_end.type = BTRFS_EXTENT_ITEM_KEY; 2291 key_end.offset = (u64)0; 2292 reada1 = btrfs_reada_add(root, &key_start, &key_end); 2293 2294 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 2295 key_start.type = BTRFS_EXTENT_CSUM_KEY; 2296 key_start.offset = logical; 2297 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 2298 key_end.type = BTRFS_EXTENT_CSUM_KEY; 2299 key_end.offset = base + offset + nstripes * increment; 2300 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 2301 2302 if (!IS_ERR(reada1)) 2303 btrfs_reada_wait(reada1); 2304 if (!IS_ERR(reada2)) 2305 btrfs_reada_wait(reada2); 2306 2307 mutex_lock(&fs_info->scrub_lock); 2308 while (atomic_read(&fs_info->scrub_pause_req)) { 2309 mutex_unlock(&fs_info->scrub_lock); 2310 wait_event(fs_info->scrub_pause_wait, 2311 atomic_read(&fs_info->scrub_pause_req) == 0); 2312 mutex_lock(&fs_info->scrub_lock); 2313 } 2314 atomic_dec(&fs_info->scrubs_paused); 2315 mutex_unlock(&fs_info->scrub_lock); 2316 wake_up(&fs_info->scrub_pause_wait); 2317 2318 /* 2319 * collect all data csums for the stripe to avoid seeking during 2320 * the scrub. This might currently (crc32) end up being about 1MB 2321 */ 2322 blk_start_plug(&plug); 2323 2324 /* 2325 * now find all extents for each stripe and scrub them 2326 */ 2327 logical = base + offset; 2328 physical = map->stripes[num].physical; 2329 ret = 0; 2330 for (i = 0; i < nstripes; ++i) { 2331 /* 2332 * canceled?
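 * a cancel request (from btrfs_scrub_cancel() or btrfs_scrub_cancel_dev()) is checked once per stripe, so cancellation takes effect at the next stripe boundary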
2333 */ 2334 if (atomic_read(&fs_info->scrub_cancel_req) || 2335 atomic_read(&sctx->cancel_req)) { 2336 ret = -ECANCELED; 2337 goto out; 2338 } 2339 /* 2340 * check to see if we have to pause 2341 */ 2342 if (atomic_read(&fs_info->scrub_pause_req)) { 2343 /* push queued extents */ 2344 atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 2345 scrub_submit(sctx); 2346 mutex_lock(&sctx->wr_ctx.wr_lock); 2347 scrub_wr_submit(sctx); 2348 mutex_unlock(&sctx->wr_ctx.wr_lock); 2349 wait_event(sctx->list_wait, 2350 atomic_read(&sctx->bios_in_flight) == 0); 2351 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2352 atomic_inc(&fs_info->scrubs_paused); 2353 wake_up(&fs_info->scrub_pause_wait); 2354 mutex_lock(&fs_info->scrub_lock); 2355 while (atomic_read(&fs_info->scrub_pause_req)) { 2356 mutex_unlock(&fs_info->scrub_lock); 2357 wait_event(fs_info->scrub_pause_wait, 2358 atomic_read(&fs_info->scrub_pause_req) == 0); 2359 mutex_lock(&fs_info->scrub_lock); 2360 } 2361 atomic_dec(&fs_info->scrubs_paused); 2362 mutex_unlock(&fs_info->scrub_lock); 2363 wake_up(&fs_info->scrub_pause_wait); 2364 } 2365 2366 ret = btrfs_lookup_csums_range(csum_root, logical, 2367 logical + map->stripe_len - 1, 2368 &sctx->csum_list, 1); 2369 if (ret) 2370 goto out; 2371 2372 key.objectid = logical; 2373 key.type = BTRFS_EXTENT_ITEM_KEY; 2374 key.offset = (u64)0; 2375 2376 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2377 if (ret < 0) 2378 goto out; 2379 if (ret > 0) { 2380 ret = btrfs_previous_item(root, path, 0, 2381 BTRFS_EXTENT_ITEM_KEY); 2382 if (ret < 0) 2383 goto out; 2384 if (ret > 0) { 2385 /* there's no smaller item, so stick with the 2386 * larger one */ 2387 btrfs_release_path(path); 2388 ret = btrfs_search_slot(NULL, root, &key, 2389 path, 0, 0); 2390 if (ret < 0) 2391 goto out; 2392 } 2393 } 2394 2395 while (1) { 2396 l = path->nodes[0]; 2397 slot = path->slots[0]; 2398 if (slot >= btrfs_header_nritems(l)) { 2399 ret = btrfs_next_leaf(root, path); 2400 if (ret == 0) 2401 continue; 2402 if (ret < 0) 2403 goto out; 2404 2405 break; 2406 } 2407 btrfs_item_key_to_cpu(l, &key, slot); 2408 2409 if (key.objectid + key.offset <= logical) 2410 goto next; 2411 2412 if (key.objectid >= logical + map->stripe_len) 2413 break; 2414 2415 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) 2416 goto next; 2417 2418 extent = btrfs_item_ptr(l, slot, 2419 struct btrfs_extent_item); 2420 flags = btrfs_extent_flags(l, extent); 2421 generation = btrfs_extent_generation(l, extent); 2422 2423 if (key.objectid < logical && 2424 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { 2425 printk(KERN_ERR 2426 "btrfs scrub: tree block %llu spanning " 2427 "stripes, ignored. 
logical=%llu\n", 2428 (unsigned long long)key.objectid, 2429 (unsigned long long)logical); 2430 goto next; 2431 } 2432 2433 /* 2434 * trim extent to this stripe 2435 */ 2436 if (key.objectid < logical) { 2437 key.offset -= logical - key.objectid; 2438 key.objectid = logical; 2439 } 2440 if (key.objectid + key.offset > 2441 logical + map->stripe_len) { 2442 key.offset = logical + map->stripe_len - 2443 key.objectid; 2444 } 2445 2446 extent_logical = key.objectid; 2447 extent_physical = key.objectid - logical + physical; 2448 extent_len = key.offset; 2449 extent_dev = scrub_dev; 2450 extent_mirror_num = mirror_num; 2451 if (is_dev_replace) 2452 scrub_remap_extent(fs_info, extent_logical, 2453 extent_len, &extent_physical, 2454 &extent_dev, 2455 &extent_mirror_num); 2456 ret = scrub_extent(sctx, extent_logical, extent_len, 2457 extent_physical, extent_dev, flags, 2458 generation, extent_mirror_num, 2459 key.objectid - logical + physical); 2460 if (ret) 2461 goto out; 2462 2463next: 2464 path->slots[0]++; 2465 } 2466 btrfs_release_path(path); 2467 logical += increment; 2468 physical += map->stripe_len; 2469 spin_lock(&sctx->stat_lock); 2470 sctx->stat.last_physical = physical; 2471 spin_unlock(&sctx->stat_lock); 2472 } 2473out: 2474 /* push queued extents */ 2475 scrub_submit(sctx); 2476 mutex_lock(&sctx->wr_ctx.wr_lock); 2477 scrub_wr_submit(sctx); 2478 mutex_unlock(&sctx->wr_ctx.wr_lock); 2479 2480 blk_finish_plug(&plug); 2481 btrfs_free_path(path); 2482 return ret < 0 ? ret : 0; 2483} 2484 2485static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2486 struct btrfs_device *scrub_dev, 2487 u64 chunk_tree, u64 chunk_objectid, 2488 u64 chunk_offset, u64 length, 2489 u64 dev_offset, int is_dev_replace) 2490{ 2491 struct btrfs_mapping_tree *map_tree = 2492 &sctx->dev_root->fs_info->mapping_tree; 2493 struct map_lookup *map; 2494 struct extent_map *em; 2495 int i; 2496 int ret = 0; 2497 2498 read_lock(&map_tree->map_tree.lock); 2499 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2500 read_unlock(&map_tree->map_tree.lock); 2501 2502 if (!em) 2503 return -EINVAL; 2504 2505 map = (struct map_lookup *)em->bdev; 2506 if (em->start != chunk_offset) 2507 goto out; 2508 2509 if (em->len < length) 2510 goto out; 2511 2512 for (i = 0; i < map->num_stripes; ++i) { 2513 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2514 map->stripes[i].physical == dev_offset) { 2515 ret = scrub_stripe(sctx, map, scrub_dev, i, 2516 chunk_offset, length, 2517 is_dev_replace); 2518 if (ret) 2519 goto out; 2520 } 2521 } 2522out: 2523 free_extent_map(em); 2524 2525 return ret; 2526} 2527 2528static noinline_for_stack 2529int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2530 struct btrfs_device *scrub_dev, u64 start, u64 end, 2531 int is_dev_replace) 2532{ 2533 struct btrfs_dev_extent *dev_extent = NULL; 2534 struct btrfs_path *path; 2535 struct btrfs_root *root = sctx->dev_root; 2536 struct btrfs_fs_info *fs_info = root->fs_info; 2537 u64 length; 2538 u64 chunk_tree; 2539 u64 chunk_objectid; 2540 u64 chunk_offset; 2541 int ret; 2542 int slot; 2543 struct extent_buffer *l; 2544 struct btrfs_key key; 2545 struct btrfs_key found_key; 2546 struct btrfs_block_group_cache *cache; 2547 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2548 2549 path = btrfs_alloc_path(); 2550 if (!path) 2551 return -ENOMEM; 2552 2553 path->reada = 2; 2554 path->search_commit_root = 1; 2555 path->skip_locking = 1; 2556 2557 key.objectid = scrub_dev->devid; 2558 key.offset = 0ull; 2559 key.type = 
BTRFS_DEV_EXTENT_KEY; 2560 2561 while (1) { 2562 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2563 if (ret < 0) 2564 break; 2565 if (ret > 0) { 2566 if (path->slots[0] >= 2567 btrfs_header_nritems(path->nodes[0])) { 2568 ret = btrfs_next_leaf(root, path); 2569 if (ret) 2570 break; 2571 } 2572 } 2573 2574 l = path->nodes[0]; 2575 slot = path->slots[0]; 2576 2577 btrfs_item_key_to_cpu(l, &found_key, slot); 2578 2579 if (found_key.objectid != scrub_dev->devid) 2580 break; 2581 2582 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2583 break; 2584 2585 if (found_key.offset >= end) 2586 break; 2587 2588 if (found_key.offset < key.offset) 2589 break; 2590 2591 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2592 length = btrfs_dev_extent_length(l, dev_extent); 2593 2594 if (found_key.offset + length <= start) { 2595 key.offset = found_key.offset + length; 2596 btrfs_release_path(path); 2597 continue; 2598 } 2599 2600 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2601 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2602 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2603 2604 /* 2605 * get a reference on the corresponding block group to prevent 2606 * the chunk from going away while we scrub it 2607 */ 2608 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2609 if (!cache) { 2610 ret = -ENOENT; 2611 break; 2612 } 2613 dev_replace->cursor_right = found_key.offset + length; 2614 dev_replace->cursor_left = found_key.offset; 2615 dev_replace->item_needs_writeback = 1; 2616 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, 2617 chunk_offset, length, found_key.offset, 2618 is_dev_replace); 2619 2620 /* 2621 * Flush and submit all pending read and write bios, then wait 2622 * for them to complete. 2623 * Note that in the dev-replace case, a read request causes 2624 * write requests that are submitted from the read completion 2625 * worker. Therefore all write requests must be flushed, so 2626 * that all read and write requests are really completed when 2627 * bios_in_flight 2628 * changes to 0.
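 * (While waiting, the scrub also accounts itself as paused via
 * scrubs_paused, so that e.g. a pending transaction commit is not
 * held off between two chunks.)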
2629 */ 2630 atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 2631 scrub_submit(sctx); 2632 mutex_lock(&sctx->wr_ctx.wr_lock); 2633 scrub_wr_submit(sctx); 2634 mutex_unlock(&sctx->wr_ctx.wr_lock); 2635 2636 wait_event(sctx->list_wait, 2637 atomic_read(&sctx->bios_in_flight) == 0); 2638 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2639 atomic_inc(&fs_info->scrubs_paused); 2640 wake_up(&fs_info->scrub_pause_wait); 2641 wait_event(sctx->list_wait, 2642 atomic_read(&sctx->workers_pending) == 0); 2643 2644 mutex_lock(&fs_info->scrub_lock); 2645 while (atomic_read(&fs_info->scrub_pause_req)) { 2646 mutex_unlock(&fs_info->scrub_lock); 2647 wait_event(fs_info->scrub_pause_wait, 2648 atomic_read(&fs_info->scrub_pause_req) == 0); 2649 mutex_lock(&fs_info->scrub_lock); 2650 } 2651 atomic_dec(&fs_info->scrubs_paused); 2652 mutex_unlock(&fs_info->scrub_lock); 2653 wake_up(&fs_info->scrub_pause_wait); 2654 2655 dev_replace->cursor_left = dev_replace->cursor_right; 2656 dev_replace->item_needs_writeback = 1; 2657 btrfs_put_block_group(cache); 2658 if (ret) 2659 break; 2660 if (atomic64_read(&dev_replace->num_write_errors) > 0) { 2661 ret = -EIO; 2662 break; 2663 } 2664 if (sctx->stat.malloc_errors > 0) { 2665 ret = -ENOMEM; 2666 break; 2667 } 2668 2669 key.offset = found_key.offset + length; 2670 btrfs_release_path(path); 2671 } 2672 2673 btrfs_free_path(path); 2674 2675 /* 2676 * ret can still be 1 from search_slot or next_leaf, 2677 * that's not an error 2678 */ 2679 return ret < 0 ? ret : 0; 2680} 2681 2682static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2683 struct btrfs_device *scrub_dev) 2684{ 2685 int i; 2686 u64 bytenr; 2687 u64 gen; 2688 int ret; 2689 struct btrfs_root *root = sctx->dev_root; 2690 2691 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2692 return -EIO; 2693 2694 gen = root->fs_info->last_trans_committed; 2695 2696 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2697 bytenr = btrfs_sb_offset(i); 2698 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2699 break; 2700 2701 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2702 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 2703 NULL, 1, bytenr); 2704 if (ret) 2705 return ret; 2706 } 2707 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 2708 2709 return 0; 2710} 2711 2712/* 2713 * get a reference count on fs_info->scrub_workers. 
start workers if necessary 2714 */ 2715static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 2716 int is_dev_replace) 2717{ 2718 int ret = 0; 2719 2720 mutex_lock(&fs_info->scrub_lock); 2721 if (fs_info->scrub_workers_refcnt == 0) { 2722 if (is_dev_replace) 2723 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2724 &fs_info->generic_worker); 2725 else 2726 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2727 fs_info->thread_pool_size, 2728 &fs_info->generic_worker); 2729 fs_info->scrub_workers.idle_thresh = 4; 2730 ret = btrfs_start_workers(&fs_info->scrub_workers); 2731 if (ret) 2732 goto out; 2733 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2734 "scrubwrc", 2735 fs_info->thread_pool_size, 2736 &fs_info->generic_worker); 2737 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2738 ret = btrfs_start_workers( 2739 &fs_info->scrub_wr_completion_workers); 2740 if (ret) 2741 goto out; 2742 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2743 &fs_info->generic_worker); 2744 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2745 if (ret) 2746 goto out; 2747 } 2748 ++fs_info->scrub_workers_refcnt; 2749out: 2750 mutex_unlock(&fs_info->scrub_lock); 2751 2752 return ret; 2753} 2754 2755static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2756{ 2757 mutex_lock(&fs_info->scrub_lock); 2758 if (--fs_info->scrub_workers_refcnt == 0) { 2759 btrfs_stop_workers(&fs_info->scrub_workers); 2760 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2761 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2762 } 2763 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2764 mutex_unlock(&fs_info->scrub_lock); 2765} 2766 2767int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 2768 u64 end, struct btrfs_scrub_progress *progress, 2769 int readonly, int is_dev_replace) 2770{ 2771 struct scrub_ctx *sctx; 2772 int ret; 2773 struct btrfs_device *dev; 2774 2775 if (btrfs_fs_closing(fs_info)) 2776 return -EINVAL; 2777 2778 /* 2779 * check some assumptions 2780 */ 2781 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { 2782 printk(KERN_ERR 2783 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2784 fs_info->chunk_root->nodesize, 2785 fs_info->chunk_root->leafsize); 2786 return -EINVAL; 2787 } 2788 2789 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2790 /* 2791 * the way scrub is implemented, it is unable to calculate 2792 * the checksum in this case. Do not handle this situation 2793 * at all because it should never happen.
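 * (Scrub processes one stripe at a time; a tree block larger than
 * BTRFS_STRIPE_LEN would always cross a stripe boundary and could not
 * be read and checksummed as one unit, cf. the "tree block spanning
 * stripes, ignored" message in scrub_stripe().)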
2794 */ 2795 printk(KERN_ERR 2796 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2797 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); 2798 return -EINVAL; 2799 } 2800 2801 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { 2802 /* not supported for data w/o checksums */ 2803 printk(KERN_ERR 2804 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2805 fs_info->chunk_root->sectorsize, 2806 (unsigned long long)PAGE_SIZE); 2807 return -EINVAL; 2808 } 2809 2810 if (fs_info->chunk_root->nodesize > 2811 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || 2812 fs_info->chunk_root->sectorsize > 2813 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { 2814 /* 2815 * would exhaust the array bounds of pagev member in 2816 * struct scrub_block 2817 */ 2818 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", 2819 fs_info->chunk_root->nodesize, 2820 SCRUB_MAX_PAGES_PER_BLOCK, 2821 fs_info->chunk_root->sectorsize, 2822 SCRUB_MAX_PAGES_PER_BLOCK); 2823 return -EINVAL; 2824 } 2825 2826 ret = scrub_workers_get(fs_info, is_dev_replace); 2827 if (ret) 2828 return ret; 2829 2830 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2831 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 2832 if (!dev || (dev->missing && !is_dev_replace)) { 2833 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2834 scrub_workers_put(fs_info); 2835 return -ENODEV; 2836 } 2837 mutex_lock(&fs_info->scrub_lock); 2838 2839 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2840 mutex_unlock(&fs_info->scrub_lock); 2841 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2842 scrub_workers_put(fs_info); 2843 return -EIO; 2844 } 2845 2846 if (dev->scrub_device) { 2847 mutex_unlock(&fs_info->scrub_lock); 2848 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2849 scrub_workers_put(fs_info); 2850 return -EINPROGRESS; 2851 } 2852 sctx = scrub_setup_ctx(dev, is_dev_replace); 2853 if (IS_ERR(sctx)) { 2854 mutex_unlock(&fs_info->scrub_lock); 2855 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2856 scrub_workers_put(fs_info); 2857 return PTR_ERR(sctx); 2858 } 2859 sctx->readonly = readonly; 2860 dev->scrub_device = sctx; 2861 2862 atomic_inc(&fs_info->scrubs_running); 2863 mutex_unlock(&fs_info->scrub_lock); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2865 2866 if (!is_dev_replace) { 2867 down_read(&fs_info->scrub_super_lock); 2868 ret = scrub_supers(sctx, dev); 2869 up_read(&fs_info->scrub_super_lock); 2870 } 2871 2872 if (!ret) 2873 ret = scrub_enumerate_chunks(sctx, dev, start, end, 2874 is_dev_replace); 2875 2876 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 2877 atomic_dec(&fs_info->scrubs_running); 2878 wake_up(&fs_info->scrub_pause_wait); 2879 2880 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); 2881 2882 if (progress) 2883 memcpy(progress, &sctx->stat, sizeof(*progress)); 2884 2885 mutex_lock(&fs_info->scrub_lock); 2886 dev->scrub_device = NULL; 2887 mutex_unlock(&fs_info->scrub_lock); 2888 2889 scrub_free_ctx(sctx); 2890 scrub_workers_put(fs_info); 2891 2892 return ret; 2893} 2894 2895void btrfs_scrub_pause(struct btrfs_root *root) 2896{ 2897 struct btrfs_fs_info *fs_info = root->fs_info; 2898 2899 mutex_lock(&fs_info->scrub_lock); 2900 atomic_inc(&fs_info->scrub_pause_req); 2901 while (atomic_read(&fs_info->scrubs_paused) != 2902 atomic_read(&fs_info->scrubs_running)) { 2903 mutex_unlock(&fs_info->scrub_lock); 2904 
wait_event(fs_info->scrub_pause_wait, 2905 atomic_read(&fs_info->scrubs_paused) == 2906 atomic_read(&fs_info->scrubs_running)); 2907 mutex_lock(&fs_info->scrub_lock); 2908 } 2909 mutex_unlock(&fs_info->scrub_lock); 2910} 2911 2912void btrfs_scrub_continue(struct btrfs_root *root) 2913{ 2914 struct btrfs_fs_info *fs_info = root->fs_info; 2915 2916 atomic_dec(&fs_info->scrub_pause_req); 2917 wake_up(&fs_info->scrub_pause_wait); 2918} 2919 2920void btrfs_scrub_pause_super(struct btrfs_root *root) 2921{ 2922 down_write(&root->fs_info->scrub_super_lock); 2923} 2924 2925void btrfs_scrub_continue_super(struct btrfs_root *root) 2926{ 2927 up_write(&root->fs_info->scrub_super_lock); 2928} 2929 2930int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2931{ 2932 mutex_lock(&fs_info->scrub_lock); 2933 if (!atomic_read(&fs_info->scrubs_running)) { 2934 mutex_unlock(&fs_info->scrub_lock); 2935 return -ENOTCONN; 2936 } 2937 2938 atomic_inc(&fs_info->scrub_cancel_req); 2939 while (atomic_read(&fs_info->scrubs_running)) { 2940 mutex_unlock(&fs_info->scrub_lock); 2941 wait_event(fs_info->scrub_pause_wait, 2942 atomic_read(&fs_info->scrubs_running) == 0); 2943 mutex_lock(&fs_info->scrub_lock); 2944 } 2945 atomic_dec(&fs_info->scrub_cancel_req); 2946 mutex_unlock(&fs_info->scrub_lock); 2947 2948 return 0; 2949} 2950 2951int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, 2952 struct btrfs_device *dev) 2953{ 2954 struct scrub_ctx *sctx; 2955 2956 mutex_lock(&fs_info->scrub_lock); 2957 sctx = dev->scrub_device; 2958 if (!sctx) { 2959 mutex_unlock(&fs_info->scrub_lock); 2960 return -ENOTCONN; 2961 } 2962 atomic_inc(&sctx->cancel_req); 2963 while (dev->scrub_device) { 2964 mutex_unlock(&fs_info->scrub_lock); 2965 wait_event(fs_info->scrub_pause_wait, 2966 dev->scrub_device == NULL); 2967 mutex_lock(&fs_info->scrub_lock); 2968 } 2969 mutex_unlock(&fs_info->scrub_lock); 2970 2971 return 0; 2972} 2973 2974int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) 2975{ 2976 struct btrfs_fs_info *fs_info = root->fs_info; 2977 struct btrfs_device *dev; 2978 int ret; 2979 2980 /* 2981 * we have to hold the device_list_mutex here so the device 2982 * does not go away in cancel_dev. FIXME: find a better solution 2983 */ 2984 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2985 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 2986 if (!dev) { 2987 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2988 return -ENODEV; 2989 } 2990 ret = btrfs_scrub_cancel_dev(fs_info, dev); 2991 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2992 2993 return ret; 2994} 2995 2996int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2997 struct btrfs_scrub_progress *progress) 2998{ 2999 struct btrfs_device *dev; 3000 struct scrub_ctx *sctx = NULL; 3001 3002 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3003 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); 3004 if (dev) 3005 sctx = dev->scrub_device; 3006 if (sctx) 3007 memcpy(progress, &sctx->stat, sizeof(*progress)); 3008 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3009 3010 return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; 3011} 3012 3013static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 3014 u64 extent_logical, u64 extent_len, 3015 u64 *extent_physical, 3016 struct btrfs_device **extent_dev, 3017 int *extent_mirror_num) 3018{ 3019 u64 mapped_length; 3020 struct btrfs_bio *bbio = NULL; 3021 int ret; 3022 3023 mapped_length = extent_len; 3024 ret = btrfs_map_block(fs_info, READ, extent_logical, 3025 &mapped_length, &bbio, 0); 3026 if (ret || !bbio || mapped_length < extent_len || 3027 !bbio->stripes[0].dev->bdev) { 3028 kfree(bbio); 3029 return; 3030 } 3031 3032 *extent_physical = bbio->stripes[0].physical; 3033 *extent_mirror_num = bbio->mirror_num; 3034 *extent_dev = bbio->stripes[0].dev; 3035 kfree(bbio); 3036} 3037 3038static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 3039 struct scrub_wr_ctx *wr_ctx, 3040 struct btrfs_fs_info *fs_info, 3041 struct btrfs_device *dev, 3042 int is_dev_replace) 3043{ 3044 WARN_ON(wr_ctx->wr_curr_bio != NULL); 3045 3046 mutex_init(&wr_ctx->wr_lock); 3047 wr_ctx->wr_curr_bio = NULL; 3048 if (!is_dev_replace) 3049 return 0; 3050 3051 WARN_ON(!dev->bdev); 3052 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, 3053 bio_get_nr_vecs(dev->bdev)); 3054 wr_ctx->tgtdev = dev; 3055 atomic_set(&wr_ctx->flush_all_writes, 0); 3056 return 0; 3057} 3058 3059static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) 3060{ 3061 mutex_lock(&wr_ctx->wr_lock); 3062 kfree(wr_ctx->wr_curr_bio); 3063 wr_ctx->wr_curr_bio = NULL; 3064 mutex_unlock(&wr_ctx->wr_lock); 3065} 3066 3067static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 3068 int mirror_num, u64 physical_for_dev_replace) 3069{ 3070 struct scrub_copy_nocow_ctx *nocow_ctx; 3071 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 3072 3073 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); 3074 if (!nocow_ctx) { 3075 spin_lock(&sctx->stat_lock); 3076 sctx->stat.malloc_errors++; 3077 spin_unlock(&sctx->stat_lock); 3078 return -ENOMEM; 3079 } 3080 3081 scrub_pending_trans_workers_inc(sctx); 3082 3083 nocow_ctx->sctx = sctx; 3084 nocow_ctx->logical = logical; 3085 nocow_ctx->len = len; 3086 nocow_ctx->mirror_num = mirror_num; 3087 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3088 nocow_ctx->work.func = copy_nocow_pages_worker; 3089 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3090 &nocow_ctx->work); 3091 3092 return 0; 3093} 3094 3095static void copy_nocow_pages_worker(struct btrfs_work *work) 3096{ 3097 struct scrub_copy_nocow_ctx *nocow_ctx = 3098 container_of(work, struct scrub_copy_nocow_ctx, work); 3099 struct scrub_ctx *sctx = nocow_ctx->sctx; 3100 u64 logical = nocow_ctx->logical; 3101 u64 len = nocow_ctx->len; 3102 int mirror_num = nocow_ctx->mirror_num; 3103 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3104 int ret; 3105 struct btrfs_trans_handle *trans = NULL; 3106 struct btrfs_fs_info *fs_info; 3107 struct btrfs_path *path; 3108 struct btrfs_root *root; 3109 int not_written = 0; 3110 3111 fs_info = sctx->dev_root->fs_info; 3112 root = fs_info->extent_root; 3113 3114 path = btrfs_alloc_path(); 3115 if (!path) { 3116 spin_lock(&sctx->stat_lock); 3117 sctx->stat.malloc_errors++; 3118 spin_unlock(&sctx->stat_lock); 3119 not_written = 1; 3120 goto out; 3121 } 3122 3123 trans = btrfs_join_transaction(root); 3124 if (IS_ERR(trans)) { 3125 not_written = 1; 3126 goto out; 3127 } 3128 3129 ret = iterate_inodes_from_logical(logical, fs_info, path, 3130 copy_nocow_pages_for_inode, 3131 nocow_ctx); 3132 if (ret != 0 && ret 
!= -ENOENT) { 3133 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", 3134 (unsigned long long)logical, 3135 (unsigned long long)physical_for_dev_replace, 3136 (unsigned long long)len, 3137 (unsigned long long)mirror_num, ret); 3138 not_written = 1; 3139 goto out; 3140 } 3141 3142out: 3143 if (trans && !IS_ERR(trans)) 3144 btrfs_end_transaction(trans, root); 3145 if (not_written) 3146 btrfs_dev_replace_stats_inc(&fs_info->dev_replace. 3147 num_uncorrectable_read_errors); 3148 3149 btrfs_free_path(path); 3150 kfree(nocow_ctx); 3151 3152 scrub_pending_trans_workers_dec(sctx); 3153} 3154 3155static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3156{ 3157 unsigned long index; 3158 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3159 int ret = 0; 3160 struct btrfs_key key; 3161 struct inode *inode = NULL; 3162 struct btrfs_root *local_root; 3163 u64 physical_for_dev_replace; 3164 u64 len; 3165 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3166 3167 key.objectid = root; 3168 key.type = BTRFS_ROOT_ITEM_KEY; 3169 key.offset = (u64)-1; 3170 local_root = btrfs_read_fs_root_no_name(fs_info, &key); 3171 if (IS_ERR(local_root)) 3172 return PTR_ERR(local_root); 3173 3174 key.type = BTRFS_INODE_ITEM_KEY; 3175 key.objectid = inum; 3176 key.offset = 0; 3177 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 3178 if (IS_ERR(inode)) 3179 return PTR_ERR(inode); 3180 3181 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3182 len = nocow_ctx->len; 3183 while (len >= PAGE_CACHE_SIZE) { 3184 struct page *page = NULL; 3185 int ret_sub; 3186 3187 index = offset >> PAGE_CACHE_SHIFT; 3188 3189 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3190 if (!page) { 3191 pr_err("find_or_create_page() failed\n"); 3192 ret = -ENOMEM; 3193 goto next_page; 3194 } 3195 3196 if (PageUptodate(page)) { 3197 if (PageDirty(page)) 3198 goto next_page; 3199 } else { 3200 ClearPageError(page); 3201 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3202 io_tree, 3203 page, btrfs_get_extent, 3204 nocow_ctx->mirror_num); 3205 if (ret_sub) { 3206 ret = ret_sub; 3207 goto next_page; 3208 } 3209 wait_on_page_locked(page); 3210 if (!PageUptodate(page)) { 3211 ret = -EIO; 3212 goto next_page; 3213 } 3214 } 3215 ret_sub = write_page_nocow(nocow_ctx->sctx, 3216 physical_for_dev_replace, page); 3217 if (ret_sub) { 3218 ret = ret_sub; 3219 goto next_page; 3220 } 3221 3222next_page: 3223 if (page) { 3224 unlock_page(page); 3225 put_page(page); 3226 } 3227 offset += PAGE_CACHE_SIZE; 3228 physical_for_dev_replace += PAGE_CACHE_SIZE; 3229 len -= PAGE_CACHE_SIZE; 3230 } 3231 3232 if (inode) 3233 iput(inode); 3234 return ret; 3235} 3236 3237static int write_page_nocow(struct scrub_ctx *sctx, 3238 u64 physical_for_dev_replace, struct page *page) 3239{ 3240 struct bio *bio; 3241 struct btrfs_device *dev; 3242 int ret; 3243 DECLARE_COMPLETION_ONSTACK(compl); 3244 3245 dev = sctx->wr_ctx.tgtdev; 3246 if (!dev) 3247 return -EIO; 3248 if (!dev->bdev) { 3249 printk_ratelimited(KERN_WARNING 3250 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 3251 return -EIO; 3252 } 3253 bio = bio_alloc(GFP_NOFS, 1); 3254 if (!bio) { 3255 spin_lock(&sctx->stat_lock); 3256 sctx->stat.malloc_errors++; 3257 spin_unlock(&sctx->stat_lock); 3258 return -ENOMEM; 3259 } 3260 bio->bi_private = &compl; 3261 bio->bi_end_io = scrub_complete_bio_end_io; 3262 bio->bi_size = 0; 3263 bio->bi_sector = physical_for_dev_replace >> 9; 3264 bio->bi_bdev 
= dev->bdev; 3265 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 3266 if (ret != PAGE_CACHE_SIZE) { 3267leave_with_eio: 3268 bio_put(bio); 3269 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 3270 return -EIO; 3271 } 3272 btrfsic_submit_bio(WRITE_SYNC, bio); 3273 wait_for_completion(&compl); 3274 3275 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 3276 goto leave_with_eio; 3277 3278 bio_put(bio); 3279 return 0; 3280} 3281