scrub.c revision a36cf8b8933e4a7a7f2f2cbc3c70b097e97f7fd1
1/* 2 * Copyright (C) 2011 STRATO. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19#include <linux/blkdev.h> 20#include <linux/ratelimit.h> 21#include "ctree.h" 22#include "volumes.h" 23#include "disk-io.h" 24#include "ordered-data.h" 25#include "transaction.h" 26#include "backref.h" 27#include "extent_io.h" 28#include "check-integrity.h" 29#include "rcu-string.h" 30 31/* 32 * This is only the first step towards a full-features scrub. It reads all 33 * extent and super block and verifies the checksums. In case a bad checksum 34 * is found or the extent cannot be read, good data will be written back if 35 * any can be found. 36 * 37 * Future enhancements: 38 * - In case an unrepairable extent is encountered, track which files are 39 * affected and report them 40 * - track and record media errors, throw out bad devices 41 * - add a mode to also read unallocated space 42 */ 43 44struct scrub_block; 45struct scrub_ctx; 46 47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ 49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 50 51struct scrub_page { 52 struct scrub_block *sblock; 53 struct page *page; 54 struct btrfs_device *dev; 55 u64 flags; /* extent flags */ 56 u64 generation; 57 u64 logical; 58 u64 physical; 59 struct { 60 unsigned int mirror_num:8; 61 unsigned int have_csum:1; 62 unsigned int io_error:1; 63 }; 64 u8 csum[BTRFS_CSUM_SIZE]; 65}; 66 67struct scrub_bio { 68 int index; 69 struct scrub_ctx *sctx; 70 struct btrfs_device *dev; 71 struct bio *bio; 72 int err; 73 u64 logical; 74 u64 physical; 75 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 76 int page_count; 77 int next_free; 78 struct btrfs_work work; 79}; 80 81struct scrub_block { 82 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 83 int page_count; 84 atomic_t outstanding_pages; 85 atomic_t ref_count; /* free mem on transition to zero */ 86 struct scrub_ctx *sctx; 87 struct { 88 unsigned int header_error:1; 89 unsigned int checksum_error:1; 90 unsigned int no_io_error_seen:1; 91 unsigned int generation_error:1; /* also sets header_error */ 92 }; 93}; 94 95struct scrub_ctx { 96 struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; 97 struct btrfs_root *dev_root; 98 int first_free; 99 int curr; 100 atomic_t in_flight; 101 atomic_t fixup_cnt; 102 spinlock_t list_lock; 103 wait_queue_head_t list_wait; 104 u16 csum_size; 105 struct list_head csum_list; 106 atomic_t cancel_req; 107 int readonly; 108 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 109 u32 sectorsize; 110 u32 nodesize; 111 u32 leafsize; 112 /* 113 * statistics 114 */ 115 struct btrfs_scrub_progress stat; 116 spinlock_t stat_lock; 117}; 118 119struct scrub_fixup_nodatasum { 120 struct scrub_ctx *sctx; 121 struct btrfs_device *dev; 122 u64 logical; 123 struct btrfs_root *root; 124 struct btrfs_work work; 125 int mirror_num; 126}; 127 128struct scrub_warning { 129 struct btrfs_path *path; 130 u64 extent_item_size; 131 char *scratch_buf; 132 char *msg_buf; 133 const char *errstr; 134 sector_t sector; 135 u64 logical; 136 struct btrfs_device *dev; 137 int msg_bufsize; 138 int scratch_bufsize; 139}; 140 141 142static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 143static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 144 struct btrfs_mapping_tree *map_tree, 145 u64 length, u64 logical, 146 struct scrub_block *sblock); 147static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 148 struct scrub_block *sblock, int is_metadata, 149 int have_csum, u8 *csum, u64 generation, 150 u16 csum_size); 151static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 152 struct scrub_block *sblock, 153 int is_metadata, int have_csum, 154 const u8 *csum, u64 generation, 155 u16 csum_size); 156static void scrub_complete_bio_end_io(struct bio *bio, int err); 157static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 158 struct scrub_block *sblock_good, 159 int force_write); 160static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 161 struct scrub_block *sblock_good, 162 int page_num, int force_write); 163static int scrub_checksum_data(struct scrub_block *sblock); 164static int scrub_checksum_tree_block(struct scrub_block *sblock); 165static int scrub_checksum_super(struct scrub_block *sblock); 166static void scrub_block_get(struct scrub_block *sblock); 167static void scrub_block_put(struct scrub_block *sblock); 168static int scrub_add_page_to_bio(struct scrub_ctx *sctx, 169 struct scrub_page *spage); 170static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 171 u64 physical, struct btrfs_device *dev, u64 flags, 172 u64 gen, int mirror_num, u8 *csum, int force); 173static void scrub_bio_end_io(struct bio *bio, int err); 174static void scrub_bio_end_io_worker(struct btrfs_work *work); 175static void scrub_block_complete(struct scrub_block *sblock); 176 177 178static void scrub_free_csums(struct scrub_ctx *sctx) 179{ 180 while (!list_empty(&sctx->csum_list)) { 181 struct btrfs_ordered_sum *sum; 182 sum = list_first_entry(&sctx->csum_list, 183 struct btrfs_ordered_sum, list); 184 list_del(&sum->list); 185 kfree(sum); 186 } 187} 188 189static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 190{ 191 int i; 192 193 if (!sctx) 194 return; 195 196 /* this can happen when scrub is cancelled */ 197 if (sctx->curr != -1) { 198 struct scrub_bio *sbio = sctx->bios[sctx->curr]; 199 200 for (i = 0; i < sbio->page_count; i++) { 201 BUG_ON(!sbio->pagev[i]); 202 BUG_ON(!sbio->pagev[i]->page); 203 scrub_block_put(sbio->pagev[i]->sblock); 204 } 205 bio_put(sbio->bio); 206 } 207 208 for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { 209 struct scrub_bio *sbio = sctx->bios[i]; 210 211 if (!sbio) 212 break; 213 kfree(sbio); 214 } 215 216 scrub_free_csums(sctx); 217 kfree(sctx); 218} 219 220static noinline_for_stack 221struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) 222{ 223 struct scrub_ctx *sctx; 224 int i; 225 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 226 int pages_per_bio; 227 228 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 229 bio_get_nr_vecs(dev->bdev)); 230 sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 231 if (!sctx) 232 goto nomem; 233 sctx->pages_per_bio = pages_per_bio; 234 sctx->curr = -1; 235 sctx->dev_root = dev->dev_root; 236 for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { 237 struct scrub_bio *sbio; 238 239 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 240 if (!sbio) 241 goto nomem; 242 sctx->bios[i] = sbio; 243 244 sbio->index = i; 245 sbio->sctx = sctx; 246 sbio->page_count = 0; 247 sbio->work.func = scrub_bio_end_io_worker; 248 249 if (i != SCRUB_BIOS_PER_CTX - 1) 250 sctx->bios[i]->next_free = i + 1; 251 else 252 sctx->bios[i]->next_free = -1; 253 } 254 sctx->first_free = 0; 255 sctx->nodesize = dev->dev_root->nodesize; 256 sctx->leafsize = dev->dev_root->leafsize; 257 sctx->sectorsize = dev->dev_root->sectorsize; 258 atomic_set(&sctx->in_flight, 0); 259 atomic_set(&sctx->fixup_cnt, 0); 260 atomic_set(&sctx->cancel_req, 0); 261 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); 262 INIT_LIST_HEAD(&sctx->csum_list); 263 264 spin_lock_init(&sctx->list_lock); 265 spin_lock_init(&sctx->stat_lock); 266 init_waitqueue_head(&sctx->list_wait); 267 return sctx; 268 269nomem: 270 scrub_free_ctx(sctx); 271 return ERR_PTR(-ENOMEM); 272} 273 274static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 275{ 276 u64 isize; 277 u32 nlink; 278 int ret; 279 int i; 280 struct extent_buffer *eb; 281 struct btrfs_inode_item *inode_item; 282 struct scrub_warning *swarn = ctx; 283 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 284 struct inode_fs_paths *ipath = NULL; 285 struct btrfs_root *local_root; 286 struct btrfs_key root_key; 287 288 root_key.objectid = root; 289 root_key.type = BTRFS_ROOT_ITEM_KEY; 290 root_key.offset = (u64)-1; 291 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); 292 if (IS_ERR(local_root)) { 293 ret = PTR_ERR(local_root); 294 goto err; 295 } 296 297 ret = inode_item_info(inum, 0, local_root, swarn->path); 298 if (ret) { 299 btrfs_release_path(swarn->path); 300 goto err; 301 } 302 303 eb = swarn->path->nodes[0]; 304 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], 305 struct btrfs_inode_item); 306 isize = btrfs_inode_size(eb, inode_item); 307 nlink = btrfs_inode_nlink(eb, inode_item); 308 btrfs_release_path(swarn->path); 309 310 ipath = init_ipath(4096, local_root, swarn->path); 311 if (IS_ERR(ipath)) { 312 ret = PTR_ERR(ipath); 313 ipath = NULL; 314 goto err; 315 } 316 ret = paths_from_inode(inum, ipath); 317 318 if (ret < 0) 319 goto err; 320 321 /* 322 * we deliberately ignore the bit ipath might have been too small to 323 * hold all of the paths here 324 */ 325 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 326 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " 327 "%s, sector %llu, root %llu, inode %llu, offset %llu, " 328 "length %llu, links %u (path: %s)\n", swarn->errstr, 329 swarn->logical, rcu_str_deref(swarn->dev->name), 330 (unsigned long long)swarn->sector, root, inum, offset, 331 min(isize - offset, (u64)PAGE_SIZE), nlink, 332 (char *)(unsigned long)ipath->fspath->val[i]); 333 334 free_ipath(ipath); 335 return 0; 336 337err: 338 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " 339 "%s, sector %llu, root %llu, inode %llu, offset %llu: path " 340 "resolving failed with ret=%d\n", swarn->errstr, 341 swarn->logical, rcu_str_deref(swarn->dev->name), 342 (unsigned long long)swarn->sector, root, inum, offset, ret); 343 344 free_ipath(ipath); 345 return 0; 346} 347 348static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 349{ 350 struct btrfs_device *dev; 351 struct btrfs_fs_info *fs_info; 352 struct btrfs_path *path; 353 struct btrfs_key found_key; 354 struct extent_buffer *eb; 355 struct btrfs_extent_item *ei; 356 struct scrub_warning swarn; 357 unsigned long ptr = 0; 358 u64 extent_item_pos; 359 u64 flags = 0; 360 u64 ref_root; 361 u32 item_size; 362 u8 ref_level; 363 const int bufsize = 4096; 364 int ret; 365 366 WARN_ON(sblock->page_count < 1); 367 dev = sblock->pagev[0].dev; 368 fs_info = sblock->sctx->dev_root->fs_info; 369 370 path = btrfs_alloc_path(); 371 372 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 373 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 374 swarn.sector = (sblock->pagev[0].physical) >> 9; 375 swarn.logical = sblock->pagev[0].logical; 376 swarn.errstr = errstr; 377 swarn.dev = NULL; 378 swarn.msg_bufsize = bufsize; 379 swarn.scratch_bufsize = bufsize; 380 381 if (!path || !swarn.scratch_buf || !swarn.msg_buf) 382 goto out; 383 384 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 385 &flags); 386 if (ret < 0) 387 goto out; 388 389 extent_item_pos = swarn.logical - found_key.objectid; 390 swarn.extent_item_size = found_key.offset; 391 392 eb = path->nodes[0]; 393 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 394 item_size = btrfs_item_size_nr(eb, path->slots[0]); 395 btrfs_release_path(path); 396 397 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 398 do { 399 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 400 &ref_root, &ref_level); 401 printk_in_rcu(KERN_WARNING 402 "btrfs: %s at logical %llu on dev %s, " 403 "sector %llu: metadata %s (level %d) in tree " 404 "%llu\n", errstr, swarn.logical, 405 rcu_str_deref(dev->name), 406 (unsigned long long)swarn.sector, 407 ref_level ? "node" : "leaf", 408 ret < 0 ? -1 : ref_level, 409 ret < 0 ? -1 : ref_root); 410 } while (ret != 1); 411 } else { 412 swarn.path = path; 413 swarn.dev = dev; 414 iterate_extent_inodes(fs_info, found_key.objectid, 415 extent_item_pos, 1, 416 scrub_print_warning_inode, &swarn); 417 } 418 419out: 420 btrfs_free_path(path); 421 kfree(swarn.scratch_buf); 422 kfree(swarn.msg_buf); 423} 424 425static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 426{ 427 struct page *page = NULL; 428 unsigned long index; 429 struct scrub_fixup_nodatasum *fixup = ctx; 430 int ret; 431 int corrected = 0; 432 struct btrfs_key key; 433 struct inode *inode = NULL; 434 u64 end = offset + PAGE_SIZE - 1; 435 struct btrfs_root *local_root; 436 437 key.objectid = root; 438 key.type = BTRFS_ROOT_ITEM_KEY; 439 key.offset = (u64)-1; 440 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); 441 if (IS_ERR(local_root)) 442 return PTR_ERR(local_root); 443 444 key.type = BTRFS_INODE_ITEM_KEY; 445 key.objectid = inum; 446 key.offset = 0; 447 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); 448 if (IS_ERR(inode)) 449 return PTR_ERR(inode); 450 451 index = offset >> PAGE_CACHE_SHIFT; 452 453 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 454 if (!page) { 455 ret = -ENOMEM; 456 goto out; 457 } 458 459 if (PageUptodate(page)) { 460 struct btrfs_mapping_tree *map_tree; 461 if (PageDirty(page)) { 462 /* 463 * we need to write the data to the defect sector. the 464 * data that was in that sector is not in memory, 465 * because the page was modified. we must not write the 466 * modified page to that sector. 467 * 468 * TODO: what could be done here: wait for the delalloc 469 * runner to write out that page (might involve 470 * COW) and see whether the sector is still 471 * referenced afterwards. 472 * 473 * For the meantime, we'll treat this error 474 * incorrectable, although there is a chance that a 475 * later scrub will find the bad sector again and that 476 * there's no dirty page in memory, then. 477 */ 478 ret = -EIO; 479 goto out; 480 } 481 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 482 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 483 fixup->logical, page, 484 fixup->mirror_num); 485 unlock_page(page); 486 corrected = !ret; 487 } else { 488 /* 489 * we need to get good data first. the general readpage path 490 * will call repair_io_failure for us, we just have to make 491 * sure we read the bad mirror. 492 */ 493 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, 494 EXTENT_DAMAGED, GFP_NOFS); 495 if (ret) { 496 /* set_extent_bits should give proper error */ 497 WARN_ON(ret > 0); 498 if (ret > 0) 499 ret = -EFAULT; 500 goto out; 501 } 502 503 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, 504 btrfs_get_extent, 505 fixup->mirror_num); 506 wait_on_page_locked(page); 507 508 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, 509 end, EXTENT_DAMAGED, 0, NULL); 510 if (!corrected) 511 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, 512 EXTENT_DAMAGED, GFP_NOFS); 513 } 514 515out: 516 if (page) 517 put_page(page); 518 if (inode) 519 iput(inode); 520 521 if (ret < 0) 522 return ret; 523 524 if (ret == 0 && corrected) { 525 /* 526 * we only need to call readpage for one of the inodes belonging 527 * to this extent. so make iterate_extent_inodes stop 528 */ 529 return 1; 530 } 531 532 return -EIO; 533} 534 535static void scrub_fixup_nodatasum(struct btrfs_work *work) 536{ 537 int ret; 538 struct scrub_fixup_nodatasum *fixup; 539 struct scrub_ctx *sctx; 540 struct btrfs_trans_handle *trans = NULL; 541 struct btrfs_fs_info *fs_info; 542 struct btrfs_path *path; 543 int uncorrectable = 0; 544 545 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 546 sctx = fixup->sctx; 547 fs_info = fixup->root->fs_info; 548 549 path = btrfs_alloc_path(); 550 if (!path) { 551 spin_lock(&sctx->stat_lock); 552 ++sctx->stat.malloc_errors; 553 spin_unlock(&sctx->stat_lock); 554 uncorrectable = 1; 555 goto out; 556 } 557 558 trans = btrfs_join_transaction(fixup->root); 559 if (IS_ERR(trans)) { 560 uncorrectable = 1; 561 goto out; 562 } 563 564 /* 565 * the idea is to trigger a regular read through the standard path. we 566 * read a page from the (failed) logical address by specifying the 567 * corresponding copynum of the failed sector. thus, that readpage is 568 * expected to fail. 569 * that is the point where on-the-fly error correction will kick in 570 * (once it's finished) and rewrite the failed sector if a good copy 571 * can be found. 572 */ 573 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, 574 path, scrub_fixup_readpage, 575 fixup); 576 if (ret < 0) { 577 uncorrectable = 1; 578 goto out; 579 } 580 WARN_ON(ret != 1); 581 582 spin_lock(&sctx->stat_lock); 583 ++sctx->stat.corrected_errors; 584 spin_unlock(&sctx->stat_lock); 585 586out: 587 if (trans && !IS_ERR(trans)) 588 btrfs_end_transaction(trans, fixup->root); 589 if (uncorrectable) { 590 spin_lock(&sctx->stat_lock); 591 ++sctx->stat.uncorrectable_errors; 592 spin_unlock(&sctx->stat_lock); 593 594 printk_ratelimited_in_rcu(KERN_ERR 595 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 596 (unsigned long long)fixup->logical, 597 rcu_str_deref(fixup->dev->name)); 598 } 599 600 btrfs_free_path(path); 601 kfree(fixup); 602 603 /* see caller why we're pretending to be paused in the scrub counters */ 604 mutex_lock(&fs_info->scrub_lock); 605 atomic_dec(&fs_info->scrubs_running); 606 atomic_dec(&fs_info->scrubs_paused); 607 mutex_unlock(&fs_info->scrub_lock); 608 atomic_dec(&sctx->fixup_cnt); 609 wake_up(&fs_info->scrub_pause_wait); 610 wake_up(&sctx->list_wait); 611} 612 613/* 614 * scrub_handle_errored_block gets called when either verification of the 615 * pages failed or the bio failed to read, e.g. with EIO. In the latter 616 * case, this function handles all pages in the bio, even though only one 617 * may be bad. 618 * The goal of this function is to repair the errored block by using the 619 * contents of one of the mirrors. 620 */ 621static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 622{ 623 struct scrub_ctx *sctx = sblock_to_check->sctx; 624 struct btrfs_device *dev; 625 struct btrfs_fs_info *fs_info; 626 u64 length; 627 u64 logical; 628 u64 generation; 629 unsigned int failed_mirror_index; 630 unsigned int is_metadata; 631 unsigned int have_csum; 632 u8 *csum; 633 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ 634 struct scrub_block *sblock_bad; 635 int ret; 636 int mirror_index; 637 int page_num; 638 int success; 639 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 640 DEFAULT_RATELIMIT_BURST); 641 642 BUG_ON(sblock_to_check->page_count < 1); 643 fs_info = sctx->dev_root->fs_info; 644 length = sblock_to_check->page_count * PAGE_SIZE; 645 logical = sblock_to_check->pagev[0].logical; 646 generation = sblock_to_check->pagev[0].generation; 647 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 648 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 649 is_metadata = !(sblock_to_check->pagev[0].flags & 650 BTRFS_EXTENT_FLAG_DATA); 651 have_csum = sblock_to_check->pagev[0].have_csum; 652 csum = sblock_to_check->pagev[0].csum; 653 dev = sblock_to_check->pagev[0].dev; 654 655 /* 656 * read all mirrors one after the other. This includes to 657 * re-read the extent or metadata block that failed (that was 658 * the cause that this fixup code is called) another time, 659 * page by page this time in order to know which pages 660 * caused I/O errors and which ones are good (for all mirrors). 661 * It is the goal to handle the situation when more than one 662 * mirror contains I/O errors, but the errors do not 663 * overlap, i.e. the data can be repaired by selecting the 664 * pages from those mirrors without I/O error on the 665 * particular pages. One example (with blocks >= 2 * PAGE_SIZE) 666 * would be that mirror #1 has an I/O error on the first page, 667 * the second page is good, and mirror #2 has an I/O error on 668 * the second page, but the first page is good. 669 * Then the first page of the first mirror can be repaired by 670 * taking the first page of the second mirror, and the 671 * second page of the second mirror can be repaired by 672 * copying the contents of the 2nd page of the 1st mirror. 673 * One more note: if the pages of one mirror contain I/O 674 * errors, the checksum cannot be verified. In order to get 675 * the best data for repairing, the first attempt is to find 676 * a mirror without I/O errors and with a validated checksum. 677 * Only if this is not possible, the pages are picked from 678 * mirrors with I/O errors without considering the checksum. 679 * If the latter is the case, at the end, the checksum of the 680 * repaired area is verified in order to correctly maintain 681 * the statistics. 682 */ 683 684 sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * 685 sizeof(*sblocks_for_recheck), 686 GFP_NOFS); 687 if (!sblocks_for_recheck) { 688 spin_lock(&sctx->stat_lock); 689 sctx->stat.malloc_errors++; 690 sctx->stat.read_errors++; 691 sctx->stat.uncorrectable_errors++; 692 spin_unlock(&sctx->stat_lock); 693 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 694 goto out; 695 } 696 697 /* setup the context, map the logical blocks and alloc the pages */ 698 ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, 699 logical, sblocks_for_recheck); 700 if (ret) { 701 spin_lock(&sctx->stat_lock); 702 sctx->stat.read_errors++; 703 sctx->stat.uncorrectable_errors++; 704 spin_unlock(&sctx->stat_lock); 705 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 706 goto out; 707 } 708 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 709 sblock_bad = sblocks_for_recheck + failed_mirror_index; 710 711 /* build and submit the bios for the failed mirror, check checksums */ 712 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 713 csum, generation, sctx->csum_size); 714 if (ret) { 715 spin_lock(&sctx->stat_lock); 716 sctx->stat.read_errors++; 717 sctx->stat.uncorrectable_errors++; 718 spin_unlock(&sctx->stat_lock); 719 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 720 goto out; 721 } 722 723 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 724 sblock_bad->no_io_error_seen) { 725 /* 726 * the error disappeared after reading page by page, or 727 * the area was part of a huge bio and other parts of the 728 * bio caused I/O errors, or the block layer merged several 729 * read requests into one and the error is caused by a 730 * different bio (usually one of the two latter cases is 731 * the cause) 732 */ 733 spin_lock(&sctx->stat_lock); 734 sctx->stat.unverified_errors++; 735 spin_unlock(&sctx->stat_lock); 736 737 goto out; 738 } 739 740 if (!sblock_bad->no_io_error_seen) { 741 spin_lock(&sctx->stat_lock); 742 sctx->stat.read_errors++; 743 spin_unlock(&sctx->stat_lock); 744 if (__ratelimit(&_rs)) 745 scrub_print_warning("i/o error", sblock_to_check); 746 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 747 } else if (sblock_bad->checksum_error) { 748 spin_lock(&sctx->stat_lock); 749 sctx->stat.csum_errors++; 750 spin_unlock(&sctx->stat_lock); 751 if (__ratelimit(&_rs)) 752 scrub_print_warning("checksum error", sblock_to_check); 753 btrfs_dev_stat_inc_and_print(dev, 754 BTRFS_DEV_STAT_CORRUPTION_ERRS); 755 } else if (sblock_bad->header_error) { 756 spin_lock(&sctx->stat_lock); 757 sctx->stat.verify_errors++; 758 spin_unlock(&sctx->stat_lock); 759 if (__ratelimit(&_rs)) 760 scrub_print_warning("checksum/header error", 761 sblock_to_check); 762 if (sblock_bad->generation_error) 763 btrfs_dev_stat_inc_and_print(dev, 764 BTRFS_DEV_STAT_GENERATION_ERRS); 765 else 766 btrfs_dev_stat_inc_and_print(dev, 767 BTRFS_DEV_STAT_CORRUPTION_ERRS); 768 } 769 770 if (sctx->readonly) 771 goto did_not_correct_error; 772 773 if (!is_metadata && !have_csum) { 774 struct scrub_fixup_nodatasum *fixup_nodatasum; 775 776 /* 777 * !is_metadata and !have_csum, this means that the data 778 * might not be COW'ed, that it might be modified 779 * concurrently. The general strategy to work on the 780 * commit root does not help in the case when COW is not 781 * used. 782 */ 783 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 784 if (!fixup_nodatasum) 785 goto did_not_correct_error; 786 fixup_nodatasum->sctx = sctx; 787 fixup_nodatasum->dev = dev; 788 fixup_nodatasum->logical = logical; 789 fixup_nodatasum->root = fs_info->extent_root; 790 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 791 /* 792 * increment scrubs_running to prevent cancel requests from 793 * completing as long as a fixup worker is running. we must also 794 * increment scrubs_paused to prevent deadlocking on pause 795 * requests used for transactions commits (as the worker uses a 796 * transaction context). it is safe to regard the fixup worker 797 * as paused for all matters practical. effectively, we only 798 * avoid cancellation requests from completing. 799 */ 800 mutex_lock(&fs_info->scrub_lock); 801 atomic_inc(&fs_info->scrubs_running); 802 atomic_inc(&fs_info->scrubs_paused); 803 mutex_unlock(&fs_info->scrub_lock); 804 atomic_inc(&sctx->fixup_cnt); 805 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 806 btrfs_queue_worker(&fs_info->scrub_workers, 807 &fixup_nodatasum->work); 808 goto out; 809 } 810 811 /* 812 * now build and submit the bios for the other mirrors, check 813 * checksums 814 */ 815 for (mirror_index = 0; 816 mirror_index < BTRFS_MAX_MIRRORS && 817 sblocks_for_recheck[mirror_index].page_count > 0; 818 mirror_index++) { 819 if (mirror_index == failed_mirror_index) 820 continue; 821 822 /* build and submit the bios, check checksums */ 823 ret = scrub_recheck_block(fs_info, 824 sblocks_for_recheck + mirror_index, 825 is_metadata, have_csum, csum, 826 generation, sctx->csum_size); 827 if (ret) 828 goto did_not_correct_error; 829 } 830 831 /* 832 * first try to pick the mirror which is completely without I/O 833 * errors and also does not have a checksum error. 834 * If one is found, and if a checksum is present, the full block 835 * that is known to contain an error is rewritten. Afterwards 836 * the block is known to be corrected. 837 * If a mirror is found which is completely correct, and no 838 * checksum is present, only those pages are rewritten that had 839 * an I/O error in the block to be repaired, since it cannot be 840 * determined, which copy of the other pages is better (and it 841 * could happen otherwise that a correct page would be 842 * overwritten by a bad one). 843 */ 844 for (mirror_index = 0; 845 mirror_index < BTRFS_MAX_MIRRORS && 846 sblocks_for_recheck[mirror_index].page_count > 0; 847 mirror_index++) { 848 struct scrub_block *sblock_other = sblocks_for_recheck + 849 mirror_index; 850 851 if (!sblock_other->header_error && 852 !sblock_other->checksum_error && 853 sblock_other->no_io_error_seen) { 854 int force_write = is_metadata || have_csum; 855 856 ret = scrub_repair_block_from_good_copy(sblock_bad, 857 sblock_other, 858 force_write); 859 if (0 == ret) 860 goto corrected_error; 861 } 862 } 863 864 /* 865 * in case of I/O errors in the area that is supposed to be 866 * repaired, continue by picking good copies of those pages. 867 * Select the good pages from mirrors to rewrite bad pages from 868 * the area to fix. Afterwards verify the checksum of the block 869 * that is supposed to be repaired. This verification step is 870 * only done for the purpose of statistic counting and for the 871 * final scrub report, whether errors remain. 872 * A perfect algorithm could make use of the checksum and try 873 * all possible combinations of pages from the different mirrors 874 * until the checksum verification succeeds. For example, when 875 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page 876 * of mirror #2 is readable but the final checksum test fails, 877 * then the 2nd page of mirror #3 could be tried, whether now 878 * the final checksum succeedes. But this would be a rare 879 * exception and is therefore not implemented. At least it is 880 * avoided that the good copy is overwritten. 881 * A more useful improvement would be to pick the sectors 882 * without I/O error based on sector sizes (512 bytes on legacy 883 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one 884 * mirror could be repaired by taking 512 byte of a different 885 * mirror, even if other 512 byte sectors in the same PAGE_SIZE 886 * area are unreadable. 887 */ 888 889 /* can only fix I/O errors from here on */ 890 if (sblock_bad->no_io_error_seen) 891 goto did_not_correct_error; 892 893 success = 1; 894 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 895 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 896 897 if (!page_bad->io_error) 898 continue; 899 900 for (mirror_index = 0; 901 mirror_index < BTRFS_MAX_MIRRORS && 902 sblocks_for_recheck[mirror_index].page_count > 0; 903 mirror_index++) { 904 struct scrub_block *sblock_other = sblocks_for_recheck + 905 mirror_index; 906 struct scrub_page *page_other = sblock_other->pagev + 907 page_num; 908 909 if (!page_other->io_error) { 910 ret = scrub_repair_page_from_good_copy( 911 sblock_bad, sblock_other, page_num, 0); 912 if (0 == ret) { 913 page_bad->io_error = 0; 914 break; /* succeeded for this page */ 915 } 916 } 917 } 918 919 if (page_bad->io_error) { 920 /* did not find a mirror to copy the page from */ 921 success = 0; 922 } 923 } 924 925 if (success) { 926 if (is_metadata || have_csum) { 927 /* 928 * need to verify the checksum now that all 929 * sectors on disk are repaired (the write 930 * request for data to be repaired is on its way). 931 * Just be lazy and use scrub_recheck_block() 932 * which re-reads the data before the checksum 933 * is verified, but most likely the data comes out 934 * of the page cache. 935 */ 936 ret = scrub_recheck_block(fs_info, sblock_bad, 937 is_metadata, have_csum, csum, 938 generation, sctx->csum_size); 939 if (!ret && !sblock_bad->header_error && 940 !sblock_bad->checksum_error && 941 sblock_bad->no_io_error_seen) 942 goto corrected_error; 943 else 944 goto did_not_correct_error; 945 } else { 946corrected_error: 947 spin_lock(&sctx->stat_lock); 948 sctx->stat.corrected_errors++; 949 spin_unlock(&sctx->stat_lock); 950 printk_ratelimited_in_rcu(KERN_ERR 951 "btrfs: fixed up error at logical %llu on dev %s\n", 952 (unsigned long long)logical, 953 rcu_str_deref(dev->name)); 954 } 955 } else { 956did_not_correct_error: 957 spin_lock(&sctx->stat_lock); 958 sctx->stat.uncorrectable_errors++; 959 spin_unlock(&sctx->stat_lock); 960 printk_ratelimited_in_rcu(KERN_ERR 961 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 962 (unsigned long long)logical, 963 rcu_str_deref(dev->name)); 964 } 965 966out: 967 if (sblocks_for_recheck) { 968 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; 969 mirror_index++) { 970 struct scrub_block *sblock = sblocks_for_recheck + 971 mirror_index; 972 int page_index; 973 974 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 975 page_index++) 976 if (sblock->pagev[page_index].page) 977 __free_page( 978 sblock->pagev[page_index].page); 979 } 980 kfree(sblocks_for_recheck); 981 } 982 983 return 0; 984} 985 986static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 987 struct btrfs_mapping_tree *map_tree, 988 u64 length, u64 logical, 989 struct scrub_block *sblocks_for_recheck) 990{ 991 int page_index; 992 int mirror_index; 993 int ret; 994 995 /* 996 * note: the three members sctx, ref_count and outstanding_pages 997 * are not used (and not set) in the blocks that are used for 998 * the recheck procedure 999 */ 1000 1001 page_index = 0; 1002 while (length > 0) { 1003 u64 sublen = min_t(u64, length, PAGE_SIZE); 1004 u64 mapped_length = sublen; 1005 struct btrfs_bio *bbio = NULL; 1006 1007 /* 1008 * with a length of PAGE_SIZE, each returned stripe 1009 * represents one mirror 1010 */ 1011 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1012 &bbio, 0); 1013 if (ret || !bbio || mapped_length < sublen) { 1014 kfree(bbio); 1015 return -EIO; 1016 } 1017 1018 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1019 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1020 mirror_index++) { 1021 struct scrub_block *sblock; 1022 struct scrub_page *page; 1023 1024 if (mirror_index >= BTRFS_MAX_MIRRORS) 1025 continue; 1026 1027 sblock = sblocks_for_recheck + mirror_index; 1028 page = sblock->pagev + page_index; 1029 page->logical = logical; 1030 page->physical = bbio->stripes[mirror_index].physical; 1031 /* for missing devices, dev->bdev is NULL */ 1032 page->dev = bbio->stripes[mirror_index].dev; 1033 page->mirror_num = mirror_index + 1; 1034 page->page = alloc_page(GFP_NOFS); 1035 if (!page->page) { 1036 spin_lock(&sctx->stat_lock); 1037 sctx->stat.malloc_errors++; 1038 spin_unlock(&sctx->stat_lock); 1039 kfree(bbio); 1040 return -ENOMEM; 1041 } 1042 sblock->page_count++; 1043 } 1044 kfree(bbio); 1045 length -= sublen; 1046 logical += sublen; 1047 page_index++; 1048 } 1049 1050 return 0; 1051} 1052 1053/* 1054 * this function will check the on disk data for checksum errors, header 1055 * errors and read I/O errors. If any I/O errors happen, the exact pages 1056 * which are errored are marked as being bad. The goal is to enable scrub 1057 * to take those pages that are not errored from all the mirrors so that 1058 * the pages that are errored in the just handled mirror can be repaired. 1059 */ 1060static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1061 struct scrub_block *sblock, int is_metadata, 1062 int have_csum, u8 *csum, u64 generation, 1063 u16 csum_size) 1064{ 1065 int page_num; 1066 1067 sblock->no_io_error_seen = 1; 1068 sblock->header_error = 0; 1069 sblock->checksum_error = 0; 1070 1071 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1072 struct bio *bio; 1073 int ret; 1074 struct scrub_page *page = sblock->pagev + page_num; 1075 DECLARE_COMPLETION_ONSTACK(complete); 1076 1077 if (page->dev->bdev == NULL) { 1078 page->io_error = 1; 1079 sblock->no_io_error_seen = 0; 1080 continue; 1081 } 1082 1083 BUG_ON(!page->page); 1084 bio = bio_alloc(GFP_NOFS, 1); 1085 if (!bio) 1086 return -EIO; 1087 bio->bi_bdev = page->dev->bdev; 1088 bio->bi_sector = page->physical >> 9; 1089 bio->bi_end_io = scrub_complete_bio_end_io; 1090 bio->bi_private = &complete; 1091 1092 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1093 if (PAGE_SIZE != ret) { 1094 bio_put(bio); 1095 return -EIO; 1096 } 1097 btrfsic_submit_bio(READ, bio); 1098 1099 /* this will also unplug the queue */ 1100 wait_for_completion(&complete); 1101 1102 page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); 1103 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1104 sblock->no_io_error_seen = 0; 1105 bio_put(bio); 1106 } 1107 1108 if (sblock->no_io_error_seen) 1109 scrub_recheck_block_checksum(fs_info, sblock, is_metadata, 1110 have_csum, csum, generation, 1111 csum_size); 1112 1113 return 0; 1114} 1115 1116static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1117 struct scrub_block *sblock, 1118 int is_metadata, int have_csum, 1119 const u8 *csum, u64 generation, 1120 u16 csum_size) 1121{ 1122 int page_num; 1123 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1124 u32 crc = ~(u32)0; 1125 struct btrfs_root *root = fs_info->extent_root; 1126 void *mapped_buffer; 1127 1128 BUG_ON(!sblock->pagev[0].page); 1129 if (is_metadata) { 1130 struct btrfs_header *h; 1131 1132 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1133 h = (struct btrfs_header *)mapped_buffer; 1134 1135 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1136 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1137 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1138 BTRFS_UUID_SIZE)) { 1139 sblock->header_error = 1; 1140 } else if (generation != le64_to_cpu(h->generation)) { 1141 sblock->header_error = 1; 1142 sblock->generation_error = 1; 1143 } 1144 csum = h->csum; 1145 } else { 1146 if (!have_csum) 1147 return; 1148 1149 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1150 } 1151 1152 for (page_num = 0;;) { 1153 if (page_num == 0 && is_metadata) 1154 crc = btrfs_csum_data(root, 1155 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, 1156 crc, PAGE_SIZE - BTRFS_CSUM_SIZE); 1157 else 1158 crc = btrfs_csum_data(root, mapped_buffer, crc, 1159 PAGE_SIZE); 1160 1161 kunmap_atomic(mapped_buffer); 1162 page_num++; 1163 if (page_num >= sblock->page_count) 1164 break; 1165 BUG_ON(!sblock->pagev[page_num].page); 1166 1167 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1168 } 1169 1170 btrfs_csum_final(crc, calculated_csum); 1171 if (memcmp(calculated_csum, csum, csum_size)) 1172 sblock->checksum_error = 1; 1173} 1174 1175static void scrub_complete_bio_end_io(struct bio *bio, int err) 1176{ 1177 complete((struct completion *)bio->bi_private); 1178} 1179 1180static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1181 struct scrub_block *sblock_good, 1182 int force_write) 1183{ 1184 int page_num; 1185 int ret = 0; 1186 1187 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1188 int ret_sub; 1189 1190 ret_sub = scrub_repair_page_from_good_copy(sblock_bad, 1191 sblock_good, 1192 page_num, 1193 force_write); 1194 if (ret_sub) 1195 ret = ret_sub; 1196 } 1197 1198 return ret; 1199} 1200 1201static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 1202 struct scrub_block *sblock_good, 1203 int page_num, int force_write) 1204{ 1205 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1206 struct scrub_page *page_good = sblock_good->pagev + page_num; 1207 1208 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1209 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1210 if (force_write || sblock_bad->header_error || 1211 sblock_bad->checksum_error || page_bad->io_error) { 1212 struct bio *bio; 1213 int ret; 1214 DECLARE_COMPLETION_ONSTACK(complete); 1215 1216 bio = bio_alloc(GFP_NOFS, 1); 1217 if (!bio) 1218 return -EIO; 1219 bio->bi_bdev = page_bad->dev->bdev; 1220 bio->bi_sector = page_bad->physical >> 9; 1221 bio->bi_end_io = scrub_complete_bio_end_io; 1222 bio->bi_private = &complete; 1223 1224 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); 1225 if (PAGE_SIZE != ret) { 1226 bio_put(bio); 1227 return -EIO; 1228 } 1229 btrfsic_submit_bio(WRITE, bio); 1230 1231 /* this will also unplug the queue */ 1232 wait_for_completion(&complete); 1233 if (!bio_flagged(bio, BIO_UPTODATE)) { 1234 btrfs_dev_stat_inc_and_print(page_bad->dev, 1235 BTRFS_DEV_STAT_WRITE_ERRS); 1236 bio_put(bio); 1237 return -EIO; 1238 } 1239 bio_put(bio); 1240 } 1241 1242 return 0; 1243} 1244 1245static void scrub_checksum(struct scrub_block *sblock) 1246{ 1247 u64 flags; 1248 int ret; 1249 1250 BUG_ON(sblock->page_count < 1); 1251 flags = sblock->pagev[0].flags; 1252 ret = 0; 1253 if (flags & BTRFS_EXTENT_FLAG_DATA) 1254 ret = scrub_checksum_data(sblock); 1255 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1256 ret = scrub_checksum_tree_block(sblock); 1257 else if (flags & BTRFS_EXTENT_FLAG_SUPER) 1258 (void)scrub_checksum_super(sblock); 1259 else 1260 WARN_ON(1); 1261 if (ret) 1262 scrub_handle_errored_block(sblock); 1263} 1264 1265static int scrub_checksum_data(struct scrub_block *sblock) 1266{ 1267 struct scrub_ctx *sctx = sblock->sctx; 1268 u8 csum[BTRFS_CSUM_SIZE]; 1269 u8 *on_disk_csum; 1270 struct page *page; 1271 void *buffer; 1272 u32 crc = ~(u32)0; 1273 int fail = 0; 1274 struct btrfs_root *root = sctx->dev_root; 1275 u64 len; 1276 int index; 1277 1278 BUG_ON(sblock->page_count < 1); 1279 if (!sblock->pagev[0].have_csum) 1280 return 0; 1281 1282 on_disk_csum = sblock->pagev[0].csum; 1283 page = sblock->pagev[0].page; 1284 buffer = kmap_atomic(page); 1285 1286 len = sctx->sectorsize; 1287 index = 0; 1288 for (;;) { 1289 u64 l = min_t(u64, len, PAGE_SIZE); 1290 1291 crc = btrfs_csum_data(root, buffer, crc, l); 1292 kunmap_atomic(buffer); 1293 len -= l; 1294 if (len == 0) 1295 break; 1296 index++; 1297 BUG_ON(index >= sblock->page_count); 1298 BUG_ON(!sblock->pagev[index].page); 1299 page = sblock->pagev[index].page; 1300 buffer = kmap_atomic(page); 1301 } 1302 1303 btrfs_csum_final(crc, csum); 1304 if (memcmp(csum, on_disk_csum, sctx->csum_size)) 1305 fail = 1; 1306 1307 return fail; 1308} 1309 1310static int scrub_checksum_tree_block(struct scrub_block *sblock) 1311{ 1312 struct scrub_ctx *sctx = sblock->sctx; 1313 struct btrfs_header *h; 1314 struct btrfs_root *root = sctx->dev_root; 1315 struct btrfs_fs_info *fs_info = root->fs_info; 1316 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1317 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1318 struct page *page; 1319 void *mapped_buffer; 1320 u64 mapped_size; 1321 void *p; 1322 u32 crc = ~(u32)0; 1323 int fail = 0; 1324 int crc_fail = 0; 1325 u64 len; 1326 int index; 1327 1328 BUG_ON(sblock->page_count < 1); 1329 page = sblock->pagev[0].page; 1330 mapped_buffer = kmap_atomic(page); 1331 h = (struct btrfs_header *)mapped_buffer; 1332 memcpy(on_disk_csum, h->csum, sctx->csum_size); 1333 1334 /* 1335 * we don't use the getter functions here, as we 1336 * a) don't have an extent buffer and 1337 * b) the page is already kmapped 1338 */ 1339 1340 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1341 ++fail; 1342 1343 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1344 ++fail; 1345 1346 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1347 ++fail; 1348 1349 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1350 BTRFS_UUID_SIZE)) 1351 ++fail; 1352 1353 BUG_ON(sctx->nodesize != sctx->leafsize); 1354 len = sctx->nodesize - BTRFS_CSUM_SIZE; 1355 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1356 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1357 index = 0; 1358 for (;;) { 1359 u64 l = min_t(u64, len, mapped_size); 1360 1361 crc = btrfs_csum_data(root, p, crc, l); 1362 kunmap_atomic(mapped_buffer); 1363 len -= l; 1364 if (len == 0) 1365 break; 1366 index++; 1367 BUG_ON(index >= sblock->page_count); 1368 BUG_ON(!sblock->pagev[index].page); 1369 page = sblock->pagev[index].page; 1370 mapped_buffer = kmap_atomic(page); 1371 mapped_size = PAGE_SIZE; 1372 p = mapped_buffer; 1373 } 1374 1375 btrfs_csum_final(crc, calculated_csum); 1376 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1377 ++crc_fail; 1378 1379 return fail || crc_fail; 1380} 1381 1382static int scrub_checksum_super(struct scrub_block *sblock) 1383{ 1384 struct btrfs_super_block *s; 1385 struct scrub_ctx *sctx = sblock->sctx; 1386 struct btrfs_root *root = sctx->dev_root; 1387 struct btrfs_fs_info *fs_info = root->fs_info; 1388 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1389 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1390 struct page *page; 1391 void *mapped_buffer; 1392 u64 mapped_size; 1393 void *p; 1394 u32 crc = ~(u32)0; 1395 int fail_gen = 0; 1396 int fail_cor = 0; 1397 u64 len; 1398 int index; 1399 1400 BUG_ON(sblock->page_count < 1); 1401 page = sblock->pagev[0].page; 1402 mapped_buffer = kmap_atomic(page); 1403 s = (struct btrfs_super_block *)mapped_buffer; 1404 memcpy(on_disk_csum, s->csum, sctx->csum_size); 1405 1406 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1407 ++fail_cor; 1408 1409 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1410 ++fail_gen; 1411 1412 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1413 ++fail_cor; 1414 1415 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1416 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1417 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1418 index = 0; 1419 for (;;) { 1420 u64 l = min_t(u64, len, mapped_size); 1421 1422 crc = btrfs_csum_data(root, p, crc, l); 1423 kunmap_atomic(mapped_buffer); 1424 len -= l; 1425 if (len == 0) 1426 break; 1427 index++; 1428 BUG_ON(index >= sblock->page_count); 1429 BUG_ON(!sblock->pagev[index].page); 1430 page = sblock->pagev[index].page; 1431 mapped_buffer = kmap_atomic(page); 1432 mapped_size = PAGE_SIZE; 1433 p = mapped_buffer; 1434 } 1435 1436 btrfs_csum_final(crc, calculated_csum); 1437 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1438 ++fail_cor; 1439 1440 if (fail_cor + fail_gen) { 1441 /* 1442 * if we find an error in a super block, we just report it. 1443 * They will get written with the next transaction commit 1444 * anyway 1445 */ 1446 spin_lock(&sctx->stat_lock); 1447 ++sctx->stat.super_errors; 1448 spin_unlock(&sctx->stat_lock); 1449 if (fail_cor) 1450 btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, 1451 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1452 else 1453 btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, 1454 BTRFS_DEV_STAT_GENERATION_ERRS); 1455 } 1456 1457 return fail_cor + fail_gen; 1458} 1459 1460static void scrub_block_get(struct scrub_block *sblock) 1461{ 1462 atomic_inc(&sblock->ref_count); 1463} 1464 1465static void scrub_block_put(struct scrub_block *sblock) 1466{ 1467 if (atomic_dec_and_test(&sblock->ref_count)) { 1468 int i; 1469 1470 for (i = 0; i < sblock->page_count; i++) 1471 if (sblock->pagev[i].page) 1472 __free_page(sblock->pagev[i].page); 1473 kfree(sblock); 1474 } 1475} 1476 1477static void scrub_submit(struct scrub_ctx *sctx) 1478{ 1479 struct scrub_bio *sbio; 1480 1481 if (sctx->curr == -1) 1482 return; 1483 1484 sbio = sctx->bios[sctx->curr]; 1485 sctx->curr = -1; 1486 atomic_inc(&sctx->in_flight); 1487 1488 btrfsic_submit_bio(READ, sbio->bio); 1489} 1490 1491static int scrub_add_page_to_bio(struct scrub_ctx *sctx, 1492 struct scrub_page *spage) 1493{ 1494 struct scrub_block *sblock = spage->sblock; 1495 struct scrub_bio *sbio; 1496 int ret; 1497 1498again: 1499 /* 1500 * grab a fresh bio or wait for one to become available 1501 */ 1502 while (sctx->curr == -1) { 1503 spin_lock(&sctx->list_lock); 1504 sctx->curr = sctx->first_free; 1505 if (sctx->curr != -1) { 1506 sctx->first_free = sctx->bios[sctx->curr]->next_free; 1507 sctx->bios[sctx->curr]->next_free = -1; 1508 sctx->bios[sctx->curr]->page_count = 0; 1509 spin_unlock(&sctx->list_lock); 1510 } else { 1511 spin_unlock(&sctx->list_lock); 1512 wait_event(sctx->list_wait, sctx->first_free != -1); 1513 } 1514 } 1515 sbio = sctx->bios[sctx->curr]; 1516 if (sbio->page_count == 0) { 1517 struct bio *bio; 1518 1519 sbio->physical = spage->physical; 1520 sbio->logical = spage->logical; 1521 sbio->dev = spage->dev; 1522 bio = sbio->bio; 1523 if (!bio) { 1524 bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); 1525 if (!bio) 1526 return -ENOMEM; 1527 sbio->bio = bio; 1528 } 1529 1530 bio->bi_private = sbio; 1531 bio->bi_end_io = scrub_bio_end_io; 1532 bio->bi_bdev = sbio->dev->bdev; 1533 bio->bi_sector = sbio->physical >> 9; 1534 sbio->err = 0; 1535 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1536 spage->physical || 1537 sbio->logical + sbio->page_count * PAGE_SIZE != 1538 spage->logical || 1539 sbio->dev != spage->dev) { 1540 scrub_submit(sctx); 1541 goto again; 1542 } 1543 1544 sbio->pagev[sbio->page_count] = spage; 1545 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 1546 if (ret != PAGE_SIZE) { 1547 if (sbio->page_count < 1) { 1548 bio_put(sbio->bio); 1549 sbio->bio = NULL; 1550 return -EIO; 1551 } 1552 scrub_submit(sctx); 1553 goto again; 1554 } 1555 1556 scrub_block_get(sblock); /* one for the added page */ 1557 atomic_inc(&sblock->outstanding_pages); 1558 sbio->page_count++; 1559 if (sbio->page_count == sctx->pages_per_bio) 1560 scrub_submit(sctx); 1561 1562 return 0; 1563} 1564 1565static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 1566 u64 physical, struct btrfs_device *dev, u64 flags, 1567 u64 gen, int mirror_num, u8 *csum, int force) 1568{ 1569 struct scrub_block *sblock; 1570 int index; 1571 1572 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1573 if (!sblock) { 1574 spin_lock(&sctx->stat_lock); 1575 sctx->stat.malloc_errors++; 1576 spin_unlock(&sctx->stat_lock); 1577 return -ENOMEM; 1578 } 1579 1580 /* one ref inside this function, plus one for each page later on */ 1581 atomic_set(&sblock->ref_count, 1); 1582 sblock->sctx = sctx; 1583 sblock->no_io_error_seen = 1; 1584 1585 for (index = 0; len > 0; index++) { 1586 struct scrub_page *spage = sblock->pagev + index; 1587 u64 l = min_t(u64, len, PAGE_SIZE); 1588 1589 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1590 spage->page = alloc_page(GFP_NOFS); 1591 if (!spage->page) { 1592 spin_lock(&sctx->stat_lock); 1593 sctx->stat.malloc_errors++; 1594 spin_unlock(&sctx->stat_lock); 1595 while (index > 0) { 1596 index--; 1597 __free_page(sblock->pagev[index].page); 1598 } 1599 kfree(sblock); 1600 return -ENOMEM; 1601 } 1602 spage->sblock = sblock; 1603 spage->dev = dev; 1604 spage->flags = flags; 1605 spage->generation = gen; 1606 spage->logical = logical; 1607 spage->physical = physical; 1608 spage->mirror_num = mirror_num; 1609 if (csum) { 1610 spage->have_csum = 1; 1611 memcpy(spage->csum, csum, sctx->csum_size); 1612 } else { 1613 spage->have_csum = 0; 1614 } 1615 sblock->page_count++; 1616 len -= l; 1617 logical += l; 1618 physical += l; 1619 } 1620 1621 BUG_ON(sblock->page_count == 0); 1622 for (index = 0; index < sblock->page_count; index++) { 1623 struct scrub_page *spage = sblock->pagev + index; 1624 int ret; 1625 1626 ret = scrub_add_page_to_bio(sctx, spage); 1627 if (ret) { 1628 scrub_block_put(sblock); 1629 return ret; 1630 } 1631 } 1632 1633 if (force) 1634 scrub_submit(sctx); 1635 1636 /* last one frees, either here or in bio completion for last page */ 1637 scrub_block_put(sblock); 1638 return 0; 1639} 1640 1641static void scrub_bio_end_io(struct bio *bio, int err) 1642{ 1643 struct scrub_bio *sbio = bio->bi_private; 1644 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 1645 1646 sbio->err = err; 1647 sbio->bio = bio; 1648 1649 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 1650} 1651 1652static void scrub_bio_end_io_worker(struct btrfs_work *work) 1653{ 1654 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1655 struct scrub_ctx *sctx = sbio->sctx; 1656 int i; 1657 1658 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 1659 if (sbio->err) { 1660 for (i = 0; i < sbio->page_count; i++) { 1661 struct scrub_page *spage = sbio->pagev[i]; 1662 1663 spage->io_error = 1; 1664 spage->sblock->no_io_error_seen = 0; 1665 } 1666 } 1667 1668 /* now complete the scrub_block items that have all pages completed */ 1669 for (i = 0; i < sbio->page_count; i++) { 1670 struct scrub_page *spage = sbio->pagev[i]; 1671 struct scrub_block *sblock = spage->sblock; 1672 1673 if (atomic_dec_and_test(&sblock->outstanding_pages)) 1674 scrub_block_complete(sblock); 1675 scrub_block_put(sblock); 1676 } 1677 1678 bio_put(sbio->bio); 1679 sbio->bio = NULL; 1680 spin_lock(&sctx->list_lock); 1681 sbio->next_free = sctx->first_free; 1682 sctx->first_free = sbio->index; 1683 spin_unlock(&sctx->list_lock); 1684 atomic_dec(&sctx->in_flight); 1685 wake_up(&sctx->list_wait); 1686} 1687 1688static void scrub_block_complete(struct scrub_block *sblock) 1689{ 1690 if (!sblock->no_io_error_seen) 1691 scrub_handle_errored_block(sblock); 1692 else 1693 scrub_checksum(sblock); 1694} 1695 1696static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 1697 u8 *csum) 1698{ 1699 struct btrfs_ordered_sum *sum = NULL; 1700 int ret = 0; 1701 unsigned long i; 1702 unsigned long num_sectors; 1703 1704 while (!list_empty(&sctx->csum_list)) { 1705 sum = list_first_entry(&sctx->csum_list, 1706 struct btrfs_ordered_sum, list); 1707 if (sum->bytenr > logical) 1708 return 0; 1709 if (sum->bytenr + sum->len > logical) 1710 break; 1711 1712 ++sctx->stat.csum_discards; 1713 list_del(&sum->list); 1714 kfree(sum); 1715 sum = NULL; 1716 } 1717 if (!sum) 1718 return 0; 1719 1720 num_sectors = sum->len / sctx->sectorsize; 1721 for (i = 0; i < num_sectors; ++i) { 1722 if (sum->sums[i].bytenr == logical) { 1723 memcpy(csum, &sum->sums[i].sum, sctx->csum_size); 1724 ret = 1; 1725 break; 1726 } 1727 } 1728 if (ret && i == num_sectors - 1) { 1729 list_del(&sum->list); 1730 kfree(sum); 1731 } 1732 return ret; 1733} 1734 1735/* scrub extent tries to collect up to 64 kB for each bio */ 1736static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, 1737 u64 physical, struct btrfs_device *dev, u64 flags, 1738 u64 gen, int mirror_num) 1739{ 1740 int ret; 1741 u8 csum[BTRFS_CSUM_SIZE]; 1742 u32 blocksize; 1743 1744 if (flags & BTRFS_EXTENT_FLAG_DATA) { 1745 blocksize = sctx->sectorsize; 1746 spin_lock(&sctx->stat_lock); 1747 sctx->stat.data_extents_scrubbed++; 1748 sctx->stat.data_bytes_scrubbed += len; 1749 spin_unlock(&sctx->stat_lock); 1750 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1751 BUG_ON(sctx->nodesize != sctx->leafsize); 1752 blocksize = sctx->nodesize; 1753 spin_lock(&sctx->stat_lock); 1754 sctx->stat.tree_extents_scrubbed++; 1755 sctx->stat.tree_bytes_scrubbed += len; 1756 spin_unlock(&sctx->stat_lock); 1757 } else { 1758 blocksize = sctx->sectorsize; 1759 BUG_ON(1); 1760 } 1761 1762 while (len) { 1763 u64 l = min_t(u64, len, blocksize); 1764 int have_csum = 0; 1765 1766 if (flags & BTRFS_EXTENT_FLAG_DATA) { 1767 /* push csums to sbio */ 1768 have_csum = scrub_find_csum(sctx, logical, l, csum); 1769 if (have_csum == 0) 1770 ++sctx->stat.no_csum; 1771 } 1772 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, 1773 mirror_num, have_csum ? csum : NULL, 0); 1774 if (ret) 1775 return ret; 1776 len -= l; 1777 logical += l; 1778 physical += l; 1779 } 1780 return 0; 1781} 1782 1783static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 1784 struct map_lookup *map, 1785 struct btrfs_device *scrub_dev, 1786 int num, u64 base, u64 length) 1787{ 1788 struct btrfs_path *path; 1789 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 1790 struct btrfs_root *root = fs_info->extent_root; 1791 struct btrfs_root *csum_root = fs_info->csum_root; 1792 struct btrfs_extent_item *extent; 1793 struct blk_plug plug; 1794 u64 flags; 1795 int ret; 1796 int slot; 1797 int i; 1798 u64 nstripes; 1799 struct extent_buffer *l; 1800 struct btrfs_key key; 1801 u64 physical; 1802 u64 logical; 1803 u64 generation; 1804 int mirror_num; 1805 struct reada_control *reada1; 1806 struct reada_control *reada2; 1807 struct btrfs_key key_start; 1808 struct btrfs_key key_end; 1809 u64 increment = map->stripe_len; 1810 u64 offset; 1811 1812 nstripes = length; 1813 offset = 0; 1814 do_div(nstripes, map->stripe_len); 1815 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1816 offset = map->stripe_len * num; 1817 increment = map->stripe_len * map->num_stripes; 1818 mirror_num = 1; 1819 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1820 int factor = map->num_stripes / map->sub_stripes; 1821 offset = map->stripe_len * (num / map->sub_stripes); 1822 increment = map->stripe_len * factor; 1823 mirror_num = num % map->sub_stripes + 1; 1824 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1825 increment = map->stripe_len; 1826 mirror_num = num % map->num_stripes + 1; 1827 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1828 increment = map->stripe_len; 1829 mirror_num = num % map->num_stripes + 1; 1830 } else { 1831 increment = map->stripe_len; 1832 mirror_num = 1; 1833 } 1834 1835 path = btrfs_alloc_path(); 1836 if (!path) 1837 return -ENOMEM; 1838 1839 /* 1840 * work on commit root. The related disk blocks are static as 1841 * long as COW is applied. This means, it is save to rewrite 1842 * them to repair disk errors without any race conditions 1843 */ 1844 path->search_commit_root = 1; 1845 path->skip_locking = 1; 1846 1847 /* 1848 * trigger the readahead for extent tree csum tree and wait for 1849 * completion. During readahead, the scrub is officially paused 1850 * to not hold off transaction commits 1851 */ 1852 logical = base + offset; 1853 1854 wait_event(sctx->list_wait, 1855 atomic_read(&sctx->in_flight) == 0); 1856 atomic_inc(&fs_info->scrubs_paused); 1857 wake_up(&fs_info->scrub_pause_wait); 1858 1859 /* FIXME it might be better to start readahead at commit root */ 1860 key_start.objectid = logical; 1861 key_start.type = BTRFS_EXTENT_ITEM_KEY; 1862 key_start.offset = (u64)0; 1863 key_end.objectid = base + offset + nstripes * increment; 1864 key_end.type = BTRFS_EXTENT_ITEM_KEY; 1865 key_end.offset = (u64)0; 1866 reada1 = btrfs_reada_add(root, &key_start, &key_end); 1867 1868 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 1869 key_start.type = BTRFS_EXTENT_CSUM_KEY; 1870 key_start.offset = logical; 1871 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 1872 key_end.type = BTRFS_EXTENT_CSUM_KEY; 1873 key_end.offset = base + offset + nstripes * increment; 1874 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 1875 1876 if (!IS_ERR(reada1)) 1877 btrfs_reada_wait(reada1); 1878 if (!IS_ERR(reada2)) 1879 btrfs_reada_wait(reada2); 1880 1881 mutex_lock(&fs_info->scrub_lock); 1882 while (atomic_read(&fs_info->scrub_pause_req)) { 1883 mutex_unlock(&fs_info->scrub_lock); 1884 wait_event(fs_info->scrub_pause_wait, 1885 atomic_read(&fs_info->scrub_pause_req) == 0); 1886 mutex_lock(&fs_info->scrub_lock); 1887 } 1888 atomic_dec(&fs_info->scrubs_paused); 1889 mutex_unlock(&fs_info->scrub_lock); 1890 wake_up(&fs_info->scrub_pause_wait); 1891 1892 /* 1893 * collect all data csums for the stripe to avoid seeking during 1894 * the scrub. This might currently (crc32) end up to be about 1MB 1895 */ 1896 blk_start_plug(&plug); 1897 1898 /* 1899 * now find all extents for each stripe and scrub them 1900 */ 1901 logical = base + offset; 1902 physical = map->stripes[num].physical; 1903 ret = 0; 1904 for (i = 0; i < nstripes; ++i) { 1905 /* 1906 * canceled? 1907 */ 1908 if (atomic_read(&fs_info->scrub_cancel_req) || 1909 atomic_read(&sctx->cancel_req)) { 1910 ret = -ECANCELED; 1911 goto out; 1912 } 1913 /* 1914 * check to see if we have to pause 1915 */ 1916 if (atomic_read(&fs_info->scrub_pause_req)) { 1917 /* push queued extents */ 1918 scrub_submit(sctx); 1919 wait_event(sctx->list_wait, 1920 atomic_read(&sctx->in_flight) == 0); 1921 atomic_inc(&fs_info->scrubs_paused); 1922 wake_up(&fs_info->scrub_pause_wait); 1923 mutex_lock(&fs_info->scrub_lock); 1924 while (atomic_read(&fs_info->scrub_pause_req)) { 1925 mutex_unlock(&fs_info->scrub_lock); 1926 wait_event(fs_info->scrub_pause_wait, 1927 atomic_read(&fs_info->scrub_pause_req) == 0); 1928 mutex_lock(&fs_info->scrub_lock); 1929 } 1930 atomic_dec(&fs_info->scrubs_paused); 1931 mutex_unlock(&fs_info->scrub_lock); 1932 wake_up(&fs_info->scrub_pause_wait); 1933 } 1934 1935 ret = btrfs_lookup_csums_range(csum_root, logical, 1936 logical + map->stripe_len - 1, 1937 &sctx->csum_list, 1); 1938 if (ret) 1939 goto out; 1940 1941 key.objectid = logical; 1942 key.type = BTRFS_EXTENT_ITEM_KEY; 1943 key.offset = (u64)0; 1944 1945 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1946 if (ret < 0) 1947 goto out; 1948 if (ret > 0) { 1949 ret = btrfs_previous_item(root, path, 0, 1950 BTRFS_EXTENT_ITEM_KEY); 1951 if (ret < 0) 1952 goto out; 1953 if (ret > 0) { 1954 /* there's no smaller item, so stick with the 1955 * larger one */ 1956 btrfs_release_path(path); 1957 ret = btrfs_search_slot(NULL, root, &key, 1958 path, 0, 0); 1959 if (ret < 0) 1960 goto out; 1961 } 1962 } 1963 1964 while (1) { 1965 l = path->nodes[0]; 1966 slot = path->slots[0]; 1967 if (slot >= btrfs_header_nritems(l)) { 1968 ret = btrfs_next_leaf(root, path); 1969 if (ret == 0) 1970 continue; 1971 if (ret < 0) 1972 goto out; 1973 1974 break; 1975 } 1976 btrfs_item_key_to_cpu(l, &key, slot); 1977 1978 if (key.objectid + key.offset <= logical) 1979 goto next; 1980 1981 if (key.objectid >= logical + map->stripe_len) 1982 break; 1983 1984 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) 1985 goto next; 1986 1987 extent = btrfs_item_ptr(l, slot, 1988 struct btrfs_extent_item); 1989 flags = btrfs_extent_flags(l, extent); 1990 generation = btrfs_extent_generation(l, extent); 1991 1992 if (key.objectid < logical && 1993 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { 1994 printk(KERN_ERR 1995 "btrfs scrub: tree block %llu spanning " 1996 "stripes, ignored. logical=%llu\n", 1997 (unsigned long long)key.objectid, 1998 (unsigned long long)logical); 1999 goto next; 2000 } 2001 2002 /* 2003 * trim extent to this stripe 2004 */ 2005 if (key.objectid < logical) { 2006 key.offset -= logical - key.objectid; 2007 key.objectid = logical; 2008 } 2009 if (key.objectid + key.offset > 2010 logical + map->stripe_len) { 2011 key.offset = logical + map->stripe_len - 2012 key.objectid; 2013 } 2014 2015 ret = scrub_extent(sctx, key.objectid, key.offset, 2016 key.objectid - logical + physical, 2017 scrub_dev, flags, generation, 2018 mirror_num); 2019 if (ret) 2020 goto out; 2021 2022next: 2023 path->slots[0]++; 2024 } 2025 btrfs_release_path(path); 2026 logical += increment; 2027 physical += map->stripe_len; 2028 spin_lock(&sctx->stat_lock); 2029 sctx->stat.last_physical = physical; 2030 spin_unlock(&sctx->stat_lock); 2031 } 2032 /* push queued extents */ 2033 scrub_submit(sctx); 2034 2035out: 2036 blk_finish_plug(&plug); 2037 btrfs_free_path(path); 2038 return ret < 0 ? ret : 0; 2039} 2040 2041static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2042 struct btrfs_device *scrub_dev, 2043 u64 chunk_tree, u64 chunk_objectid, 2044 u64 chunk_offset, u64 length, 2045 u64 dev_offset) 2046{ 2047 struct btrfs_mapping_tree *map_tree = 2048 &sctx->dev_root->fs_info->mapping_tree; 2049 struct map_lookup *map; 2050 struct extent_map *em; 2051 int i; 2052 int ret = -EINVAL; 2053 2054 read_lock(&map_tree->map_tree.lock); 2055 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2056 read_unlock(&map_tree->map_tree.lock); 2057 2058 if (!em) 2059 return -EINVAL; 2060 2061 map = (struct map_lookup *)em->bdev; 2062 if (em->start != chunk_offset) 2063 goto out; 2064 2065 if (em->len < length) 2066 goto out; 2067 2068 for (i = 0; i < map->num_stripes; ++i) { 2069 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2070 map->stripes[i].physical == dev_offset) { 2071 ret = scrub_stripe(sctx, map, scrub_dev, i, 2072 chunk_offset, length); 2073 if (ret) 2074 goto out; 2075 } 2076 } 2077out: 2078 free_extent_map(em); 2079 2080 return ret; 2081} 2082 2083static noinline_for_stack 2084int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2085 struct btrfs_device *scrub_dev, u64 start, u64 end) 2086{ 2087 struct btrfs_dev_extent *dev_extent = NULL; 2088 struct btrfs_path *path; 2089 struct btrfs_root *root = sctx->dev_root; 2090 struct btrfs_fs_info *fs_info = root->fs_info; 2091 u64 length; 2092 u64 chunk_tree; 2093 u64 chunk_objectid; 2094 u64 chunk_offset; 2095 int ret; 2096 int slot; 2097 struct extent_buffer *l; 2098 struct btrfs_key key; 2099 struct btrfs_key found_key; 2100 struct btrfs_block_group_cache *cache; 2101 2102 path = btrfs_alloc_path(); 2103 if (!path) 2104 return -ENOMEM; 2105 2106 path->reada = 2; 2107 path->search_commit_root = 1; 2108 path->skip_locking = 1; 2109 2110 key.objectid = scrub_dev->devid; 2111 key.offset = 0ull; 2112 key.type = BTRFS_DEV_EXTENT_KEY; 2113 2114 while (1) { 2115 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2116 if (ret < 0) 2117 break; 2118 if (ret > 0) { 2119 if (path->slots[0] >= 2120 btrfs_header_nritems(path->nodes[0])) { 2121 ret = btrfs_next_leaf(root, path); 2122 if (ret) 2123 break; 2124 } 2125 } 2126 2127 l = path->nodes[0]; 2128 slot = path->slots[0]; 2129 2130 btrfs_item_key_to_cpu(l, &found_key, slot); 2131 2132 if (found_key.objectid != scrub_dev->devid) 2133 break; 2134 2135 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2136 break; 2137 2138 if (found_key.offset >= end) 2139 break; 2140 2141 if (found_key.offset < key.offset) 2142 break; 2143 2144 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2145 length = btrfs_dev_extent_length(l, dev_extent); 2146 2147 if (found_key.offset + length <= start) { 2148 key.offset = found_key.offset + length; 2149 btrfs_release_path(path); 2150 continue; 2151 } 2152 2153 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2154 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2155 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2156 2157 /* 2158 * get a reference on the corresponding block group to prevent 2159 * the chunk from going away while we scrub it 2160 */ 2161 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2162 if (!cache) { 2163 ret = -ENOENT; 2164 break; 2165 } 2166 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, 2167 chunk_offset, length, found_key.offset); 2168 btrfs_put_block_group(cache); 2169 if (ret) 2170 break; 2171 2172 key.offset = found_key.offset + length; 2173 btrfs_release_path(path); 2174 } 2175 2176 btrfs_free_path(path); 2177 2178 /* 2179 * ret can still be 1 from search_slot or next_leaf, 2180 * that's not an error 2181 */ 2182 return ret < 0 ? ret : 0; 2183} 2184 2185static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2186 struct btrfs_device *scrub_dev) 2187{ 2188 int i; 2189 u64 bytenr; 2190 u64 gen; 2191 int ret; 2192 struct btrfs_root *root = sctx->dev_root; 2193 2194 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2195 return -EIO; 2196 2197 gen = root->fs_info->last_trans_committed; 2198 2199 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2200 bytenr = btrfs_sb_offset(i); 2201 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2202 break; 2203 2204 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2205 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 2206 NULL, 1); 2207 if (ret) 2208 return ret; 2209 } 2210 wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); 2211 2212 return 0; 2213} 2214 2215/* 2216 * get a reference count on fs_info->scrub_workers. start worker if necessary 2217 */ 2218static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2219{ 2220 struct btrfs_fs_info *fs_info = root->fs_info; 2221 int ret = 0; 2222 2223 mutex_lock(&fs_info->scrub_lock); 2224 if (fs_info->scrub_workers_refcnt == 0) { 2225 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2226 fs_info->thread_pool_size, &fs_info->generic_worker); 2227 fs_info->scrub_workers.idle_thresh = 4; 2228 ret = btrfs_start_workers(&fs_info->scrub_workers); 2229 if (ret) 2230 goto out; 2231 } 2232 ++fs_info->scrub_workers_refcnt; 2233out: 2234 mutex_unlock(&fs_info->scrub_lock); 2235 2236 return ret; 2237} 2238 2239static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2240{ 2241 struct btrfs_fs_info *fs_info = root->fs_info; 2242 2243 mutex_lock(&fs_info->scrub_lock); 2244 if (--fs_info->scrub_workers_refcnt == 0) 2245 btrfs_stop_workers(&fs_info->scrub_workers); 2246 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2247 mutex_unlock(&fs_info->scrub_lock); 2248} 2249 2250 2251int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2252 struct btrfs_scrub_progress *progress, int readonly) 2253{ 2254 struct scrub_ctx *sctx; 2255 struct btrfs_fs_info *fs_info = root->fs_info; 2256 int ret; 2257 struct btrfs_device *dev; 2258 2259 if (btrfs_fs_closing(root->fs_info)) 2260 return -EINVAL; 2261 2262 /* 2263 * check some assumptions 2264 */ 2265 if (root->nodesize != root->leafsize) { 2266 printk(KERN_ERR 2267 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2268 root->nodesize, root->leafsize); 2269 return -EINVAL; 2270 } 2271 2272 if (root->nodesize > BTRFS_STRIPE_LEN) { 2273 /* 2274 * in this case scrub is unable to calculate the checksum 2275 * the way scrub is implemented. Do not handle this 2276 * situation at all because it won't ever happen. 2277 */ 2278 printk(KERN_ERR 2279 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2280 root->nodesize, BTRFS_STRIPE_LEN); 2281 return -EINVAL; 2282 } 2283 2284 if (root->sectorsize != PAGE_SIZE) { 2285 /* not supported for data w/o checksums */ 2286 printk(KERN_ERR 2287 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2288 root->sectorsize, (unsigned long long)PAGE_SIZE); 2289 return -EINVAL; 2290 } 2291 2292 ret = scrub_workers_get(root); 2293 if (ret) 2294 return ret; 2295 2296 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2297 dev = btrfs_find_device(root, devid, NULL, NULL); 2298 if (!dev || dev->missing) { 2299 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2300 scrub_workers_put(root); 2301 return -ENODEV; 2302 } 2303 mutex_lock(&fs_info->scrub_lock); 2304 2305 if (!dev->in_fs_metadata) { 2306 mutex_unlock(&fs_info->scrub_lock); 2307 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2308 scrub_workers_put(root); 2309 return -ENODEV; 2310 } 2311 2312 if (dev->scrub_device) { 2313 mutex_unlock(&fs_info->scrub_lock); 2314 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2315 scrub_workers_put(root); 2316 return -EINPROGRESS; 2317 } 2318 sctx = scrub_setup_ctx(dev); 2319 if (IS_ERR(sctx)) { 2320 mutex_unlock(&fs_info->scrub_lock); 2321 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2322 scrub_workers_put(root); 2323 return PTR_ERR(sctx); 2324 } 2325 sctx->readonly = readonly; 2326 dev->scrub_device = sctx; 2327 2328 atomic_inc(&fs_info->scrubs_running); 2329 mutex_unlock(&fs_info->scrub_lock); 2330 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2331 2332 down_read(&fs_info->scrub_super_lock); 2333 ret = scrub_supers(sctx, dev); 2334 up_read(&fs_info->scrub_super_lock); 2335 2336 if (!ret) 2337 ret = scrub_enumerate_chunks(sctx, dev, start, end); 2338 2339 wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); 2340 atomic_dec(&fs_info->scrubs_running); 2341 wake_up(&fs_info->scrub_pause_wait); 2342 2343 wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); 2344 2345 if (progress) 2346 memcpy(progress, &sctx->stat, sizeof(*progress)); 2347 2348 mutex_lock(&fs_info->scrub_lock); 2349 dev->scrub_device = NULL; 2350 mutex_unlock(&fs_info->scrub_lock); 2351 2352 scrub_free_ctx(sctx); 2353 scrub_workers_put(root); 2354 2355 return ret; 2356} 2357 2358void btrfs_scrub_pause(struct btrfs_root *root) 2359{ 2360 struct btrfs_fs_info *fs_info = root->fs_info; 2361 2362 mutex_lock(&fs_info->scrub_lock); 2363 atomic_inc(&fs_info->scrub_pause_req); 2364 while (atomic_read(&fs_info->scrubs_paused) != 2365 atomic_read(&fs_info->scrubs_running)) { 2366 mutex_unlock(&fs_info->scrub_lock); 2367 wait_event(fs_info->scrub_pause_wait, 2368 atomic_read(&fs_info->scrubs_paused) == 2369 atomic_read(&fs_info->scrubs_running)); 2370 mutex_lock(&fs_info->scrub_lock); 2371 } 2372 mutex_unlock(&fs_info->scrub_lock); 2373} 2374 2375void btrfs_scrub_continue(struct btrfs_root *root) 2376{ 2377 struct btrfs_fs_info *fs_info = root->fs_info; 2378 2379 atomic_dec(&fs_info->scrub_pause_req); 2380 wake_up(&fs_info->scrub_pause_wait); 2381} 2382 2383void btrfs_scrub_pause_super(struct btrfs_root *root) 2384{ 2385 down_write(&root->fs_info->scrub_super_lock); 2386} 2387 2388void btrfs_scrub_continue_super(struct btrfs_root *root) 2389{ 2390 up_write(&root->fs_info->scrub_super_lock); 2391} 2392 2393int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2394{ 2395 2396 mutex_lock(&fs_info->scrub_lock); 2397 if (!atomic_read(&fs_info->scrubs_running)) { 2398 mutex_unlock(&fs_info->scrub_lock); 2399 return -ENOTCONN; 2400 } 2401 2402 atomic_inc(&fs_info->scrub_cancel_req); 2403 while (atomic_read(&fs_info->scrubs_running)) { 2404 mutex_unlock(&fs_info->scrub_lock); 2405 wait_event(fs_info->scrub_pause_wait, 2406 atomic_read(&fs_info->scrubs_running) == 0); 2407 mutex_lock(&fs_info->scrub_lock); 2408 } 2409 atomic_dec(&fs_info->scrub_cancel_req); 2410 mutex_unlock(&fs_info->scrub_lock); 2411 2412 return 0; 2413} 2414 2415int btrfs_scrub_cancel(struct btrfs_root *root) 2416{ 2417 return __btrfs_scrub_cancel(root->fs_info); 2418} 2419 2420int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) 2421{ 2422 struct btrfs_fs_info *fs_info = root->fs_info; 2423 struct scrub_ctx *sctx; 2424 2425 mutex_lock(&fs_info->scrub_lock); 2426 sctx = dev->scrub_device; 2427 if (!sctx) { 2428 mutex_unlock(&fs_info->scrub_lock); 2429 return -ENOTCONN; 2430 } 2431 atomic_inc(&sctx->cancel_req); 2432 while (dev->scrub_device) { 2433 mutex_unlock(&fs_info->scrub_lock); 2434 wait_event(fs_info->scrub_pause_wait, 2435 dev->scrub_device == NULL); 2436 mutex_lock(&fs_info->scrub_lock); 2437 } 2438 mutex_unlock(&fs_info->scrub_lock); 2439 2440 return 0; 2441} 2442 2443int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) 2444{ 2445 struct btrfs_fs_info *fs_info = root->fs_info; 2446 struct btrfs_device *dev; 2447 int ret; 2448 2449 /* 2450 * we have to hold the device_list_mutex here so the device 2451 * does not go away in cancel_dev. FIXME: find a better solution 2452 */ 2453 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2454 dev = btrfs_find_device(root, devid, NULL, NULL); 2455 if (!dev) { 2456 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2457 return -ENODEV; 2458 } 2459 ret = btrfs_scrub_cancel_dev(root, dev); 2460 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2461 2462 return ret; 2463} 2464 2465int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2466 struct btrfs_scrub_progress *progress) 2467{ 2468 struct btrfs_device *dev; 2469 struct scrub_ctx *sctx = NULL; 2470 2471 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2472 dev = btrfs_find_device(root, devid, NULL, NULL); 2473 if (dev) 2474 sctx = dev->scrub_device; 2475 if (sctx) 2476 memcpy(progress, &sctx->stat, sizeof(*progress)); 2477 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2478 2479 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 2480} 2481