scrub.c revision 00d01bc17cc2807292303961519d9c005794eb1d

/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - To enhance the performance, better read-ahead strategies for the
 *    extent-tree can be employed.
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 *  - make the prefetch cancellable
 */

struct scrub_bio;
struct scrub_page;
struct scrub_dev;
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);

#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */

struct scrub_page {
	u64			flags;  /* extent flags */
	u64			generation;
	u64			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};
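
/*
 * Life cycle of a scrub_bio, summarized from the code below: free
 * sbios are kept in a list threaded through next_free and headed by
 * first_free, protected by list_lock. scrub_page() moves one into
 * sdev->curr and fills it page by page; scrub_submit() sends it off
 * and bumps in_flight. The end_io worker (scrub_checksum) verifies
 * the pages, returns the sbio to the free list and wakes waiters on
 * list_wait.
 */
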
static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;
	int j;
	struct page *last_page;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];
		struct bio *bio;

		if (!sbio)
			break;

		bio = sbio->bio;
		if (bio) {
			last_page = NULL;
			for (j = 0; j < bio->bi_vcnt; ++j) {
				if (bio->bi_io_vec[j].bv_page == last_page)
					continue;
				last_page = bio->bi_io_vec[j].bv_page;
				__free_page(last_page);
			}
			bio_put(bio);
		}
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}

static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int i;
	int j;
	int ret;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct bio *bio;
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
		if (!bio)
			goto nomem;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->bio = bio;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;
		bio->bi_private = sdev->bios[i];
		bio->bi_end_io = scrub_bio_end_io;
		bio->bi_sector = 0;
		bio->bi_bdev = dev->bdev;
		bio->bi_size = 0;

		for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
			struct page *page;
			page = alloc_page(GFP_NOFS);
			if (!page)
				goto nomem;

			ret = bio_add_page(bio, page, PAGE_SIZE, 0);
			if (!ret)
				goto nomem;
		}
		WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}
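
/*
 * Error handling, as implemented below: scrub_recheck_error() re-reads
 * a single page, scrub_fixup_check() re-verifies it and scrub_fixup()
 * looks for a good copy on another mirror and, unless the scrub is
 * readonly, writes it back over the bad one.
 */
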
/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * one may be bad.
 */
static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return;
		}
	}

	scrub_fixup(sbio, ix);
}

static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	} else {
		WARN_ON(1);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}

static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		kfree(multi);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: fixed up at %llu\n",
		       (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
		       (unsigned long long)logical);
}
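
/*
 * Read or write a single page synchronously; returns 0 on success.
 * The completion is signalled from scrub_fixup_end_io() above.
 */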
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	/* we are going to wait on this IO */
	rw |= REQ_SYNC;

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}

static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}

static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		for (i = 0; i < sbio->count; ++i)
			scrub_recheck_error(sbio, i);

		/* reset the bio to a clean state for the next submission */
		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}

		spin_lock(&sdev->stat_lock);
		++sdev->stat.read_errors;
		spin_unlock(&sdev->stat_lock);
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			BUG_ON(i);
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret)
			scrub_recheck_error(sbio, i);
	}

out:
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}

static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}
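
/*
 * A tree block is checked on two levels: the header fields (bytenr,
 * generation, fsid and chunk tree uuid) count as verify_errors when
 * they mismatch, while a bad checksum over the rest of the block
 * counts as a csum_error.
 */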
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}

static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * It will get rewritten with the next transaction commit
		 * anyway
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}
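
/*
 * scrub_submit() sends off the partially filled bio in sdev->curr;
 * scrub_page() below queues one page at a time and submits as soon as
 * the bio is full, the physical or logical contiguity is broken, or
 * the caller forces a flush.
 */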
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	sbio->bio->bi_sector = sbio->physical >> 9;
	sbio->bio->bi_size = sbio->count * PAGE_SIZE;
	sbio->bio->bi_next = NULL;
	sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
	sbio->bio->bi_comp_cpu = -1;
	sbio->bio->bi_bdev = sdev->dev->bdev;
	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(0, sbio->bio);

	return 0;
}

static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		scrub_submit(sdev);
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force)
		scrub_submit(sdev);

	return 0;
}

/*
 * find the checksum for the given logical address in the per-device
 * csum_list; sums lying entirely before it are dropped, and a sum is
 * freed once its last sector has been consumed
 */
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}

/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}

static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	int start_stripe;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}
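
	/*
	 * Example: for RAID10 with num_stripes = 4 and sub_stripes = 2,
	 * stripe num = 3 starts at offset stripe_len * 1, advances by
	 * stripe_len * 2 per step and holds the second copy of each
	 * stripe (mirror_num = 1).
	 */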

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * find all extents for each stripe and just read them to get
	 * them into the page cache
	 * FIXME: we can do better. build a more intelligent prefetching
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid >= logical + map->stripe_len)
				break;

			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		cond_resched();
	}

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) add up to about 1MB
	 */
	start_stripe = 0;
again:
	logical = base + offset + start_stripe * increment;
	for (i = start_stripe; i < nstripes; ++i) {
		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		logical += increment;
		cond_resched();
	}
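
	/*
	 * The loop below also handles pausing: when scrub_pause_req is
	 * raised, all queued extents are pushed out, the collected csums
	 * are dropped and the scrub restarts at the current stripe via
	 * the 'again' label once the pause is over.
	 */
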
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset + start_stripe * increment;
	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
	ret = 0;
	for (i = start_stripe; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
			scrub_free_csums(sdev);
			start_stripe = i;
			goto again;
		}

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}
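
/*
 * Walk all dev extents of the given device within [start, end) and
 * scrub the chunk each of them belongs to.
 */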
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		ret = 0;

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l,
								 dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			goto out;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}

/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0)
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}
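
/*
 * btrfs_scrub_dev is the main entry point: it scrubs the super blocks
 * and then all chunks that have a dev extent on the given device, and
 * reports the accumulated statistics through *progress.
 */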
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (root->fs_info->closing)
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}

int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}

int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
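
/*
 * Cancel the scrub on one device and wait until it has stopped;
 * dev->scrub_device is cleared by btrfs_scrub_dev() once the scrub
 * run has finished.
 */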
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}

int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}