raid5.c revision dd054fce88d33da1aa81d018db75b91b102a6959
1/* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21/* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46#include <linux/blkdev.h> 47#include <linux/kthread.h> 48#include <linux/raid/pq.h> 49#include <linux/async_tx.h> 50#include <linux/module.h> 51#include <linux/async.h> 52#include <linux/seq_file.h> 53#include <linux/cpu.h> 54#include <linux/slab.h> 55#include <linux/ratelimit.h> 56#include "md.h" 57#include "raid5.h" 58#include "raid0.h" 59#include "bitmap.h" 60 61/* 62 * Stripe cache 63 */ 64 65#define NR_STRIPES 256 66#define STRIPE_SIZE PAGE_SIZE 67#define STRIPE_SHIFT (PAGE_SHIFT - 9) 68#define STRIPE_SECTORS (STRIPE_SIZE>>9) 69#define IO_THRESHOLD 1 70#define BYPASS_THRESHOLD 1 71#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 72#define HASH_MASK (NR_HASH - 1) 73 74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 75{ 76 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 77 return &conf->stripe_hashtbl[hash]; 78} 79 80/* bio's attached to a stripe+device for I/O are linked together in bi_sector 81 * order without overlap. There may be several bio's per stripe+device, and 82 * a bio could span several devices. 83 * When walking this list for a particular stripe+device, we must never proceed 84 * beyond a bio that extends past this device, as the next bio might no longer 85 * be valid. 
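 * (For illustration: STRIPE_SECTORS is STRIPE_SIZE>>9, i.e. 8 sectors with
 * 4KiB pages, so r5_next_bio() below follows bi_next only while the current
 * bio ends before the end of the stripe+device's 8-sector chunk.)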
86 * This function is used to determine the 'next' bio in the list, given the sector 87 * of the current stripe+device 88 */ 89static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 90{ 91 int sectors = bio->bi_size >> 9; 92 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 93 return bio->bi_next; 94 else 95 return NULL; 96} 97 98/* 99 * We maintain a biased count of active stripes in the bottom 16 bits of 100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 101 */ 102static inline int raid5_bi_phys_segments(struct bio *bio) 103{ 104 return bio->bi_phys_segments & 0xffff; 105} 106 107static inline int raid5_bi_hw_segments(struct bio *bio) 108{ 109 return (bio->bi_phys_segments >> 16) & 0xffff; 110} 111 112static inline int raid5_dec_bi_phys_segments(struct bio *bio) 113{ 114 --bio->bi_phys_segments; 115 return raid5_bi_phys_segments(bio); 116} 117 118static inline int raid5_dec_bi_hw_segments(struct bio *bio) 119{ 120 unsigned short val = raid5_bi_hw_segments(bio); 121 122 --val; 123 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 124 return val; 125} 126 127static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 128{ 129 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 130} 131 132/* Find first data disk in a raid6 stripe */ 133static inline int raid6_d0(struct stripe_head *sh) 134{ 135 if (sh->ddf_layout) 136 /* ddf always starts from the first device */ 137 return 0; 138 /* md starts just after Q block */ 139 if (sh->qd_idx == sh->disks - 1) 140 return 0; 141 else 142 return sh->qd_idx + 1; 143} 144static inline int raid6_next_disk(int disk, int raid_disks) 145{ 146 disk++; 147 return (disk < raid_disks) ? disk : 0; 148} 149 150/* When walking through the disks in a raid6 stripe, starting at raid6_d0, 151 * we need to map each disk to a 'slot', where the data disks are slot 152 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 153 * is raid_disks-1. This helper does that mapping. 
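 * For example, with the md layout (not ddf) and disks = 5, pd_idx = 3,
 * qd_idx = 4: raid6_d0() is 0, data disks 0, 1 and 2 map to slots 0, 1 and 2,
 * the P disk maps to slot syndrome_disks (3) and the Q disk to
 * syndrome_disks+1 (4).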
154 */ 155static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 156 int *count, int syndrome_disks) 157{ 158 int slot = *count; 159 160 if (sh->ddf_layout) 161 (*count)++; 162 if (idx == sh->pd_idx) 163 return syndrome_disks; 164 if (idx == sh->qd_idx) 165 return syndrome_disks + 1; 166 if (!sh->ddf_layout) 167 (*count)++; 168 return slot; 169} 170 171static void return_io(struct bio *return_bi) 172{ 173 struct bio *bi = return_bi; 174 while (bi) { 175 176 return_bi = bi->bi_next; 177 bi->bi_next = NULL; 178 bi->bi_size = 0; 179 bio_endio(bi, 0); 180 bi = return_bi; 181 } 182} 183 184static void print_raid5_conf (struct r5conf *conf); 185 186static int stripe_operations_active(struct stripe_head *sh) 187{ 188 return sh->check_state || sh->reconstruct_state || 189 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 190 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 191} 192 193static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 194{ 195 if (atomic_dec_and_test(&sh->count)) { 196 BUG_ON(!list_empty(&sh->lru)); 197 BUG_ON(atomic_read(&conf->active_stripes)==0); 198 if (test_bit(STRIPE_HANDLE, &sh->state)) { 199 if (test_bit(STRIPE_DELAYED, &sh->state)) 200 list_add_tail(&sh->lru, &conf->delayed_list); 201 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 202 sh->bm_seq - conf->seq_write > 0) 203 list_add_tail(&sh->lru, &conf->bitmap_list); 204 else { 205 clear_bit(STRIPE_BIT_DELAY, &sh->state); 206 list_add_tail(&sh->lru, &conf->handle_list); 207 } 208 md_wakeup_thread(conf->mddev->thread); 209 } else { 210 BUG_ON(stripe_operations_active(sh)); 211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 212 atomic_dec(&conf->preread_active_stripes); 213 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 214 md_wakeup_thread(conf->mddev->thread); 215 } 216 atomic_dec(&conf->active_stripes); 217 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 218 list_add_tail(&sh->lru, &conf->inactive_list); 219 wake_up(&conf->wait_for_stripe); 220 if (conf->retry_read_aligned) 221 md_wakeup_thread(conf->mddev->thread); 222 } 223 } 224 } 225} 226 227static void release_stripe(struct stripe_head *sh) 228{ 229 struct r5conf *conf = sh->raid_conf; 230 unsigned long flags; 231 232 spin_lock_irqsave(&conf->device_lock, flags); 233 __release_stripe(conf, sh); 234 spin_unlock_irqrestore(&conf->device_lock, flags); 235} 236 237static inline void remove_hash(struct stripe_head *sh) 238{ 239 pr_debug("remove_hash(), stripe %llu\n", 240 (unsigned long long)sh->sector); 241 242 hlist_del_init(&sh->hash); 243} 244 245static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 246{ 247 struct hlist_head *hp = stripe_hash(conf, sh->sector); 248 249 pr_debug("insert_hash(), stripe %llu\n", 250 (unsigned long long)sh->sector); 251 252 hlist_add_head(&sh->hash, hp); 253} 254 255 256/* find an idle stripe, make sure it is unhashed, and return it. 
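 * Must be called with conf->device_lock held.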
*/ 257static struct stripe_head *get_free_stripe(struct r5conf *conf) 258{ 259 struct stripe_head *sh = NULL; 260 struct list_head *first; 261 262 if (list_empty(&conf->inactive_list)) 263 goto out; 264 first = conf->inactive_list.next; 265 sh = list_entry(first, struct stripe_head, lru); 266 list_del_init(first); 267 remove_hash(sh); 268 atomic_inc(&conf->active_stripes); 269out: 270 return sh; 271} 272 273static void shrink_buffers(struct stripe_head *sh) 274{ 275 struct page *p; 276 int i; 277 int num = sh->raid_conf->pool_size; 278 279 for (i = 0; i < num ; i++) { 280 p = sh->dev[i].page; 281 if (!p) 282 continue; 283 sh->dev[i].page = NULL; 284 put_page(p); 285 } 286} 287 288static int grow_buffers(struct stripe_head *sh) 289{ 290 int i; 291 int num = sh->raid_conf->pool_size; 292 293 for (i = 0; i < num; i++) { 294 struct page *page; 295 296 if (!(page = alloc_page(GFP_KERNEL))) { 297 return 1; 298 } 299 sh->dev[i].page = page; 300 } 301 return 0; 302} 303 304static void raid5_build_block(struct stripe_head *sh, int i, int previous); 305static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 306 struct stripe_head *sh); 307 308static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 309{ 310 struct r5conf *conf = sh->raid_conf; 311 int i; 312 313 BUG_ON(atomic_read(&sh->count) != 0); 314 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 315 BUG_ON(stripe_operations_active(sh)); 316 317 pr_debug("init_stripe called, stripe %llu\n", 318 (unsigned long long)sh->sector); 319 320 remove_hash(sh); 321 322 sh->generation = conf->generation - previous; 323 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 324 sh->sector = sector; 325 stripe_set_idx(sector, conf, previous, sh); 326 sh->state = 0; 327 328 329 for (i = sh->disks; i--; ) { 330 struct r5dev *dev = &sh->dev[i]; 331 332 if (dev->toread || dev->read || dev->towrite || dev->written || 333 test_bit(R5_LOCKED, &dev->flags)) { 334 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 335 (unsigned long long)sh->sector, i, dev->toread, 336 dev->read, dev->towrite, dev->written, 337 test_bit(R5_LOCKED, &dev->flags)); 338 WARN_ON(1); 339 } 340 dev->flags = 0; 341 raid5_build_block(sh, i, previous); 342 } 343 insert_hash(conf, sh); 344} 345 346static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 347 short generation) 348{ 349 struct stripe_head *sh; 350 struct hlist_node *hn; 351 352 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 353 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 354 if (sh->sector == sector && sh->generation == generation) 355 return sh; 356 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 357 return NULL; 358} 359 360/* 361 * Need to check if array has failed when deciding whether to: 362 * - start an array 363 * - remove non-faulty devices 364 * - add a spare 365 * - allow a reshape 366 * This determination is simple when no reshape is happening. 367 * However if there is a reshape, we need to carefully check 368 * both the before and after sections. 369 * This is because some failed devices may only affect one 370 * of the two sections, and some non-in_sync devices may 371 * be insync in the section most affected by failed devices. 
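 * For example, while growing a 4-device array to 6, a device that is
 * neither Faulty nor In_sync still counts as degraded in the 'previous'
 * (4-device) section, because the reshape has not finished recovering it;
 * when shrinking, such a device counts against the 'new' section instead.
 * calc_degraded() below computes both counts and returns the larger one.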
372 */ 373static int calc_degraded(struct r5conf *conf) 374{ 375 int degraded, degraded2; 376 int i; 377 378 rcu_read_lock(); 379 degraded = 0; 380 for (i = 0; i < conf->previous_raid_disks; i++) { 381 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 382 if (!rdev || test_bit(Faulty, &rdev->flags)) 383 degraded++; 384 else if (test_bit(In_sync, &rdev->flags)) 385 ; 386 else 387 /* not in-sync or faulty. 388 * If the reshape increases the number of devices, 389 * this is being recovered by the reshape, so 390 * this 'previous' section is not in_sync. 391 * If the number of devices is being reduced however, 392 * the device can only be part of the array if 393 * we are reverting a reshape, so this section will 394 * be in-sync. 395 */ 396 if (conf->raid_disks >= conf->previous_raid_disks) 397 degraded++; 398 } 399 rcu_read_unlock(); 400 if (conf->raid_disks == conf->previous_raid_disks) 401 return degraded; 402 rcu_read_lock(); 403 degraded2 = 0; 404 for (i = 0; i < conf->raid_disks; i++) { 405 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 406 if (!rdev || test_bit(Faulty, &rdev->flags)) 407 degraded2++; 408 else if (test_bit(In_sync, &rdev->flags)) 409 ; 410 else 411 /* not in-sync or faulty. 412 * If reshape increases the number of devices, this 413 * section has already been recovered, else it 414 * almost certainly hasn't. 415 */ 416 if (conf->raid_disks <= conf->previous_raid_disks) 417 degraded2++; 418 } 419 rcu_read_unlock(); 420 if (degraded2 > degraded) 421 return degraded2; 422 return degraded; 423} 424 425static int has_failed(struct r5conf *conf) 426{ 427 int degraded; 428 429 if (conf->mddev->reshape_position == MaxSector) 430 return conf->mddev->degraded > conf->max_degraded; 431 432 degraded = calc_degraded(conf); 433 if (degraded > conf->max_degraded) 434 return 1; 435 return 0; 436} 437 438static struct stripe_head * 439get_active_stripe(struct r5conf *conf, sector_t sector, 440 int previous, int noblock, int noquiesce) 441{ 442 struct stripe_head *sh; 443 444 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 445 446 spin_lock_irq(&conf->device_lock); 447 448 do { 449 wait_event_lock_irq(conf->wait_for_stripe, 450 conf->quiesce == 0 || noquiesce, 451 conf->device_lock, /* nothing */); 452 sh = __find_stripe(conf, sector, conf->generation - previous); 453 if (!sh) { 454 if (!conf->inactive_blocked) 455 sh = get_free_stripe(conf); 456 if (noblock && sh == NULL) 457 break; 458 if (!sh) { 459 conf->inactive_blocked = 1; 460 wait_event_lock_irq(conf->wait_for_stripe, 461 !list_empty(&conf->inactive_list) && 462 (atomic_read(&conf->active_stripes) 463 < (conf->max_nr_stripes *3/4) 464 || !conf->inactive_blocked), 465 conf->device_lock, 466 ); 467 conf->inactive_blocked = 0; 468 } else 469 init_stripe(sh, sector, previous); 470 } else { 471 if (atomic_read(&sh->count)) { 472 BUG_ON(!list_empty(&sh->lru) 473 && !test_bit(STRIPE_EXPANDING, &sh->state)); 474 } else { 475 if (!test_bit(STRIPE_HANDLE, &sh->state)) 476 atomic_inc(&conf->active_stripes); 477 if (list_empty(&sh->lru) && 478 !test_bit(STRIPE_EXPANDING, &sh->state)) 479 BUG(); 480 list_del_init(&sh->lru); 481 } 482 } 483 } while (sh == NULL); 484 485 if (sh) 486 atomic_inc(&sh->count); 487 488 spin_unlock_irq(&conf->device_lock); 489 return sh; 490} 491 492static void 493raid5_end_read_request(struct bio *bi, int error); 494static void 495raid5_end_write_request(struct bio *bi, int error); 496 497static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 498{ 499 
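	/* Issue the I/O that has been scheduled on this stripe: for each device
	 * with R5_Wantread, R5_Wantwrite or R5_WantReplace set, pick the rdev
	 * (and, for writes, possibly the replacement rrdev), honour any known
	 * bad blocks on writes, and submit one bio per chosen target.
	 */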
struct r5conf *conf = sh->raid_conf; 500 int i, disks = sh->disks; 501 502 might_sleep(); 503 504 for (i = disks; i--; ) { 505 int rw; 506 int replace_only = 0; 507 struct bio *bi, *rbi; 508 struct md_rdev *rdev, *rrdev = NULL; 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 511 rw = WRITE_FUA; 512 else 513 rw = WRITE; 514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 515 rw = READ; 516 else if (test_and_clear_bit(R5_WantReplace, 517 &sh->dev[i].flags)) { 518 rw = WRITE; 519 replace_only = 1; 520 } else 521 continue; 522 523 bi = &sh->dev[i].req; 524 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 525 526 bi->bi_rw = rw; 527 rbi->bi_rw = rw; 528 if (rw & WRITE) { 529 bi->bi_end_io = raid5_end_write_request; 530 rbi->bi_end_io = raid5_end_write_request; 531 } else 532 bi->bi_end_io = raid5_end_read_request; 533 534 rcu_read_lock(); 535 rrdev = rcu_dereference(conf->disks[i].replacement); 536 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 537 rdev = rcu_dereference(conf->disks[i].rdev); 538 if (!rdev) { 539 rdev = rrdev; 540 rrdev = NULL; 541 } 542 if (rw & WRITE) { 543 if (replace_only) 544 rdev = NULL; 545 if (rdev == rrdev) 546 /* We raced and saw duplicates */ 547 rrdev = NULL; 548 } else { 549 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 550 rdev = rrdev; 551 rrdev = NULL; 552 } 553 554 if (rdev && test_bit(Faulty, &rdev->flags)) 555 rdev = NULL; 556 if (rdev) 557 atomic_inc(&rdev->nr_pending); 558 if (rrdev && test_bit(Faulty, &rrdev->flags)) 559 rrdev = NULL; 560 if (rrdev) 561 atomic_inc(&rrdev->nr_pending); 562 rcu_read_unlock(); 563 564 /* We have already checked bad blocks for reads. Now 565 * need to check for writes. We never accept write errors 566 * on the replacement, so we don't to check rrdev. 
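 * (is_badblock() returns a negative value for a bad range that has not yet
 * been acknowledged, in which case we block until the bad block log has been
 * updated, and a positive value for an acknowledged range, in which case the
 * write is simply skipped.)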
567 */ 568 while ((rw & WRITE) && rdev && 569 test_bit(WriteErrorSeen, &rdev->flags)) { 570 sector_t first_bad; 571 int bad_sectors; 572 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 573 &first_bad, &bad_sectors); 574 if (!bad) 575 break; 576 577 if (bad < 0) { 578 set_bit(BlockedBadBlocks, &rdev->flags); 579 if (!conf->mddev->external && 580 conf->mddev->flags) { 581 /* It is very unlikely, but we might 582 * still need to write out the 583 * bad block log - better give it 584 * a chance*/ 585 md_check_recovery(conf->mddev); 586 } 587 md_wait_for_blocked_rdev(rdev, conf->mddev); 588 } else { 589 /* Acknowledged bad block - skip the write */ 590 rdev_dec_pending(rdev, conf->mddev); 591 rdev = NULL; 592 } 593 } 594 595 if (rdev) { 596 if (s->syncing || s->expanding || s->expanded 597 || s->replacing) 598 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 599 600 set_bit(STRIPE_IO_STARTED, &sh->state); 601 602 bi->bi_bdev = rdev->bdev; 603 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 604 __func__, (unsigned long long)sh->sector, 605 bi->bi_rw, i); 606 atomic_inc(&sh->count); 607 bi->bi_sector = sh->sector + rdev->data_offset; 608 bi->bi_flags = 1 << BIO_UPTODATE; 609 bi->bi_idx = 0; 610 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 611 bi->bi_io_vec[0].bv_offset = 0; 612 bi->bi_size = STRIPE_SIZE; 613 bi->bi_next = NULL; 614 if (rrdev) 615 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 616 generic_make_request(bi); 617 } 618 if (rrdev) { 619 if (s->syncing || s->expanding || s->expanded 620 || s->replacing) 621 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 622 623 set_bit(STRIPE_IO_STARTED, &sh->state); 624 625 rbi->bi_bdev = rrdev->bdev; 626 pr_debug("%s: for %llu schedule op %ld on " 627 "replacement disc %d\n", 628 __func__, (unsigned long long)sh->sector, 629 rbi->bi_rw, i); 630 atomic_inc(&sh->count); 631 rbi->bi_sector = sh->sector + rrdev->data_offset; 632 rbi->bi_flags = 1 << BIO_UPTODATE; 633 rbi->bi_idx = 0; 634 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 635 rbi->bi_io_vec[0].bv_offset = 0; 636 rbi->bi_size = STRIPE_SIZE; 637 rbi->bi_next = NULL; 638 generic_make_request(rbi); 639 } 640 if (!rdev && !rrdev) { 641 if (rw & WRITE) 642 set_bit(STRIPE_DEGRADED, &sh->state); 643 pr_debug("skip op %ld on disc %d for sector %llu\n", 644 bi->bi_rw, i, (unsigned long long)sh->sector); 645 clear_bit(R5_LOCKED, &sh->dev[i].flags); 646 set_bit(STRIPE_HANDLE, &sh->state); 647 } 648 } 649} 650 651static struct dma_async_tx_descriptor * 652async_copy_data(int frombio, struct bio *bio, struct page *page, 653 sector_t sector, struct dma_async_tx_descriptor *tx) 654{ 655 struct bio_vec *bvl; 656 struct page *bio_page; 657 int i; 658 int page_offset; 659 struct async_submit_ctl submit; 660 enum async_tx_flags flags = 0; 661 662 if (bio->bi_sector >= sector) 663 page_offset = (signed)(bio->bi_sector - sector) * 512; 664 else 665 page_offset = (signed)(sector - bio->bi_sector) * -512; 666 667 if (frombio) 668 flags |= ASYNC_TX_FENCE; 669 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 670 671 bio_for_each_segment(bvl, bio, i) { 672 int len = bvl->bv_len; 673 int clen; 674 int b_offset = 0; 675 676 if (page_offset < 0) { 677 b_offset = -page_offset; 678 page_offset += b_offset; 679 len -= b_offset; 680 } 681 682 if (len > 0 && page_offset + len > STRIPE_SIZE) 683 clen = STRIPE_SIZE - page_offset; 684 else 685 clen = len; 686 687 if (clen > 0) { 688 b_offset += bvl->bv_offset; 689 bio_page = bvl->bv_page; 690 if (frombio) 691 tx = async_memcpy(page, bio_page, page_offset, 692 b_offset, clen, 
&submit); 693 else 694 tx = async_memcpy(bio_page, page, b_offset, 695 page_offset, clen, &submit); 696 } 697 /* chain the operations */ 698 submit.depend_tx = tx; 699 700 if (clen < len) /* hit end of page */ 701 break; 702 page_offset += len; 703 } 704 705 return tx; 706} 707 708static void ops_complete_biofill(void *stripe_head_ref) 709{ 710 struct stripe_head *sh = stripe_head_ref; 711 struct bio *return_bi = NULL; 712 struct r5conf *conf = sh->raid_conf; 713 int i; 714 715 pr_debug("%s: stripe %llu\n", __func__, 716 (unsigned long long)sh->sector); 717 718 /* clear completed biofills */ 719 spin_lock_irq(&conf->device_lock); 720 for (i = sh->disks; i--; ) { 721 struct r5dev *dev = &sh->dev[i]; 722 723 /* acknowledge completion of a biofill operation */ 724 /* and check if we need to reply to a read request, 725 * new R5_Wantfill requests are held off until 726 * !STRIPE_BIOFILL_RUN 727 */ 728 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 729 struct bio *rbi, *rbi2; 730 731 BUG_ON(!dev->read); 732 rbi = dev->read; 733 dev->read = NULL; 734 while (rbi && rbi->bi_sector < 735 dev->sector + STRIPE_SECTORS) { 736 rbi2 = r5_next_bio(rbi, dev->sector); 737 if (!raid5_dec_bi_phys_segments(rbi)) { 738 rbi->bi_next = return_bi; 739 return_bi = rbi; 740 } 741 rbi = rbi2; 742 } 743 } 744 } 745 spin_unlock_irq(&conf->device_lock); 746 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 747 748 return_io(return_bi); 749 750 set_bit(STRIPE_HANDLE, &sh->state); 751 release_stripe(sh); 752} 753 754static void ops_run_biofill(struct stripe_head *sh) 755{ 756 struct dma_async_tx_descriptor *tx = NULL; 757 struct r5conf *conf = sh->raid_conf; 758 struct async_submit_ctl submit; 759 int i; 760 761 pr_debug("%s: stripe %llu\n", __func__, 762 (unsigned long long)sh->sector); 763 764 for (i = sh->disks; i--; ) { 765 struct r5dev *dev = &sh->dev[i]; 766 if (test_bit(R5_Wantfill, &dev->flags)) { 767 struct bio *rbi; 768 spin_lock_irq(&conf->device_lock); 769 dev->read = rbi = dev->toread; 770 dev->toread = NULL; 771 spin_unlock_irq(&conf->device_lock); 772 while (rbi && rbi->bi_sector < 773 dev->sector + STRIPE_SECTORS) { 774 tx = async_copy_data(0, rbi, dev->page, 775 dev->sector, tx); 776 rbi = r5_next_bio(rbi, dev->sector); 777 } 778 } 779 } 780 781 atomic_inc(&sh->count); 782 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 783 async_trigger_callback(&submit); 784} 785 786static void mark_target_uptodate(struct stripe_head *sh, int target) 787{ 788 struct r5dev *tgt; 789 790 if (target < 0) 791 return; 792 793 tgt = &sh->dev[target]; 794 set_bit(R5_UPTODATE, &tgt->flags); 795 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 796 clear_bit(R5_Wantcompute, &tgt->flags); 797} 798 799static void ops_complete_compute(void *stripe_head_ref) 800{ 801 struct stripe_head *sh = stripe_head_ref; 802 803 pr_debug("%s: stripe %llu\n", __func__, 804 (unsigned long long)sh->sector); 805 806 /* mark the computed target(s) as uptodate */ 807 mark_target_uptodate(sh, sh->ops.target); 808 mark_target_uptodate(sh, sh->ops.target2); 809 810 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 811 if (sh->check_state == check_state_compute_run) 812 sh->check_state = check_state_compute_result; 813 set_bit(STRIPE_HANDLE, &sh->state); 814 release_stripe(sh); 815} 816 817/* return a pointer to the address conversion region of the scribble buffer */ 818static addr_conv_t *to_addr_conv(struct stripe_head *sh, 819 struct raid5_percpu *percpu) 820{ 821 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 
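	/* The scribble buffer starts with (disks + 2) struct page pointers for
	 * the source/destination blocks; the addr_conv_t region used by the
	 * async_tx API follows immediately after them (see scribble_len()).
	 */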
822} 823 824static struct dma_async_tx_descriptor * 825ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 826{ 827 int disks = sh->disks; 828 struct page **xor_srcs = percpu->scribble; 829 int target = sh->ops.target; 830 struct r5dev *tgt = &sh->dev[target]; 831 struct page *xor_dest = tgt->page; 832 int count = 0; 833 struct dma_async_tx_descriptor *tx; 834 struct async_submit_ctl submit; 835 int i; 836 837 pr_debug("%s: stripe %llu block: %d\n", 838 __func__, (unsigned long long)sh->sector, target); 839 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 840 841 for (i = disks; i--; ) 842 if (i != target) 843 xor_srcs[count++] = sh->dev[i].page; 844 845 atomic_inc(&sh->count); 846 847 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 848 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 849 if (unlikely(count == 1)) 850 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 851 else 852 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 853 854 return tx; 855} 856 857/* set_syndrome_sources - populate source buffers for gen_syndrome 858 * @srcs - (struct page *) array of size sh->disks 859 * @sh - stripe_head to parse 860 * 861 * Populates srcs in proper layout order for the stripe and returns the 862 * 'count' of sources to be used in a call to async_gen_syndrome. The P 863 * destination buffer is recorded in srcs[count] and the Q destination 864 * is recorded in srcs[count+1]]. 865 */ 866static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 867{ 868 int disks = sh->disks; 869 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 870 int d0_idx = raid6_d0(sh); 871 int count; 872 int i; 873 874 for (i = 0; i < disks; i++) 875 srcs[i] = NULL; 876 877 count = 0; 878 i = d0_idx; 879 do { 880 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 881 882 srcs[slot] = sh->dev[i].page; 883 i = raid6_next_disk(i, disks); 884 } while (i != d0_idx); 885 886 return syndrome_disks; 887} 888 889static struct dma_async_tx_descriptor * 890ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 891{ 892 int disks = sh->disks; 893 struct page **blocks = percpu->scribble; 894 int target; 895 int qd_idx = sh->qd_idx; 896 struct dma_async_tx_descriptor *tx; 897 struct async_submit_ctl submit; 898 struct r5dev *tgt; 899 struct page *dest; 900 int i; 901 int count; 902 903 if (sh->ops.target < 0) 904 target = sh->ops.target2; 905 else if (sh->ops.target2 < 0) 906 target = sh->ops.target; 907 else 908 /* we should only have one valid target */ 909 BUG(); 910 BUG_ON(target < 0); 911 pr_debug("%s: stripe %llu block: %d\n", 912 __func__, (unsigned long long)sh->sector, target); 913 914 tgt = &sh->dev[target]; 915 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 916 dest = tgt->page; 917 918 atomic_inc(&sh->count); 919 920 if (target == qd_idx) { 921 count = set_syndrome_sources(blocks, sh); 922 blocks[count] = NULL; /* regenerating p is not necessary */ 923 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 924 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 925 ops_complete_compute, sh, 926 to_addr_conv(sh, percpu)); 927 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 928 } else { 929 /* Compute any data- or p-drive using XOR */ 930 count = 0; 931 for (i = disks; i-- ; ) { 932 if (i == target || i == qd_idx) 933 continue; 934 blocks[count++] = sh->dev[i].page; 935 } 936 937 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 938 NULL, 
ops_complete_compute, sh, 939 to_addr_conv(sh, percpu)); 940 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 941 } 942 943 return tx; 944} 945 946static struct dma_async_tx_descriptor * 947ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 948{ 949 int i, count, disks = sh->disks; 950 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 951 int d0_idx = raid6_d0(sh); 952 int faila = -1, failb = -1; 953 int target = sh->ops.target; 954 int target2 = sh->ops.target2; 955 struct r5dev *tgt = &sh->dev[target]; 956 struct r5dev *tgt2 = &sh->dev[target2]; 957 struct dma_async_tx_descriptor *tx; 958 struct page **blocks = percpu->scribble; 959 struct async_submit_ctl submit; 960 961 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 962 __func__, (unsigned long long)sh->sector, target, target2); 963 BUG_ON(target < 0 || target2 < 0); 964 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 965 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 966 967 /* we need to open-code set_syndrome_sources to handle the 968 * slot number conversion for 'faila' and 'failb' 969 */ 970 for (i = 0; i < disks ; i++) 971 blocks[i] = NULL; 972 count = 0; 973 i = d0_idx; 974 do { 975 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 976 977 blocks[slot] = sh->dev[i].page; 978 979 if (i == target) 980 faila = slot; 981 if (i == target2) 982 failb = slot; 983 i = raid6_next_disk(i, disks); 984 } while (i != d0_idx); 985 986 BUG_ON(faila == failb); 987 if (failb < faila) 988 swap(faila, failb); 989 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 990 __func__, (unsigned long long)sh->sector, faila, failb); 991 992 atomic_inc(&sh->count); 993 994 if (failb == syndrome_disks+1) { 995 /* Q disk is one of the missing disks */ 996 if (faila == syndrome_disks) { 997 /* Missing P+Q, just recompute */ 998 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 999 ops_complete_compute, sh, 1000 to_addr_conv(sh, percpu)); 1001 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1002 STRIPE_SIZE, &submit); 1003 } else { 1004 struct page *dest; 1005 int data_target; 1006 int qd_idx = sh->qd_idx; 1007 1008 /* Missing D+Q: recompute D from P, then recompute Q */ 1009 if (target == qd_idx) 1010 data_target = target2; 1011 else 1012 data_target = target; 1013 1014 count = 0; 1015 for (i = disks; i-- ; ) { 1016 if (i == data_target || i == qd_idx) 1017 continue; 1018 blocks[count++] = sh->dev[i].page; 1019 } 1020 dest = sh->dev[data_target].page; 1021 init_async_submit(&submit, 1022 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1023 NULL, NULL, NULL, 1024 to_addr_conv(sh, percpu)); 1025 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1026 &submit); 1027 1028 count = set_syndrome_sources(blocks, sh); 1029 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1030 ops_complete_compute, sh, 1031 to_addr_conv(sh, percpu)); 1032 return async_gen_syndrome(blocks, 0, count+2, 1033 STRIPE_SIZE, &submit); 1034 } 1035 } else { 1036 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1037 ops_complete_compute, sh, 1038 to_addr_conv(sh, percpu)); 1039 if (failb == syndrome_disks) { 1040 /* We're missing D+P. */ 1041 return async_raid6_datap_recov(syndrome_disks+2, 1042 STRIPE_SIZE, faila, 1043 blocks, &submit); 1044 } else { 1045 /* We're missing D+D. 
*/ 1046 return async_raid6_2data_recov(syndrome_disks+2, 1047 STRIPE_SIZE, faila, failb, 1048 blocks, &submit); 1049 } 1050 } 1051} 1052 1053 1054static void ops_complete_prexor(void *stripe_head_ref) 1055{ 1056 struct stripe_head *sh = stripe_head_ref; 1057 1058 pr_debug("%s: stripe %llu\n", __func__, 1059 (unsigned long long)sh->sector); 1060} 1061 1062static struct dma_async_tx_descriptor * 1063ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1064 struct dma_async_tx_descriptor *tx) 1065{ 1066 int disks = sh->disks; 1067 struct page **xor_srcs = percpu->scribble; 1068 int count = 0, pd_idx = sh->pd_idx, i; 1069 struct async_submit_ctl submit; 1070 1071 /* existing parity data subtracted */ 1072 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1073 1074 pr_debug("%s: stripe %llu\n", __func__, 1075 (unsigned long long)sh->sector); 1076 1077 for (i = disks; i--; ) { 1078 struct r5dev *dev = &sh->dev[i]; 1079 /* Only process blocks that are known to be uptodate */ 1080 if (test_bit(R5_Wantdrain, &dev->flags)) 1081 xor_srcs[count++] = dev->page; 1082 } 1083 1084 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1085 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1086 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1087 1088 return tx; 1089} 1090 1091static struct dma_async_tx_descriptor * 1092ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1093{ 1094 int disks = sh->disks; 1095 int i; 1096 1097 pr_debug("%s: stripe %llu\n", __func__, 1098 (unsigned long long)sh->sector); 1099 1100 for (i = disks; i--; ) { 1101 struct r5dev *dev = &sh->dev[i]; 1102 struct bio *chosen; 1103 1104 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1105 struct bio *wbi; 1106 1107 spin_lock_irq(&sh->raid_conf->device_lock); 1108 chosen = dev->towrite; 1109 dev->towrite = NULL; 1110 BUG_ON(dev->written); 1111 wbi = dev->written = chosen; 1112 spin_unlock_irq(&sh->raid_conf->device_lock); 1113 1114 while (wbi && wbi->bi_sector < 1115 dev->sector + STRIPE_SECTORS) { 1116 if (wbi->bi_rw & REQ_FUA) 1117 set_bit(R5_WantFUA, &dev->flags); 1118 tx = async_copy_data(1, wbi, dev->page, 1119 dev->sector, tx); 1120 wbi = r5_next_bio(wbi, dev->sector); 1121 } 1122 } 1123 } 1124 1125 return tx; 1126} 1127 1128static void ops_complete_reconstruct(void *stripe_head_ref) 1129{ 1130 struct stripe_head *sh = stripe_head_ref; 1131 int disks = sh->disks; 1132 int pd_idx = sh->pd_idx; 1133 int qd_idx = sh->qd_idx; 1134 int i; 1135 bool fua = false; 1136 1137 pr_debug("%s: stripe %llu\n", __func__, 1138 (unsigned long long)sh->sector); 1139 1140 for (i = disks; i--; ) 1141 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1142 1143 for (i = disks; i--; ) { 1144 struct r5dev *dev = &sh->dev[i]; 1145 1146 if (dev->written || i == pd_idx || i == qd_idx) { 1147 set_bit(R5_UPTODATE, &dev->flags); 1148 if (fua) 1149 set_bit(R5_WantFUA, &dev->flags); 1150 } 1151 } 1152 1153 if (sh->reconstruct_state == reconstruct_state_drain_run) 1154 sh->reconstruct_state = reconstruct_state_drain_result; 1155 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1156 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1157 else { 1158 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1159 sh->reconstruct_state = reconstruct_state_result; 1160 } 1161 1162 set_bit(STRIPE_HANDLE, &sh->state); 1163 release_stripe(sh); 1164} 1165 1166static void 1167ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 
1168 struct dma_async_tx_descriptor *tx) 1169{ 1170 int disks = sh->disks; 1171 struct page **xor_srcs = percpu->scribble; 1172 struct async_submit_ctl submit; 1173 int count = 0, pd_idx = sh->pd_idx, i; 1174 struct page *xor_dest; 1175 int prexor = 0; 1176 unsigned long flags; 1177 1178 pr_debug("%s: stripe %llu\n", __func__, 1179 (unsigned long long)sh->sector); 1180 1181 /* check if prexor is active which means only process blocks 1182 * that are part of a read-modify-write (written) 1183 */ 1184 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1185 prexor = 1; 1186 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1187 for (i = disks; i--; ) { 1188 struct r5dev *dev = &sh->dev[i]; 1189 if (dev->written) 1190 xor_srcs[count++] = dev->page; 1191 } 1192 } else { 1193 xor_dest = sh->dev[pd_idx].page; 1194 for (i = disks; i--; ) { 1195 struct r5dev *dev = &sh->dev[i]; 1196 if (i != pd_idx) 1197 xor_srcs[count++] = dev->page; 1198 } 1199 } 1200 1201 /* 1/ if we prexor'd then the dest is reused as a source 1202 * 2/ if we did not prexor then we are redoing the parity 1203 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1204 * for the synchronous xor case 1205 */ 1206 flags = ASYNC_TX_ACK | 1207 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1208 1209 atomic_inc(&sh->count); 1210 1211 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1212 to_addr_conv(sh, percpu)); 1213 if (unlikely(count == 1)) 1214 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1215 else 1216 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1217} 1218 1219static void 1220ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1221 struct dma_async_tx_descriptor *tx) 1222{ 1223 struct async_submit_ctl submit; 1224 struct page **blocks = percpu->scribble; 1225 int count; 1226 1227 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1228 1229 count = set_syndrome_sources(blocks, sh); 1230 1231 atomic_inc(&sh->count); 1232 1233 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1234 sh, to_addr_conv(sh, percpu)); 1235 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1236} 1237 1238static void ops_complete_check(void *stripe_head_ref) 1239{ 1240 struct stripe_head *sh = stripe_head_ref; 1241 1242 pr_debug("%s: stripe %llu\n", __func__, 1243 (unsigned long long)sh->sector); 1244 1245 sh->check_state = check_state_check_result; 1246 set_bit(STRIPE_HANDLE, &sh->state); 1247 release_stripe(sh); 1248} 1249 1250static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1251{ 1252 int disks = sh->disks; 1253 int pd_idx = sh->pd_idx; 1254 int qd_idx = sh->qd_idx; 1255 struct page *xor_dest; 1256 struct page **xor_srcs = percpu->scribble; 1257 struct dma_async_tx_descriptor *tx; 1258 struct async_submit_ctl submit; 1259 int count; 1260 int i; 1261 1262 pr_debug("%s: stripe %llu\n", __func__, 1263 (unsigned long long)sh->sector); 1264 1265 count = 0; 1266 xor_dest = sh->dev[pd_idx].page; 1267 xor_srcs[count++] = xor_dest; 1268 for (i = disks; i--; ) { 1269 if (i == pd_idx || i == qd_idx) 1270 continue; 1271 xor_srcs[count++] = sh->dev[i].page; 1272 } 1273 1274 init_async_submit(&submit, 0, NULL, NULL, NULL, 1275 to_addr_conv(sh, percpu)); 1276 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1277 &sh->ops.zero_sum_result, &submit); 1278 1279 atomic_inc(&sh->count); 1280 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, 
NULL); 1281 tx = async_trigger_callback(&submit); 1282} 1283 1284static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1285{ 1286 struct page **srcs = percpu->scribble; 1287 struct async_submit_ctl submit; 1288 int count; 1289 1290 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1291 (unsigned long long)sh->sector, checkp); 1292 1293 count = set_syndrome_sources(srcs, sh); 1294 if (!checkp) 1295 srcs[count] = NULL; 1296 1297 atomic_inc(&sh->count); 1298 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1299 sh, to_addr_conv(sh, percpu)); 1300 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1301 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1302} 1303 1304static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1305{ 1306 int overlap_clear = 0, i, disks = sh->disks; 1307 struct dma_async_tx_descriptor *tx = NULL; 1308 struct r5conf *conf = sh->raid_conf; 1309 int level = conf->level; 1310 struct raid5_percpu *percpu; 1311 unsigned long cpu; 1312 1313 cpu = get_cpu(); 1314 percpu = per_cpu_ptr(conf->percpu, cpu); 1315 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1316 ops_run_biofill(sh); 1317 overlap_clear++; 1318 } 1319 1320 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1321 if (level < 6) 1322 tx = ops_run_compute5(sh, percpu); 1323 else { 1324 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1325 tx = ops_run_compute6_1(sh, percpu); 1326 else 1327 tx = ops_run_compute6_2(sh, percpu); 1328 } 1329 /* terminate the chain if reconstruct is not set to be run */ 1330 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1331 async_tx_ack(tx); 1332 } 1333 1334 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1335 tx = ops_run_prexor(sh, percpu, tx); 1336 1337 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1338 tx = ops_run_biodrain(sh, tx); 1339 overlap_clear++; 1340 } 1341 1342 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1343 if (level < 6) 1344 ops_run_reconstruct5(sh, percpu, tx); 1345 else 1346 ops_run_reconstruct6(sh, percpu, tx); 1347 } 1348 1349 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1350 if (sh->check_state == check_state_run) 1351 ops_run_check_p(sh, percpu); 1352 else if (sh->check_state == check_state_run_q) 1353 ops_run_check_pq(sh, percpu, 0); 1354 else if (sh->check_state == check_state_run_pq) 1355 ops_run_check_pq(sh, percpu, 1); 1356 else 1357 BUG(); 1358 } 1359 1360 if (overlap_clear) 1361 for (i = disks; i--; ) { 1362 struct r5dev *dev = &sh->dev[i]; 1363 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1364 wake_up(&sh->raid_conf->wait_for_overlap); 1365 } 1366 put_cpu(); 1367} 1368 1369#ifdef CONFIG_MULTICORE_RAID456 1370static void async_run_ops(void *param, async_cookie_t cookie) 1371{ 1372 struct stripe_head *sh = param; 1373 unsigned long ops_request = sh->ops.request; 1374 1375 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1376 wake_up(&sh->ops.wait_for_ops); 1377 1378 __raid_run_ops(sh, ops_request); 1379 release_stripe(sh); 1380} 1381 1382static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1383{ 1384 /* since handle_stripe can be called outside of raid5d context 1385 * we need to ensure sh->ops.request is de-staged before another 1386 * request arrives 1387 */ 1388 wait_event(sh->ops.wait_for_ops, 1389 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); 1390 sh->ops.request = ops_request; 1391 1392 atomic_inc(&sh->count); 1393 async_schedule(async_run_ops, sh); 1394} 1395#else 1396#define raid_run_ops 
__raid_run_ops 1397#endif 1398 1399static int grow_one_stripe(struct r5conf *conf) 1400{ 1401 struct stripe_head *sh; 1402 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1403 if (!sh) 1404 return 0; 1405 1406 sh->raid_conf = conf; 1407 #ifdef CONFIG_MULTICORE_RAID456 1408 init_waitqueue_head(&sh->ops.wait_for_ops); 1409 #endif 1410 1411 if (grow_buffers(sh)) { 1412 shrink_buffers(sh); 1413 kmem_cache_free(conf->slab_cache, sh); 1414 return 0; 1415 } 1416 /* we just created an active stripe so... */ 1417 atomic_set(&sh->count, 1); 1418 atomic_inc(&conf->active_stripes); 1419 INIT_LIST_HEAD(&sh->lru); 1420 release_stripe(sh); 1421 return 1; 1422} 1423 1424static int grow_stripes(struct r5conf *conf, int num) 1425{ 1426 struct kmem_cache *sc; 1427 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1428 1429 if (conf->mddev->gendisk) 1430 sprintf(conf->cache_name[0], 1431 "raid%d-%s", conf->level, mdname(conf->mddev)); 1432 else 1433 sprintf(conf->cache_name[0], 1434 "raid%d-%p", conf->level, conf->mddev); 1435 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 1436 1437 conf->active_name = 0; 1438 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1439 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 1440 0, 0, NULL); 1441 if (!sc) 1442 return 1; 1443 conf->slab_cache = sc; 1444 conf->pool_size = devs; 1445 while (num--) 1446 if (!grow_one_stripe(conf)) 1447 return 1; 1448 return 0; 1449} 1450 1451/** 1452 * scribble_len - return the required size of the scribble region 1453 * @num - total number of disks in the array 1454 * 1455 * The size must be enough to contain: 1456 * 1/ a struct page pointer for each device in the array +2 1457 * 2/ room to convert each entry in (1) to its corresponding dma 1458 * (dma_map_page()) or page (page_address()) address. 1459 * 1460 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 1461 * calculate over all devices (not just the data blocks), using zeros in place 1462 * of the P and Q blocks. 1463 */ 1464static size_t scribble_len(int num) 1465{ 1466 size_t len; 1467 1468 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 1469 1470 return len; 1471} 1472 1473static int resize_stripes(struct r5conf *conf, int newsize) 1474{ 1475 /* Make all the stripes able to hold 'newsize' devices. 1476 * New slots in each stripe get 'page' set to a new page. 1477 * 1478 * This happens in stages: 1479 * 1/ create a new kmem_cache and allocate the required number of 1480 * stripe_heads. 1481 * 2/ gather all the old stripe_heads and transfer the pages across 1482 * to the new stripe_heads. This will have the side effect of 1483 * freezing the array as once all stripe_heads have been collected, 1484 * no IO will be possible. Old stripe heads are freed once their 1485 * pages have been transferred over, and the old kmem_cache is 1486 * freed when all stripes are done. 1487 * 3/ reallocate conf->disks to be suitably bigger. If this fails, 1488 * we simply return a failure status - no need to clean anything up. 1489 * 4/ allocate new pages for the new slots in the new stripe_heads. 1490 * If this fails, we don't bother trying to shrink the 1491 * stripe_heads down again, we just leave them as they are. 1492 * As each stripe_head is processed the new one is released into 1493 * active service. 1494 * 1495 * Once step 2 is started, we cannot afford to wait for a write, 1496 * so we use GFP_NOIO allocations. 
1497 */ 1498 struct stripe_head *osh, *nsh; 1499 LIST_HEAD(newstripes); 1500 struct disk_info *ndisks; 1501 unsigned long cpu; 1502 int err; 1503 struct kmem_cache *sc; 1504 int i; 1505 1506 if (newsize <= conf->pool_size) 1507 return 0; /* never bother to shrink */ 1508 1509 err = md_allow_write(conf->mddev); 1510 if (err) 1511 return err; 1512 1513 /* Step 1 */ 1514 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1515 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1516 0, 0, NULL); 1517 if (!sc) 1518 return -ENOMEM; 1519 1520 for (i = conf->max_nr_stripes; i; i--) { 1521 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1522 if (!nsh) 1523 break; 1524 1525 nsh->raid_conf = conf; 1526 #ifdef CONFIG_MULTICORE_RAID456 1527 init_waitqueue_head(&nsh->ops.wait_for_ops); 1528 #endif 1529 1530 list_add(&nsh->lru, &newstripes); 1531 } 1532 if (i) { 1533 /* didn't get enough, give up */ 1534 while (!list_empty(&newstripes)) { 1535 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1536 list_del(&nsh->lru); 1537 kmem_cache_free(sc, nsh); 1538 } 1539 kmem_cache_destroy(sc); 1540 return -ENOMEM; 1541 } 1542 /* Step 2 - Must use GFP_NOIO now. 1543 * OK, we have enough stripes, start collecting inactive 1544 * stripes and copying them over 1545 */ 1546 list_for_each_entry(nsh, &newstripes, lru) { 1547 spin_lock_irq(&conf->device_lock); 1548 wait_event_lock_irq(conf->wait_for_stripe, 1549 !list_empty(&conf->inactive_list), 1550 conf->device_lock, 1551 ); 1552 osh = get_free_stripe(conf); 1553 spin_unlock_irq(&conf->device_lock); 1554 atomic_set(&nsh->count, 1); 1555 for(i=0; i<conf->pool_size; i++) 1556 nsh->dev[i].page = osh->dev[i].page; 1557 for( ; i<newsize; i++) 1558 nsh->dev[i].page = NULL; 1559 kmem_cache_free(conf->slab_cache, osh); 1560 } 1561 kmem_cache_destroy(conf->slab_cache); 1562 1563 /* Step 3. 
1564 * At this point, we are holding all the stripes so the array 1565 * is completely stalled, so now is a good time to resize 1566 * conf->disks and the scribble region 1567 */ 1568 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1569 if (ndisks) { 1570 for (i=0; i<conf->raid_disks; i++) 1571 ndisks[i] = conf->disks[i]; 1572 kfree(conf->disks); 1573 conf->disks = ndisks; 1574 } else 1575 err = -ENOMEM; 1576 1577 get_online_cpus(); 1578 conf->scribble_len = scribble_len(newsize); 1579 for_each_present_cpu(cpu) { 1580 struct raid5_percpu *percpu; 1581 void *scribble; 1582 1583 percpu = per_cpu_ptr(conf->percpu, cpu); 1584 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1585 1586 if (scribble) { 1587 kfree(percpu->scribble); 1588 percpu->scribble = scribble; 1589 } else { 1590 err = -ENOMEM; 1591 break; 1592 } 1593 } 1594 put_online_cpus(); 1595 1596 /* Step 4, return new stripes to service */ 1597 while(!list_empty(&newstripes)) { 1598 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1599 list_del_init(&nsh->lru); 1600 1601 for (i=conf->raid_disks; i < newsize; i++) 1602 if (nsh->dev[i].page == NULL) { 1603 struct page *p = alloc_page(GFP_NOIO); 1604 nsh->dev[i].page = p; 1605 if (!p) 1606 err = -ENOMEM; 1607 } 1608 release_stripe(nsh); 1609 } 1610 /* critical section pass, GFP_NOIO no longer needed */ 1611 1612 conf->slab_cache = sc; 1613 conf->active_name = 1-conf->active_name; 1614 conf->pool_size = newsize; 1615 return err; 1616} 1617 1618static int drop_one_stripe(struct r5conf *conf) 1619{ 1620 struct stripe_head *sh; 1621 1622 spin_lock_irq(&conf->device_lock); 1623 sh = get_free_stripe(conf); 1624 spin_unlock_irq(&conf->device_lock); 1625 if (!sh) 1626 return 0; 1627 BUG_ON(atomic_read(&sh->count)); 1628 shrink_buffers(sh); 1629 kmem_cache_free(conf->slab_cache, sh); 1630 atomic_dec(&conf->active_stripes); 1631 return 1; 1632} 1633 1634static void shrink_stripes(struct r5conf *conf) 1635{ 1636 while (drop_one_stripe(conf)) 1637 ; 1638 1639 if (conf->slab_cache) 1640 kmem_cache_destroy(conf->slab_cache); 1641 conf->slab_cache = NULL; 1642} 1643 1644static void raid5_end_read_request(struct bio * bi, int error) 1645{ 1646 struct stripe_head *sh = bi->bi_private; 1647 struct r5conf *conf = sh->raid_conf; 1648 int disks = sh->disks, i; 1649 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1650 char b[BDEVNAME_SIZE]; 1651 struct md_rdev *rdev = NULL; 1652 1653 1654 for (i=0 ; i<disks; i++) 1655 if (bi == &sh->dev[i].req) 1656 break; 1657 1658 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1659 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1660 uptodate); 1661 if (i == disks) { 1662 BUG(); 1663 return; 1664 } 1665 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1666 /* If replacement finished while this request was outstanding, 1667 * 'replacement' might be NULL already. 1668 * In that case it moved down to 'rdev'. 1669 * rdev is not removed until all requests are finished. 1670 */ 1671 rdev = conf->disks[i].replacement; 1672 if (!rdev) 1673 rdev = conf->disks[i].rdev; 1674 1675 if (uptodate) { 1676 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1677 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1678 /* Note that this cannot happen on a 1679 * replacement device. 
We just fail those on 1680 * any error 1681 */ 1682 printk_ratelimited( 1683 KERN_INFO 1684 "md/raid:%s: read error corrected" 1685 " (%lu sectors at %llu on %s)\n", 1686 mdname(conf->mddev), STRIPE_SECTORS, 1687 (unsigned long long)(sh->sector 1688 + rdev->data_offset), 1689 bdevname(rdev->bdev, b)); 1690 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1691 clear_bit(R5_ReadError, &sh->dev[i].flags); 1692 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1693 } 1694 if (atomic_read(&rdev->read_errors)) 1695 atomic_set(&rdev->read_errors, 0); 1696 } else { 1697 const char *bdn = bdevname(rdev->bdev, b); 1698 int retry = 0; 1699 1700 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1701 atomic_inc(&rdev->read_errors); 1702 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1703 printk_ratelimited( 1704 KERN_WARNING 1705 "md/raid:%s: read error on replacement device " 1706 "(sector %llu on %s).\n", 1707 mdname(conf->mddev), 1708 (unsigned long long)(sh->sector 1709 + rdev->data_offset), 1710 bdn); 1711 else if (conf->mddev->degraded >= conf->max_degraded) 1712 printk_ratelimited( 1713 KERN_WARNING 1714 "md/raid:%s: read error not correctable " 1715 "(sector %llu on %s).\n", 1716 mdname(conf->mddev), 1717 (unsigned long long)(sh->sector 1718 + rdev->data_offset), 1719 bdn); 1720 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1721 /* Oh, no!!! */ 1722 printk_ratelimited( 1723 KERN_WARNING 1724 "md/raid:%s: read error NOT corrected!! " 1725 "(sector %llu on %s).\n", 1726 mdname(conf->mddev), 1727 (unsigned long long)(sh->sector 1728 + rdev->data_offset), 1729 bdn); 1730 else if (atomic_read(&rdev->read_errors) 1731 > conf->max_nr_stripes) 1732 printk(KERN_WARNING 1733 "md/raid:%s: Too many read errors, failing device %s.\n", 1734 mdname(conf->mddev), bdn); 1735 else 1736 retry = 1; 1737 if (retry) 1738 set_bit(R5_ReadError, &sh->dev[i].flags); 1739 else { 1740 clear_bit(R5_ReadError, &sh->dev[i].flags); 1741 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1742 md_error(conf->mddev, rdev); 1743 } 1744 } 1745 rdev_dec_pending(rdev, conf->mddev); 1746 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1747 set_bit(STRIPE_HANDLE, &sh->state); 1748 release_stripe(sh); 1749} 1750 1751static void raid5_end_write_request(struct bio *bi, int error) 1752{ 1753 struct stripe_head *sh = bi->bi_private; 1754 struct r5conf *conf = sh->raid_conf; 1755 int disks = sh->disks, i; 1756 struct md_rdev *uninitialized_var(rdev); 1757 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1758 sector_t first_bad; 1759 int bad_sectors; 1760 int replacement = 0; 1761 1762 for (i = 0 ; i < disks; i++) { 1763 if (bi == &sh->dev[i].req) { 1764 rdev = conf->disks[i].rdev; 1765 break; 1766 } 1767 if (bi == &sh->dev[i].rreq) { 1768 rdev = conf->disks[i].replacement; 1769 if (rdev) 1770 replacement = 1; 1771 else 1772 /* rdev was removed and 'replacement' 1773 * replaced it. rdev is not removed 1774 * until all requests are finished. 
1775 */ 1776 rdev = conf->disks[i].rdev; 1777 break; 1778 } 1779 } 1780 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1781 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1782 uptodate); 1783 if (i == disks) { 1784 BUG(); 1785 return; 1786 } 1787 1788 if (replacement) { 1789 if (!uptodate) 1790 md_error(conf->mddev, rdev); 1791 else if (is_badblock(rdev, sh->sector, 1792 STRIPE_SECTORS, 1793 &first_bad, &bad_sectors)) 1794 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1795 } else { 1796 if (!uptodate) { 1797 set_bit(WriteErrorSeen, &rdev->flags); 1798 set_bit(R5_WriteError, &sh->dev[i].flags); 1799 } else if (is_badblock(rdev, sh->sector, 1800 STRIPE_SECTORS, 1801 &first_bad, &bad_sectors)) 1802 set_bit(R5_MadeGood, &sh->dev[i].flags); 1803 } 1804 rdev_dec_pending(rdev, conf->mddev); 1805 1806 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1807 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1808 set_bit(STRIPE_HANDLE, &sh->state); 1809 release_stripe(sh); 1810} 1811 1812static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1813 1814static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1815{ 1816 struct r5dev *dev = &sh->dev[i]; 1817 1818 bio_init(&dev->req); 1819 dev->req.bi_io_vec = &dev->vec; 1820 dev->req.bi_vcnt++; 1821 dev->req.bi_max_vecs++; 1822 dev->req.bi_private = sh; 1823 dev->vec.bv_page = dev->page; 1824 1825 bio_init(&dev->rreq); 1826 dev->rreq.bi_io_vec = &dev->rvec; 1827 dev->rreq.bi_vcnt++; 1828 dev->rreq.bi_max_vecs++; 1829 dev->rreq.bi_private = sh; 1830 dev->rvec.bv_page = dev->page; 1831 1832 dev->flags = 0; 1833 dev->sector = compute_blocknr(sh, i, previous); 1834} 1835 1836static void error(struct mddev *mddev, struct md_rdev *rdev) 1837{ 1838 char b[BDEVNAME_SIZE]; 1839 struct r5conf *conf = mddev->private; 1840 unsigned long flags; 1841 pr_debug("raid456: error called\n"); 1842 1843 spin_lock_irqsave(&conf->device_lock, flags); 1844 clear_bit(In_sync, &rdev->flags); 1845 mddev->degraded = calc_degraded(conf); 1846 spin_unlock_irqrestore(&conf->device_lock, flags); 1847 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1848 1849 set_bit(Blocked, &rdev->flags); 1850 set_bit(Faulty, &rdev->flags); 1851 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1852 printk(KERN_ALERT 1853 "md/raid:%s: Disk failure on %s, disabling device.\n" 1854 "md/raid:%s: Operation continuing on %d devices.\n", 1855 mdname(mddev), 1856 bdevname(rdev->bdev, b), 1857 mdname(mddev), 1858 conf->raid_disks - mddev->degraded); 1859} 1860 1861/* 1862 * Input: a 'big' sector number, 1863 * Output: index of the data and parity disk, and the sector # in them. 1864 */ 1865static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1866 int previous, int *dd_idx, 1867 struct stripe_head *sh) 1868{ 1869 sector_t stripe, stripe2; 1870 sector_t chunk_number; 1871 unsigned int chunk_offset; 1872 int pd_idx, qd_idx; 1873 int ddf_layout = 0; 1874 sector_t new_sector; 1875 int algorithm = previous ? conf->prev_algo 1876 : conf->algorithm; 1877 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1878 : conf->chunk_sectors; 1879 int raid_disks = previous ? 
conf->previous_raid_disks 1880 : conf->raid_disks; 1881 int data_disks = raid_disks - conf->max_degraded; 1882 1883 /* First compute the information on this sector */ 1884 1885 /* 1886 * Compute the chunk number and the sector offset inside the chunk 1887 */ 1888 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1889 chunk_number = r_sector; 1890 1891 /* 1892 * Compute the stripe number 1893 */ 1894 stripe = chunk_number; 1895 *dd_idx = sector_div(stripe, data_disks); 1896 stripe2 = stripe; 1897 /* 1898 * Select the parity disk based on the user selected algorithm. 1899 */ 1900 pd_idx = qd_idx = -1; 1901 switch(conf->level) { 1902 case 4: 1903 pd_idx = data_disks; 1904 break; 1905 case 5: 1906 switch (algorithm) { 1907 case ALGORITHM_LEFT_ASYMMETRIC: 1908 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1909 if (*dd_idx >= pd_idx) 1910 (*dd_idx)++; 1911 break; 1912 case ALGORITHM_RIGHT_ASYMMETRIC: 1913 pd_idx = sector_div(stripe2, raid_disks); 1914 if (*dd_idx >= pd_idx) 1915 (*dd_idx)++; 1916 break; 1917 case ALGORITHM_LEFT_SYMMETRIC: 1918 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1919 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1920 break; 1921 case ALGORITHM_RIGHT_SYMMETRIC: 1922 pd_idx = sector_div(stripe2, raid_disks); 1923 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1924 break; 1925 case ALGORITHM_PARITY_0: 1926 pd_idx = 0; 1927 (*dd_idx)++; 1928 break; 1929 case ALGORITHM_PARITY_N: 1930 pd_idx = data_disks; 1931 break; 1932 default: 1933 BUG(); 1934 } 1935 break; 1936 case 6: 1937 1938 switch (algorithm) { 1939 case ALGORITHM_LEFT_ASYMMETRIC: 1940 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1941 qd_idx = pd_idx + 1; 1942 if (pd_idx == raid_disks-1) { 1943 (*dd_idx)++; /* Q D D D P */ 1944 qd_idx = 0; 1945 } else if (*dd_idx >= pd_idx) 1946 (*dd_idx) += 2; /* D D P Q D */ 1947 break; 1948 case ALGORITHM_RIGHT_ASYMMETRIC: 1949 pd_idx = sector_div(stripe2, raid_disks); 1950 qd_idx = pd_idx + 1; 1951 if (pd_idx == raid_disks-1) { 1952 (*dd_idx)++; /* Q D D D P */ 1953 qd_idx = 0; 1954 } else if (*dd_idx >= pd_idx) 1955 (*dd_idx) += 2; /* D D P Q D */ 1956 break; 1957 case ALGORITHM_LEFT_SYMMETRIC: 1958 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1959 qd_idx = (pd_idx + 1) % raid_disks; 1960 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1961 break; 1962 case ALGORITHM_RIGHT_SYMMETRIC: 1963 pd_idx = sector_div(stripe2, raid_disks); 1964 qd_idx = (pd_idx + 1) % raid_disks; 1965 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1966 break; 1967 1968 case ALGORITHM_PARITY_0: 1969 pd_idx = 0; 1970 qd_idx = 1; 1971 (*dd_idx) += 2; 1972 break; 1973 case ALGORITHM_PARITY_N: 1974 pd_idx = data_disks; 1975 qd_idx = data_disks + 1; 1976 break; 1977 1978 case ALGORITHM_ROTATING_ZERO_RESTART: 1979 /* Exactly the same as RIGHT_ASYMMETRIC, but or 1980 * of blocks for computing Q is different. 
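 * (That is, with ddf_layout set the blocks feeding the Q computation are
 * taken in plain device order starting at device 0 - see raid6_d0() and
 * set_syndrome_sources() - whereas the md layouts start just after Q.)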
1981 */ 1982 pd_idx = sector_div(stripe2, raid_disks); 1983 qd_idx = pd_idx + 1; 1984 if (pd_idx == raid_disks-1) { 1985 (*dd_idx)++; /* Q D D D P */ 1986 qd_idx = 0; 1987 } else if (*dd_idx >= pd_idx) 1988 (*dd_idx) += 2; /* D D P Q D */ 1989 ddf_layout = 1; 1990 break; 1991 1992 case ALGORITHM_ROTATING_N_RESTART: 1993 /* Same as left_asymmetric, but first stripe is 1994 * D D D P Q rather than 1995 * Q D D D P 1996 */ 1997 stripe2 += 1; 1998 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1999 qd_idx = pd_idx + 1; 2000 if (pd_idx == raid_disks-1) { 2001 (*dd_idx)++; /* Q D D D P */ 2002 qd_idx = 0; 2003 } else if (*dd_idx >= pd_idx) 2004 (*dd_idx) += 2; /* D D P Q D */ 2005 ddf_layout = 1; 2006 break; 2007 2008 case ALGORITHM_ROTATING_N_CONTINUE: 2009 /* Same as left_symmetric but Q is before P */ 2010 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2011 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2012 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2013 ddf_layout = 1; 2014 break; 2015 2016 case ALGORITHM_LEFT_ASYMMETRIC_6: 2017 /* RAID5 left_asymmetric, with Q on last device */ 2018 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2019 if (*dd_idx >= pd_idx) 2020 (*dd_idx)++; 2021 qd_idx = raid_disks - 1; 2022 break; 2023 2024 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2025 pd_idx = sector_div(stripe2, raid_disks-1); 2026 if (*dd_idx >= pd_idx) 2027 (*dd_idx)++; 2028 qd_idx = raid_disks - 1; 2029 break; 2030 2031 case ALGORITHM_LEFT_SYMMETRIC_6: 2032 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2033 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2034 qd_idx = raid_disks - 1; 2035 break; 2036 2037 case ALGORITHM_RIGHT_SYMMETRIC_6: 2038 pd_idx = sector_div(stripe2, raid_disks-1); 2039 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2040 qd_idx = raid_disks - 1; 2041 break; 2042 2043 case ALGORITHM_PARITY_0_6: 2044 pd_idx = 0; 2045 (*dd_idx)++; 2046 qd_idx = raid_disks - 1; 2047 break; 2048 2049 default: 2050 BUG(); 2051 } 2052 break; 2053 } 2054 2055 if (sh) { 2056 sh->pd_idx = pd_idx; 2057 sh->qd_idx = qd_idx; 2058 sh->ddf_layout = ddf_layout; 2059 } 2060 /* 2061 * Finally, compute the new sector number 2062 */ 2063 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2064 return new_sector; 2065} 2066 2067 2068static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2069{ 2070 struct r5conf *conf = sh->raid_conf; 2071 int raid_disks = sh->disks; 2072 int data_disks = raid_disks - conf->max_degraded; 2073 sector_t new_sector = sh->sector, check; 2074 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2075 : conf->chunk_sectors; 2076 int algorithm = previous ? 
conf->prev_algo 2077 : conf->algorithm; 2078 sector_t stripe; 2079 int chunk_offset; 2080 sector_t chunk_number; 2081 int dummy1, dd_idx = i; 2082 sector_t r_sector; 2083 struct stripe_head sh2; 2084 2085 2086 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2087 stripe = new_sector; 2088 2089 if (i == sh->pd_idx) 2090 return 0; 2091 switch(conf->level) { 2092 case 4: break; 2093 case 5: 2094 switch (algorithm) { 2095 case ALGORITHM_LEFT_ASYMMETRIC: 2096 case ALGORITHM_RIGHT_ASYMMETRIC: 2097 if (i > sh->pd_idx) 2098 i--; 2099 break; 2100 case ALGORITHM_LEFT_SYMMETRIC: 2101 case ALGORITHM_RIGHT_SYMMETRIC: 2102 if (i < sh->pd_idx) 2103 i += raid_disks; 2104 i -= (sh->pd_idx + 1); 2105 break; 2106 case ALGORITHM_PARITY_0: 2107 i -= 1; 2108 break; 2109 case ALGORITHM_PARITY_N: 2110 break; 2111 default: 2112 BUG(); 2113 } 2114 break; 2115 case 6: 2116 if (i == sh->qd_idx) 2117 return 0; /* It is the Q disk */ 2118 switch (algorithm) { 2119 case ALGORITHM_LEFT_ASYMMETRIC: 2120 case ALGORITHM_RIGHT_ASYMMETRIC: 2121 case ALGORITHM_ROTATING_ZERO_RESTART: 2122 case ALGORITHM_ROTATING_N_RESTART: 2123 if (sh->pd_idx == raid_disks-1) 2124 i--; /* Q D D D P */ 2125 else if (i > sh->pd_idx) 2126 i -= 2; /* D D P Q D */ 2127 break; 2128 case ALGORITHM_LEFT_SYMMETRIC: 2129 case ALGORITHM_RIGHT_SYMMETRIC: 2130 if (sh->pd_idx == raid_disks-1) 2131 i--; /* Q D D D P */ 2132 else { 2133 /* D D P Q D */ 2134 if (i < sh->pd_idx) 2135 i += raid_disks; 2136 i -= (sh->pd_idx + 2); 2137 } 2138 break; 2139 case ALGORITHM_PARITY_0: 2140 i -= 2; 2141 break; 2142 case ALGORITHM_PARITY_N: 2143 break; 2144 case ALGORITHM_ROTATING_N_CONTINUE: 2145 /* Like left_symmetric, but P is before Q */ 2146 if (sh->pd_idx == 0) 2147 i--; /* P D D D Q */ 2148 else { 2149 /* D D Q P D */ 2150 if (i < sh->pd_idx) 2151 i += raid_disks; 2152 i -= (sh->pd_idx + 1); 2153 } 2154 break; 2155 case ALGORITHM_LEFT_ASYMMETRIC_6: 2156 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2157 if (i > sh->pd_idx) 2158 i--; 2159 break; 2160 case ALGORITHM_LEFT_SYMMETRIC_6: 2161 case ALGORITHM_RIGHT_SYMMETRIC_6: 2162 if (i < sh->pd_idx) 2163 i += data_disks + 1; 2164 i -= (sh->pd_idx + 1); 2165 break; 2166 case ALGORITHM_PARITY_0_6: 2167 i -= 1; 2168 break; 2169 default: 2170 BUG(); 2171 } 2172 break; 2173 } 2174 2175 chunk_number = stripe * data_disks + i; 2176 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2177 2178 check = raid5_compute_sector(conf, r_sector, 2179 previous, &dummy1, &sh2); 2180 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2181 || sh2.qd_idx != sh->qd_idx) { 2182 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2183 mdname(conf->mddev)); 2184 return 0; 2185 } 2186 return r_sector; 2187} 2188 2189 2190static void 2191schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2192 int rcw, int expand) 2193{ 2194 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2195 struct r5conf *conf = sh->raid_conf; 2196 int level = conf->level; 2197 2198 if (rcw) { 2199 /* if we are not expanding this is a proper write request, and 2200 * there will be bios with new data to be drained into the 2201 * stripe cache 2202 */ 2203 if (!expand) { 2204 sh->reconstruct_state = reconstruct_state_drain_run; 2205 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2206 } else 2207 sh->reconstruct_state = reconstruct_state_run; 2208 2209 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2210 2211 for (i = disks; i--; ) { 2212 struct r5dev *dev = &sh->dev[i]; 2213 2214 if (dev->towrite) { 2215 
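				/* Lock this block and mark it to be drained into the
				 * stripe cache; unless we are expanding, its old
				 * contents are about to be overwritten and are no
				 * longer up to date.
				 */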
set_bit(R5_LOCKED, &dev->flags); 2216 set_bit(R5_Wantdrain, &dev->flags); 2217 if (!expand) 2218 clear_bit(R5_UPTODATE, &dev->flags); 2219 s->locked++; 2220 } 2221 } 2222 if (s->locked + conf->max_degraded == disks) 2223 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2224 atomic_inc(&conf->pending_full_writes); 2225 } else { 2226 BUG_ON(level == 6); 2227 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2228 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2229 2230 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2231 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2232 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2233 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2234 2235 for (i = disks; i--; ) { 2236 struct r5dev *dev = &sh->dev[i]; 2237 if (i == pd_idx) 2238 continue; 2239 2240 if (dev->towrite && 2241 (test_bit(R5_UPTODATE, &dev->flags) || 2242 test_bit(R5_Wantcompute, &dev->flags))) { 2243 set_bit(R5_Wantdrain, &dev->flags); 2244 set_bit(R5_LOCKED, &dev->flags); 2245 clear_bit(R5_UPTODATE, &dev->flags); 2246 s->locked++; 2247 } 2248 } 2249 } 2250 2251 /* keep the parity disk(s) locked while asynchronous operations 2252 * are in flight 2253 */ 2254 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2255 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2256 s->locked++; 2257 2258 if (level == 6) { 2259 int qd_idx = sh->qd_idx; 2260 struct r5dev *dev = &sh->dev[qd_idx]; 2261 2262 set_bit(R5_LOCKED, &dev->flags); 2263 clear_bit(R5_UPTODATE, &dev->flags); 2264 s->locked++; 2265 } 2266 2267 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2268 __func__, (unsigned long long)sh->sector, 2269 s->locked, s->ops_request); 2270} 2271 2272/* 2273 * Each stripe/dev can have one or more bion attached. 2274 * toread/towrite point to the first in a chain. 2275 * The bi_next chain must be in order. 
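 * add_stripe_bio() inserts a new bio into the right chain keeping it
 * sorted by bi_sector (e.g. towrite -> bio for sectors 0-3 -> bio for
 * sectors 4-7 -> NULL); if the new bio overlaps one that is already
 * queued it sets R5_Overlap and returns 0 so the caller can wait and
 * retry.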
2276 */ 2277static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2278{ 2279 struct bio **bip; 2280 struct r5conf *conf = sh->raid_conf; 2281 int firstwrite=0; 2282 2283 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2284 (unsigned long long)bi->bi_sector, 2285 (unsigned long long)sh->sector); 2286 2287 2288 spin_lock_irq(&conf->device_lock); 2289 if (forwrite) { 2290 bip = &sh->dev[dd_idx].towrite; 2291 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2292 firstwrite = 1; 2293 } else 2294 bip = &sh->dev[dd_idx].toread; 2295 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2296 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2297 goto overlap; 2298 bip = & (*bip)->bi_next; 2299 } 2300 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2301 goto overlap; 2302 2303 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2304 if (*bip) 2305 bi->bi_next = *bip; 2306 *bip = bi; 2307 bi->bi_phys_segments++; 2308 2309 if (forwrite) { 2310 /* check if page is covered */ 2311 sector_t sector = sh->dev[dd_idx].sector; 2312 for (bi=sh->dev[dd_idx].towrite; 2313 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2314 bi && bi->bi_sector <= sector; 2315 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2316 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2317 sector = bi->bi_sector + (bi->bi_size>>9); 2318 } 2319 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2320 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2321 } 2322 spin_unlock_irq(&conf->device_lock); 2323 2324 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2325 (unsigned long long)(*bip)->bi_sector, 2326 (unsigned long long)sh->sector, dd_idx); 2327 2328 if (conf->mddev->bitmap && firstwrite) { 2329 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2330 STRIPE_SECTORS, 0); 2331 sh->bm_seq = conf->seq_flush+1; 2332 set_bit(STRIPE_BIT_DELAY, &sh->state); 2333 } 2334 return 1; 2335 2336 overlap: 2337 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2338 spin_unlock_irq(&conf->device_lock); 2339 return 0; 2340} 2341 2342static void end_reshape(struct r5conf *conf); 2343 2344static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2345 struct stripe_head *sh) 2346{ 2347 int sectors_per_chunk = 2348 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2349 int dd_idx; 2350 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2351 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2352 2353 raid5_compute_sector(conf, 2354 stripe * (disks - conf->max_degraded) 2355 *sectors_per_chunk + chunk_offset, 2356 previous, 2357 &dd_idx, sh); 2358} 2359 2360static void 2361handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2362 struct stripe_head_state *s, int disks, 2363 struct bio **return_bi) 2364{ 2365 int i; 2366 for (i = disks; i--; ) { 2367 struct bio *bi; 2368 int bitmap_end = 0; 2369 2370 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2371 struct md_rdev *rdev; 2372 rcu_read_lock(); 2373 rdev = rcu_dereference(conf->disks[i].rdev); 2374 if (rdev && test_bit(In_sync, &rdev->flags)) 2375 atomic_inc(&rdev->nr_pending); 2376 else 2377 rdev = NULL; 2378 rcu_read_unlock(); 2379 if (rdev) { 2380 if (!rdev_set_badblocks( 2381 rdev, 2382 sh->sector, 2383 STRIPE_SECTORS, 0)) 2384 md_error(conf->mddev, rdev); 2385 rdev_dec_pending(rdev, conf->mddev); 2386 } 2387 } 2388 spin_lock_irq(&conf->device_lock); 2389 /* fail all writes first */ 2390 bi = sh->dev[i].towrite; 2391 sh->dev[i].towrite = NULL; 2392 if (bi) { 2393 s->to_write--; 2394 bitmap_end = 1; 2395 } 2396 2397 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2398 wake_up(&conf->wait_for_overlap); 2399 2400 while (bi && bi->bi_sector < 2401 sh->dev[i].sector + STRIPE_SECTORS) { 2402 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2403 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2404 if (!raid5_dec_bi_phys_segments(bi)) { 2405 md_write_end(conf->mddev); 2406 bi->bi_next = *return_bi; 2407 *return_bi = bi; 2408 } 2409 bi = nextbi; 2410 } 2411 /* and fail all 'written' */ 2412 bi = sh->dev[i].written; 2413 sh->dev[i].written = NULL; 2414 if (bi) bitmap_end = 1; 2415 while (bi && bi->bi_sector < 2416 sh->dev[i].sector + STRIPE_SECTORS) { 2417 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2418 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2419 if (!raid5_dec_bi_phys_segments(bi)) { 2420 md_write_end(conf->mddev); 2421 bi->bi_next = *return_bi; 2422 *return_bi = bi; 2423 } 2424 bi = bi2; 2425 } 2426 2427 /* fail any reads if this device is non-operational and 2428 * the data has not reached the cache yet. 
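 * (If R5_Wantfill is set the data is already in the stripe cache and
 * will be copied out to the waiting bios, so those reads are left
 * alone.)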
2429 */ 2430 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2431 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2432 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2433 bi = sh->dev[i].toread; 2434 sh->dev[i].toread = NULL; 2435 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2436 wake_up(&conf->wait_for_overlap); 2437 if (bi) s->to_read--; 2438 while (bi && bi->bi_sector < 2439 sh->dev[i].sector + STRIPE_SECTORS) { 2440 struct bio *nextbi = 2441 r5_next_bio(bi, sh->dev[i].sector); 2442 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2443 if (!raid5_dec_bi_phys_segments(bi)) { 2444 bi->bi_next = *return_bi; 2445 *return_bi = bi; 2446 } 2447 bi = nextbi; 2448 } 2449 } 2450 spin_unlock_irq(&conf->device_lock); 2451 if (bitmap_end) 2452 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2453 STRIPE_SECTORS, 0, 0); 2454 /* If we were in the middle of a write the parity block might 2455 * still be locked - so just clear all R5_LOCKED flags 2456 */ 2457 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2458 } 2459 2460 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2461 if (atomic_dec_and_test(&conf->pending_full_writes)) 2462 md_wakeup_thread(conf->mddev->thread); 2463} 2464 2465static void 2466handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2467 struct stripe_head_state *s) 2468{ 2469 int abort = 0; 2470 int i; 2471 2472 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2473 clear_bit(STRIPE_SYNCING, &sh->state); 2474 s->syncing = 0; 2475 s->replacing = 0; 2476 /* There is nothing more to do for sync/check/repair. 2477 * For recover/replace we need to record a bad block on all 2478 * non-sync devices, or abort the recovery 2479 */ 2480 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2481 return; 2482 /* During recovery devices cannot be removed, so locking and 2483 * refcounting of rdevs is not needed 2484 */ 2485 for (i = 0; i < conf->raid_disks; i++) { 2486 struct md_rdev *rdev = conf->disks[i].rdev; 2487 if (rdev 2488 && !test_bit(Faulty, &rdev->flags) 2489 && !test_bit(In_sync, &rdev->flags) 2490 && !rdev_set_badblocks(rdev, sh->sector, 2491 STRIPE_SECTORS, 0)) 2492 abort = 1; 2493 rdev = conf->disks[i].replacement; 2494 if (rdev 2495 && !test_bit(Faulty, &rdev->flags) 2496 && !test_bit(In_sync, &rdev->flags) 2497 && !rdev_set_badblocks(rdev, sh->sector, 2498 STRIPE_SECTORS, 0)) 2499 abort = 1; 2500 } 2501 if (abort) { 2502 conf->recovery_disabled = conf->mddev->recovery_disabled; 2503 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); 2504 } 2505} 2506 2507static int want_replace(struct stripe_head *sh, int disk_idx) 2508{ 2509 struct md_rdev *rdev; 2510 int rv = 0; 2511 /* Doing recovery so rcu locking not required */ 2512 rdev = sh->raid_conf->disks[disk_idx].replacement; 2513 if (rdev 2514 && !test_bit(Faulty, &rdev->flags) 2515 && !test_bit(In_sync, &rdev->flags) 2516 && (rdev->recovery_offset <= sh->sector 2517 || rdev->mddev->recovery_cp <= sh->sector)) 2518 rv = 1; 2519 2520 return rv; 2521} 2522 2523/* fetch_block - checks the given member device to see if its data needs 2524 * to be read or computed to satisfy a request. 
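 * A block is computed from the remaining blocks when its device has
 * failed and enough of the others are up to date; otherwise it is read
 * from the backing device, provided that is in-sync.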
2525 * 2526 * Returns 1 when no more member devices need to be checked, otherwise returns 2527 * 0 to tell the loop in handle_stripe_fill to continue 2528 */ 2529static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2530 int disk_idx, int disks) 2531{ 2532 struct r5dev *dev = &sh->dev[disk_idx]; 2533 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2534 &sh->dev[s->failed_num[1]] }; 2535 2536 /* is the data in this block needed, and can we get it? */ 2537 if (!test_bit(R5_LOCKED, &dev->flags) && 2538 !test_bit(R5_UPTODATE, &dev->flags) && 2539 (dev->toread || 2540 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2541 s->syncing || s->expanding || 2542 (s->replacing && want_replace(sh, disk_idx)) || 2543 (s->failed >= 1 && fdev[0]->toread) || 2544 (s->failed >= 2 && fdev[1]->toread) || 2545 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2546 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2547 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2548 /* we would like to get this block, possibly by computing it, 2549 * otherwise read it if the backing disk is insync 2550 */ 2551 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2552 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2553 if ((s->uptodate == disks - 1) && 2554 (s->failed && (disk_idx == s->failed_num[0] || 2555 disk_idx == s->failed_num[1]))) { 2556 /* have disk failed, and we're requested to fetch it; 2557 * do compute it 2558 */ 2559 pr_debug("Computing stripe %llu block %d\n", 2560 (unsigned long long)sh->sector, disk_idx); 2561 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2562 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2563 set_bit(R5_Wantcompute, &dev->flags); 2564 sh->ops.target = disk_idx; 2565 sh->ops.target2 = -1; /* no 2nd target */ 2566 s->req_compute = 1; 2567 /* Careful: from this point on 'uptodate' is in the eye 2568 * of raid_run_ops which services 'compute' operations 2569 * before writes. R5_Wantcompute flags a block that will 2570 * be R5_UPTODATE by the time it is needed for a 2571 * subsequent operation. 2572 */ 2573 s->uptodate++; 2574 return 1; 2575 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2576 /* Computing 2-failure is *very* expensive; only 2577 * do it if failed >= 2 2578 */ 2579 int other; 2580 for (other = disks; other--; ) { 2581 if (other == disk_idx) 2582 continue; 2583 if (!test_bit(R5_UPTODATE, 2584 &sh->dev[other].flags)) 2585 break; 2586 } 2587 BUG_ON(other < 0); 2588 pr_debug("Computing stripe %llu blocks %d,%d\n", 2589 (unsigned long long)sh->sector, 2590 disk_idx, other); 2591 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2592 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2593 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2594 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2595 sh->ops.target = disk_idx; 2596 sh->ops.target2 = other; 2597 s->uptodate += 2; 2598 s->req_compute = 1; 2599 return 1; 2600 } else if (test_bit(R5_Insync, &dev->flags)) { 2601 set_bit(R5_LOCKED, &dev->flags); 2602 set_bit(R5_Wantread, &dev->flags); 2603 s->locked++; 2604 pr_debug("Reading block %d (sync=%d)\n", 2605 disk_idx, s->syncing); 2606 } 2607 } 2608 2609 return 0; 2610} 2611 2612/** 2613 * handle_stripe_fill - read or compute data to satisfy pending requests. 
2614 */ 2615static void handle_stripe_fill(struct stripe_head *sh, 2616 struct stripe_head_state *s, 2617 int disks) 2618{ 2619 int i; 2620 2621 /* look for blocks to read/compute, skip this if a compute 2622 * is already in flight, or if the stripe contents are in the 2623 * midst of changing due to a write 2624 */ 2625 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2626 !sh->reconstruct_state) 2627 for (i = disks; i--; ) 2628 if (fetch_block(sh, s, i, disks)) 2629 break; 2630 set_bit(STRIPE_HANDLE, &sh->state); 2631} 2632 2633 2634/* handle_stripe_clean_event 2635 * any written block on an uptodate or failed drive can be returned. 2636 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2637 * never LOCKED, so we don't need to test 'failed' directly. 2638 */ 2639static void handle_stripe_clean_event(struct r5conf *conf, 2640 struct stripe_head *sh, int disks, struct bio **return_bi) 2641{ 2642 int i; 2643 struct r5dev *dev; 2644 2645 for (i = disks; i--; ) 2646 if (sh->dev[i].written) { 2647 dev = &sh->dev[i]; 2648 if (!test_bit(R5_LOCKED, &dev->flags) && 2649 test_bit(R5_UPTODATE, &dev->flags)) { 2650 /* We can return any write requests */ 2651 struct bio *wbi, *wbi2; 2652 int bitmap_end = 0; 2653 pr_debug("Return write for disc %d\n", i); 2654 spin_lock_irq(&conf->device_lock); 2655 wbi = dev->written; 2656 dev->written = NULL; 2657 while (wbi && wbi->bi_sector < 2658 dev->sector + STRIPE_SECTORS) { 2659 wbi2 = r5_next_bio(wbi, dev->sector); 2660 if (!raid5_dec_bi_phys_segments(wbi)) { 2661 md_write_end(conf->mddev); 2662 wbi->bi_next = *return_bi; 2663 *return_bi = wbi; 2664 } 2665 wbi = wbi2; 2666 } 2667 if (dev->towrite == NULL) 2668 bitmap_end = 1; 2669 spin_unlock_irq(&conf->device_lock); 2670 if (bitmap_end) 2671 bitmap_endwrite(conf->mddev->bitmap, 2672 sh->sector, 2673 STRIPE_SECTORS, 2674 !test_bit(STRIPE_DEGRADED, &sh->state), 2675 0); 2676 } 2677 } 2678 2679 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2680 if (atomic_dec_and_test(&conf->pending_full_writes)) 2681 md_wakeup_thread(conf->mddev->thread); 2682} 2683 2684static void handle_stripe_dirtying(struct r5conf *conf, 2685 struct stripe_head *sh, 2686 struct stripe_head_state *s, 2687 int disks) 2688{ 2689 int rmw = 0, rcw = 0, i; 2690 if (conf->max_degraded == 2) { 2691 /* RAID6 requires 'rcw' in current implementation 2692 * Calculate the real rcw later - for now fake it 2693 * look like rcw is cheaper 2694 */ 2695 rcw = 1; rmw = 2; 2696 } else for (i = disks; i--; ) { 2697 /* would I have to read this buffer for read_modify_write */ 2698 struct r5dev *dev = &sh->dev[i]; 2699 if ((dev->towrite || i == sh->pd_idx) && 2700 !test_bit(R5_LOCKED, &dev->flags) && 2701 !(test_bit(R5_UPTODATE, &dev->flags) || 2702 test_bit(R5_Wantcompute, &dev->flags))) { 2703 if (test_bit(R5_Insync, &dev->flags)) 2704 rmw++; 2705 else 2706 rmw += 2*disks; /* cannot read it */ 2707 } 2708 /* Would I have to read this buffer for reconstruct_write */ 2709 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2710 !test_bit(R5_LOCKED, &dev->flags) && 2711 !(test_bit(R5_UPTODATE, &dev->flags) || 2712 test_bit(R5_Wantcompute, &dev->flags))) { 2713 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2714 else 2715 rcw += 2*disks; 2716 } 2717 } 2718 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2719 (unsigned long long)sh->sector, rmw, rcw); 2720 set_bit(STRIPE_HANDLE, &sh->state); 2721 if (rmw < rcw && rmw > 0) 2722 /* prefer read-modify-write, but need to get some data */ 2723 for (i = disks; i--; 
) { 2724 struct r5dev *dev = &sh->dev[i]; 2725 if ((dev->towrite || i == sh->pd_idx) && 2726 !test_bit(R5_LOCKED, &dev->flags) && 2727 !(test_bit(R5_UPTODATE, &dev->flags) || 2728 test_bit(R5_Wantcompute, &dev->flags)) && 2729 test_bit(R5_Insync, &dev->flags)) { 2730 if ( 2731 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2732 pr_debug("Read_old block " 2733 "%d for r-m-w\n", i); 2734 set_bit(R5_LOCKED, &dev->flags); 2735 set_bit(R5_Wantread, &dev->flags); 2736 s->locked++; 2737 } else { 2738 set_bit(STRIPE_DELAYED, &sh->state); 2739 set_bit(STRIPE_HANDLE, &sh->state); 2740 } 2741 } 2742 } 2743 if (rcw <= rmw && rcw > 0) { 2744 /* want reconstruct write, but need to get some data */ 2745 rcw = 0; 2746 for (i = disks; i--; ) { 2747 struct r5dev *dev = &sh->dev[i]; 2748 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2749 i != sh->pd_idx && i != sh->qd_idx && 2750 !test_bit(R5_LOCKED, &dev->flags) && 2751 !(test_bit(R5_UPTODATE, &dev->flags) || 2752 test_bit(R5_Wantcompute, &dev->flags))) { 2753 rcw++; 2754 if (!test_bit(R5_Insync, &dev->flags)) 2755 continue; /* it's a failed drive */ 2756 if ( 2757 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2758 pr_debug("Read_old block " 2759 "%d for Reconstruct\n", i); 2760 set_bit(R5_LOCKED, &dev->flags); 2761 set_bit(R5_Wantread, &dev->flags); 2762 s->locked++; 2763 } else { 2764 set_bit(STRIPE_DELAYED, &sh->state); 2765 set_bit(STRIPE_HANDLE, &sh->state); 2766 } 2767 } 2768 } 2769 } 2770 /* now if nothing is locked, and if we have enough data, 2771 * we can start a write request 2772 */ 2773 /* since handle_stripe can be called at any time we need to handle the 2774 * case where a compute block operation has been submitted and then a 2775 * subsequent call wants to start a write request. raid_run_ops only 2776 * handles the case where compute block and reconstruct are requested 2777 * simultaneously. If this is not the case then new writes need to be 2778 * held off until the compute completes. 
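 * The write is therefore only scheduled once nothing in the stripe is
 * locked, one of rmw/rcw no longer needs any reads, and the stripe is
 * not waiting on a bitmap update (STRIPE_BIT_DELAY).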
2779 */ 2780 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2781 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2782 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2783 schedule_reconstruction(sh, s, rcw == 0, 0); 2784} 2785 2786static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2787 struct stripe_head_state *s, int disks) 2788{ 2789 struct r5dev *dev = NULL; 2790 2791 set_bit(STRIPE_HANDLE, &sh->state); 2792 2793 switch (sh->check_state) { 2794 case check_state_idle: 2795 /* start a new check operation if there are no failures */ 2796 if (s->failed == 0) { 2797 BUG_ON(s->uptodate != disks); 2798 sh->check_state = check_state_run; 2799 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2800 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2801 s->uptodate--; 2802 break; 2803 } 2804 dev = &sh->dev[s->failed_num[0]]; 2805 /* fall through */ 2806 case check_state_compute_result: 2807 sh->check_state = check_state_idle; 2808 if (!dev) 2809 dev = &sh->dev[sh->pd_idx]; 2810 2811 /* check that a write has not made the stripe insync */ 2812 if (test_bit(STRIPE_INSYNC, &sh->state)) 2813 break; 2814 2815 /* either failed parity check, or recovery is happening */ 2816 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2817 BUG_ON(s->uptodate != disks); 2818 2819 set_bit(R5_LOCKED, &dev->flags); 2820 s->locked++; 2821 set_bit(R5_Wantwrite, &dev->flags); 2822 2823 clear_bit(STRIPE_DEGRADED, &sh->state); 2824 set_bit(STRIPE_INSYNC, &sh->state); 2825 break; 2826 case check_state_run: 2827 break; /* we will be called again upon completion */ 2828 case check_state_check_result: 2829 sh->check_state = check_state_idle; 2830 2831 /* if a failure occurred during the check operation, leave 2832 * STRIPE_INSYNC not set and let the stripe be handled again 2833 */ 2834 if (s->failed) 2835 break; 2836 2837 /* handle a successful check operation, if parity is correct 2838 * we are done. Otherwise update the mismatch count and repair 2839 * parity if !MD_RECOVERY_CHECK 2840 */ 2841 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2842 /* parity is correct (on disc, 2843 * not in buffer any more) 2844 */ 2845 set_bit(STRIPE_INSYNC, &sh->state); 2846 else { 2847 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2848 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2849 /* don't try to repair!! */ 2850 set_bit(STRIPE_INSYNC, &sh->state); 2851 else { 2852 sh->check_state = check_state_compute_run; 2853 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2854 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2855 set_bit(R5_Wantcompute, 2856 &sh->dev[sh->pd_idx].flags); 2857 sh->ops.target = sh->pd_idx; 2858 sh->ops.target2 = -1; 2859 s->uptodate++; 2860 } 2861 } 2862 break; 2863 case check_state_compute_run: 2864 break; 2865 default: 2866 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2867 __func__, sh->check_state, 2868 (unsigned long long) sh->sector); 2869 BUG(); 2870 } 2871} 2872 2873 2874static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2875 struct stripe_head_state *s, 2876 int disks) 2877{ 2878 int pd_idx = sh->pd_idx; 2879 int qd_idx = sh->qd_idx; 2880 struct r5dev *dev; 2881 2882 set_bit(STRIPE_HANDLE, &sh->state); 2883 2884 BUG_ON(s->failed > 2); 2885 2886 /* Want to check and possibly repair P and Q. 
2887 * However there could be one 'failed' device, in which 2888 * case we can only check one of them, possibly using the 2889 * other to generate missing data 2890 */ 2891 2892 switch (sh->check_state) { 2893 case check_state_idle: 2894 /* start a new check operation if there are < 2 failures */ 2895 if (s->failed == s->q_failed) { 2896 /* The only possible failed device holds Q, so it 2897 * makes sense to check P (If anything else were failed, 2898 * we would have used P to recreate it). 2899 */ 2900 sh->check_state = check_state_run; 2901 } 2902 if (!s->q_failed && s->failed < 2) { 2903 /* Q is not failed, and we didn't use it to generate 2904 * anything, so it makes sense to check it 2905 */ 2906 if (sh->check_state == check_state_run) 2907 sh->check_state = check_state_run_pq; 2908 else 2909 sh->check_state = check_state_run_q; 2910 } 2911 2912 /* discard potentially stale zero_sum_result */ 2913 sh->ops.zero_sum_result = 0; 2914 2915 if (sh->check_state == check_state_run) { 2916 /* async_xor_zero_sum destroys the contents of P */ 2917 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2918 s->uptodate--; 2919 } 2920 if (sh->check_state >= check_state_run && 2921 sh->check_state <= check_state_run_pq) { 2922 /* async_syndrome_zero_sum preserves P and Q, so 2923 * no need to mark them !uptodate here 2924 */ 2925 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2926 break; 2927 } 2928 2929 /* we have 2-disk failure */ 2930 BUG_ON(s->failed != 2); 2931 /* fall through */ 2932 case check_state_compute_result: 2933 sh->check_state = check_state_idle; 2934 2935 /* check that a write has not made the stripe insync */ 2936 if (test_bit(STRIPE_INSYNC, &sh->state)) 2937 break; 2938 2939 /* now write out any block on a failed drive, 2940 * or P or Q if they were recomputed 2941 */ 2942 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2943 if (s->failed == 2) { 2944 dev = &sh->dev[s->failed_num[1]]; 2945 s->locked++; 2946 set_bit(R5_LOCKED, &dev->flags); 2947 set_bit(R5_Wantwrite, &dev->flags); 2948 } 2949 if (s->failed >= 1) { 2950 dev = &sh->dev[s->failed_num[0]]; 2951 s->locked++; 2952 set_bit(R5_LOCKED, &dev->flags); 2953 set_bit(R5_Wantwrite, &dev->flags); 2954 } 2955 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2956 dev = &sh->dev[pd_idx]; 2957 s->locked++; 2958 set_bit(R5_LOCKED, &dev->flags); 2959 set_bit(R5_Wantwrite, &dev->flags); 2960 } 2961 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2962 dev = &sh->dev[qd_idx]; 2963 s->locked++; 2964 set_bit(R5_LOCKED, &dev->flags); 2965 set_bit(R5_Wantwrite, &dev->flags); 2966 } 2967 clear_bit(STRIPE_DEGRADED, &sh->state); 2968 2969 set_bit(STRIPE_INSYNC, &sh->state); 2970 break; 2971 case check_state_run: 2972 case check_state_run_q: 2973 case check_state_run_pq: 2974 break; /* we will be called again upon completion */ 2975 case check_state_check_result: 2976 sh->check_state = check_state_idle; 2977 2978 /* handle a successful check operation, if parity is correct 2979 * we are done. 
Otherwise update the mismatch count and repair 2980 * parity if !MD_RECOVERY_CHECK 2981 */ 2982 if (sh->ops.zero_sum_result == 0) { 2983 /* both parities are correct */ 2984 if (!s->failed) 2985 set_bit(STRIPE_INSYNC, &sh->state); 2986 else { 2987 /* in contrast to the raid5 case we can validate 2988 * parity, but still have a failure to write 2989 * back 2990 */ 2991 sh->check_state = check_state_compute_result; 2992 /* Returning at this point means that we may go 2993 * off and bring p and/or q uptodate again so 2994 * we make sure to check zero_sum_result again 2995 * to verify if p or q need writeback 2996 */ 2997 } 2998 } else { 2999 conf->mddev->resync_mismatches += STRIPE_SECTORS; 3000 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3001 /* don't try to repair!! */ 3002 set_bit(STRIPE_INSYNC, &sh->state); 3003 else { 3004 int *target = &sh->ops.target; 3005 3006 sh->ops.target = -1; 3007 sh->ops.target2 = -1; 3008 sh->check_state = check_state_compute_run; 3009 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3010 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3011 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3012 set_bit(R5_Wantcompute, 3013 &sh->dev[pd_idx].flags); 3014 *target = pd_idx; 3015 target = &sh->ops.target2; 3016 s->uptodate++; 3017 } 3018 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3019 set_bit(R5_Wantcompute, 3020 &sh->dev[qd_idx].flags); 3021 *target = qd_idx; 3022 s->uptodate++; 3023 } 3024 } 3025 } 3026 break; 3027 case check_state_compute_run: 3028 break; 3029 default: 3030 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3031 __func__, sh->check_state, 3032 (unsigned long long) sh->sector); 3033 BUG(); 3034 } 3035} 3036 3037static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3038{ 3039 int i; 3040 3041 /* We have read all the blocks in this stripe and now we need to 3042 * copy some of them into a target stripe for expand. 3043 */ 3044 struct dma_async_tx_descriptor *tx = NULL; 3045 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3046 for (i = 0; i < sh->disks; i++) 3047 if (i != sh->pd_idx && i != sh->qd_idx) { 3048 int dd_idx, j; 3049 struct stripe_head *sh2; 3050 struct async_submit_ctl submit; 3051 3052 sector_t bn = compute_blocknr(sh, i, 1); 3053 sector_t s = raid5_compute_sector(conf, bn, 0, 3054 &dd_idx, NULL); 3055 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3056 if (sh2 == NULL) 3057 /* so far only the early blocks of this stripe 3058 * have been requested. 
When later blocks 3059 * get requested, we will try again 3060 */ 3061 continue; 3062 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3063 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3064 /* must have already done this block */ 3065 release_stripe(sh2); 3066 continue; 3067 } 3068 3069 /* place all the copies on one channel */ 3070 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3071 tx = async_memcpy(sh2->dev[dd_idx].page, 3072 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3073 &submit); 3074 3075 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3076 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3077 for (j = 0; j < conf->raid_disks; j++) 3078 if (j != sh2->pd_idx && 3079 j != sh2->qd_idx && 3080 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3081 break; 3082 if (j == conf->raid_disks) { 3083 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3084 set_bit(STRIPE_HANDLE, &sh2->state); 3085 } 3086 release_stripe(sh2); 3087 3088 } 3089 /* done submitting copies, wait for them to complete */ 3090 if (tx) { 3091 async_tx_ack(tx); 3092 dma_wait_for_async_tx(tx); 3093 } 3094} 3095 3096/* 3097 * handle_stripe - do things to a stripe. 3098 * 3099 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3100 * state of various bits to see what needs to be done. 3101 * Possible results: 3102 * return some read requests which now have data 3103 * return some write requests which are safely on storage 3104 * schedule a read on some buffers 3105 * schedule a write of some buffers 3106 * return confirmation of parity correctness 3107 * 3108 */ 3109 3110static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3111{ 3112 struct r5conf *conf = sh->raid_conf; 3113 int disks = sh->disks; 3114 struct r5dev *dev; 3115 int i; 3116 int do_recovery = 0; 3117 3118 memset(s, 0, sizeof(*s)); 3119 3120 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3121 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3122 s->failed_num[0] = -1; 3123 s->failed_num[1] = -1; 3124 3125 /* Now to look around and see what can be done */ 3126 rcu_read_lock(); 3127 spin_lock_irq(&conf->device_lock); 3128 for (i=disks; i--; ) { 3129 struct md_rdev *rdev; 3130 sector_t first_bad; 3131 int bad_sectors; 3132 int is_bad = 0; 3133 3134 dev = &sh->dev[i]; 3135 3136 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3137 i, dev->flags, 3138 dev->toread, dev->towrite, dev->written); 3139 /* maybe we can reply to a read 3140 * 3141 * new wantfill requests are only permitted while 3142 * ops_complete_biofill is guaranteed to be inactive 3143 */ 3144 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3145 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3146 set_bit(R5_Wantfill, &dev->flags); 3147 3148 /* now count some things */ 3149 if (test_bit(R5_LOCKED, &dev->flags)) 3150 s->locked++; 3151 if (test_bit(R5_UPTODATE, &dev->flags)) 3152 s->uptodate++; 3153 if (test_bit(R5_Wantcompute, &dev->flags)) { 3154 s->compute++; 3155 BUG_ON(s->compute > 2); 3156 } 3157 3158 if (test_bit(R5_Wantfill, &dev->flags)) 3159 s->to_fill++; 3160 else if (dev->toread) 3161 s->to_read++; 3162 if (dev->towrite) { 3163 s->to_write++; 3164 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3165 s->non_overwrite++; 3166 } 3167 if (dev->written) 3168 s->written++; 3169 /* Prefer to use the replacement for reads, but only 3170 * if it is recovered enough and has no bad blocks. 
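 * If the replacement cannot be used, fall back to the main rdev and
 * remember via R5_NeedReplace that this block still has to be written
 * out to the replacement.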
3171 */ 3172 rdev = rcu_dereference(conf->disks[i].replacement); 3173 if (rdev && !test_bit(Faulty, &rdev->flags) && 3174 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3175 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3176 &first_bad, &bad_sectors)) 3177 set_bit(R5_ReadRepl, &dev->flags); 3178 else { 3179 if (rdev) 3180 set_bit(R5_NeedReplace, &dev->flags); 3181 rdev = rcu_dereference(conf->disks[i].rdev); 3182 clear_bit(R5_ReadRepl, &dev->flags); 3183 } 3184 if (rdev && test_bit(Faulty, &rdev->flags)) 3185 rdev = NULL; 3186 if (rdev) { 3187 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3188 &first_bad, &bad_sectors); 3189 if (s->blocked_rdev == NULL 3190 && (test_bit(Blocked, &rdev->flags) 3191 || is_bad < 0)) { 3192 if (is_bad < 0) 3193 set_bit(BlockedBadBlocks, 3194 &rdev->flags); 3195 s->blocked_rdev = rdev; 3196 atomic_inc(&rdev->nr_pending); 3197 } 3198 } 3199 clear_bit(R5_Insync, &dev->flags); 3200 if (!rdev) 3201 /* Not in-sync */; 3202 else if (is_bad) { 3203 /* also not in-sync */ 3204 if (!test_bit(WriteErrorSeen, &rdev->flags)) { 3205 /* treat as in-sync, but with a read error 3206 * which we can now try to correct 3207 */ 3208 set_bit(R5_Insync, &dev->flags); 3209 set_bit(R5_ReadError, &dev->flags); 3210 } 3211 } else if (test_bit(In_sync, &rdev->flags)) 3212 set_bit(R5_Insync, &dev->flags); 3213 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3214 /* in sync if before recovery_offset */ 3215 set_bit(R5_Insync, &dev->flags); 3216 else if (test_bit(R5_UPTODATE, &dev->flags) && 3217 test_bit(R5_Expanded, &dev->flags)) 3218 /* If we've reshaped into here, we assume it is Insync. 3219 * We will shortly update recovery_offset to make 3220 * it official. 3221 */ 3222 set_bit(R5_Insync, &dev->flags); 3223 3224 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3225 /* This flag does not apply to '.replacement' 3226 * only to .rdev, so make sure to check that*/ 3227 struct md_rdev *rdev2 = rcu_dereference( 3228 conf->disks[i].rdev); 3229 if (rdev2 == rdev) 3230 clear_bit(R5_Insync, &dev->flags); 3231 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3232 s->handle_bad_blocks = 1; 3233 atomic_inc(&rdev2->nr_pending); 3234 } else 3235 clear_bit(R5_WriteError, &dev->flags); 3236 } 3237 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3238 /* This flag does not apply to '.replacement' 3239 * only to .rdev, so make sure to check that*/ 3240 struct md_rdev *rdev2 = rcu_dereference( 3241 conf->disks[i].rdev); 3242 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3243 s->handle_bad_blocks = 1; 3244 atomic_inc(&rdev2->nr_pending); 3245 } else 3246 clear_bit(R5_MadeGood, &dev->flags); 3247 } 3248 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3249 struct md_rdev *rdev2 = rcu_dereference( 3250 conf->disks[i].replacement); 3251 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3252 s->handle_bad_blocks = 1; 3253 atomic_inc(&rdev2->nr_pending); 3254 } else 3255 clear_bit(R5_MadeGoodRepl, &dev->flags); 3256 } 3257 if (!test_bit(R5_Insync, &dev->flags)) { 3258 /* The ReadError flag will just be confusing now */ 3259 clear_bit(R5_ReadError, &dev->flags); 3260 clear_bit(R5_ReWrite, &dev->flags); 3261 } 3262 if (test_bit(R5_ReadError, &dev->flags)) 3263 clear_bit(R5_Insync, &dev->flags); 3264 if (!test_bit(R5_Insync, &dev->flags)) { 3265 if (s->failed < 2) 3266 s->failed_num[s->failed] = i; 3267 s->failed++; 3268 if (rdev && !test_bit(Faulty, &rdev->flags)) 3269 do_recovery = 1; 3270 } 3271 } 3272 spin_unlock_irq(&conf->device_lock); 3273 if (test_bit(STRIPE_SYNCING, 
&sh->state)) { 3274 /* If there is a failed device being replaced, 3275 * we must be recovering. 3276 * else if we are after recovery_cp, we must be syncing 3277 * else we can only be replacing 3278 * sync and recovery both need to read all devices, and so 3279 * use the same flag. 3280 */ 3281 if (do_recovery || 3282 sh->sector >= conf->mddev->recovery_cp) 3283 s->syncing = 1; 3284 else 3285 s->replacing = 1; 3286 } 3287 rcu_read_unlock(); 3288} 3289 3290static void handle_stripe(struct stripe_head *sh) 3291{ 3292 struct stripe_head_state s; 3293 struct r5conf *conf = sh->raid_conf; 3294 int i; 3295 int prexor; 3296 int disks = sh->disks; 3297 struct r5dev *pdev, *qdev; 3298 3299 clear_bit(STRIPE_HANDLE, &sh->state); 3300 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3301 /* already being handled, ensure it gets handled 3302 * again when current action finishes */ 3303 set_bit(STRIPE_HANDLE, &sh->state); 3304 return; 3305 } 3306 3307 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3308 set_bit(STRIPE_SYNCING, &sh->state); 3309 clear_bit(STRIPE_INSYNC, &sh->state); 3310 } 3311 clear_bit(STRIPE_DELAYED, &sh->state); 3312 3313 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3314 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3315 (unsigned long long)sh->sector, sh->state, 3316 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3317 sh->check_state, sh->reconstruct_state); 3318 3319 analyse_stripe(sh, &s); 3320 3321 if (s.handle_bad_blocks) { 3322 set_bit(STRIPE_HANDLE, &sh->state); 3323 goto finish; 3324 } 3325 3326 if (unlikely(s.blocked_rdev)) { 3327 if (s.syncing || s.expanding || s.expanded || 3328 s.replacing || s.to_write || s.written) { 3329 set_bit(STRIPE_HANDLE, &sh->state); 3330 goto finish; 3331 } 3332 /* There is nothing for the blocked_rdev to block */ 3333 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3334 s.blocked_rdev = NULL; 3335 } 3336 3337 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3338 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3339 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3340 } 3341 3342 pr_debug("locked=%d uptodate=%d to_read=%d" 3343 " to_write=%d failed=%d failed_num=%d,%d\n", 3344 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3345 s.failed_num[0], s.failed_num[1]); 3346 /* check if the array has lost more than max_degraded devices and, 3347 * if so, some requests might need to be failed. 
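 * Any check or reconstruct in progress is abandoned, and a pending
 * sync/recovery for this stripe range is failed as well.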
3348 */ 3349 if (s.failed > conf->max_degraded) { 3350 sh->check_state = 0; 3351 sh->reconstruct_state = 0; 3352 if (s.to_read+s.to_write+s.written) 3353 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3354 if (s.syncing + s.replacing) 3355 handle_failed_sync(conf, sh, &s); 3356 } 3357 3358 /* 3359 * might be able to return some write requests if the parity blocks 3360 * are safe, or on a failed drive 3361 */ 3362 pdev = &sh->dev[sh->pd_idx]; 3363 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3364 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3365 qdev = &sh->dev[sh->qd_idx]; 3366 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3367 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3368 || conf->level < 6; 3369 3370 if (s.written && 3371 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3372 && !test_bit(R5_LOCKED, &pdev->flags) 3373 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3374 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3375 && !test_bit(R5_LOCKED, &qdev->flags) 3376 && test_bit(R5_UPTODATE, &qdev->flags))))) 3377 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3378 3379 /* Now we might consider reading some blocks, either to check/generate 3380 * parity, or to satisfy requests 3381 * or to load a block that is being partially written. 3382 */ 3383 if (s.to_read || s.non_overwrite 3384 || (conf->level == 6 && s.to_write && s.failed) 3385 || (s.syncing && (s.uptodate + s.compute < disks)) 3386 || s.replacing 3387 || s.expanding) 3388 handle_stripe_fill(sh, &s, disks); 3389 3390 /* Now we check to see if any write operations have recently 3391 * completed 3392 */ 3393 prexor = 0; 3394 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3395 prexor = 1; 3396 if (sh->reconstruct_state == reconstruct_state_drain_result || 3397 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3398 sh->reconstruct_state = reconstruct_state_idle; 3399 3400 /* All the 'written' buffers and the parity block are ready to 3401 * be written back to disk 3402 */ 3403 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3404 BUG_ON(sh->qd_idx >= 0 && 3405 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3406 for (i = disks; i--; ) { 3407 struct r5dev *dev = &sh->dev[i]; 3408 if (test_bit(R5_LOCKED, &dev->flags) && 3409 (i == sh->pd_idx || i == sh->qd_idx || 3410 dev->written)) { 3411 pr_debug("Writing block %d\n", i); 3412 set_bit(R5_Wantwrite, &dev->flags); 3413 if (prexor) 3414 continue; 3415 if (!test_bit(R5_Insync, &dev->flags) || 3416 ((i == sh->pd_idx || i == sh->qd_idx) && 3417 s.failed == 0)) 3418 set_bit(STRIPE_INSYNC, &sh->state); 3419 } 3420 } 3421 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3422 s.dec_preread_active = 1; 3423 } 3424 3425 /* Now to consider new write requests and what else, if anything 3426 * should be read. We do not handle new writes when: 3427 * 1/ A 'write' operation (copy+xor) is already in flight. 3428 * 2/ A 'check' operation is in flight, as it may clobber the parity 3429 * block. 3430 */ 3431 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3432 handle_stripe_dirtying(conf, sh, &s, disks); 3433 3434 /* maybe we need to check and possibly fix the parity for this stripe 3435 * Any reads will already have been scheduled, so we just see if enough 3436 * data is available. The parity check is held off while parity 3437 * dependent operations are in flight. 
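 * While a check is in progress check_state stays non-idle, so this
 * path is re-entered on every pass until the state machine completes.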
3438 */ 3439 if (sh->check_state || 3440 (s.syncing && s.locked == 0 && 3441 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3442 !test_bit(STRIPE_INSYNC, &sh->state))) { 3443 if (conf->level == 6) 3444 handle_parity_checks6(conf, sh, &s, disks); 3445 else 3446 handle_parity_checks5(conf, sh, &s, disks); 3447 } 3448 3449 if (s.replacing && s.locked == 0 3450 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3451 /* Write out to replacement devices where possible */ 3452 for (i = 0; i < conf->raid_disks; i++) 3453 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3454 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3455 set_bit(R5_WantReplace, &sh->dev[i].flags); 3456 set_bit(R5_LOCKED, &sh->dev[i].flags); 3457 s.locked++; 3458 } 3459 set_bit(STRIPE_INSYNC, &sh->state); 3460 } 3461 if ((s.syncing || s.replacing) && s.locked == 0 && 3462 test_bit(STRIPE_INSYNC, &sh->state)) { 3463 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3464 clear_bit(STRIPE_SYNCING, &sh->state); 3465 } 3466 3467 /* If the failed drives are just a ReadError, then we might need 3468 * to progress the repair/check process 3469 */ 3470 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3471 for (i = 0; i < s.failed; i++) { 3472 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3473 if (test_bit(R5_ReadError, &dev->flags) 3474 && !test_bit(R5_LOCKED, &dev->flags) 3475 && test_bit(R5_UPTODATE, &dev->flags) 3476 ) { 3477 if (!test_bit(R5_ReWrite, &dev->flags)) { 3478 set_bit(R5_Wantwrite, &dev->flags); 3479 set_bit(R5_ReWrite, &dev->flags); 3480 set_bit(R5_LOCKED, &dev->flags); 3481 s.locked++; 3482 } else { 3483 /* let's read it back */ 3484 set_bit(R5_Wantread, &dev->flags); 3485 set_bit(R5_LOCKED, &dev->flags); 3486 s.locked++; 3487 } 3488 } 3489 } 3490 3491 3492 /* Finish reconstruct operations initiated by the expansion process */ 3493 if (sh->reconstruct_state == reconstruct_state_result) { 3494 struct stripe_head *sh_src 3495 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3496 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3497 /* sh cannot be written until sh_src has been read. 
3498 * so arrange for sh to be delayed a little 3499 */ 3500 set_bit(STRIPE_DELAYED, &sh->state); 3501 set_bit(STRIPE_HANDLE, &sh->state); 3502 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3503 &sh_src->state)) 3504 atomic_inc(&conf->preread_active_stripes); 3505 release_stripe(sh_src); 3506 goto finish; 3507 } 3508 if (sh_src) 3509 release_stripe(sh_src); 3510 3511 sh->reconstruct_state = reconstruct_state_idle; 3512 clear_bit(STRIPE_EXPANDING, &sh->state); 3513 for (i = conf->raid_disks; i--; ) { 3514 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3515 set_bit(R5_LOCKED, &sh->dev[i].flags); 3516 s.locked++; 3517 } 3518 } 3519 3520 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3521 !sh->reconstruct_state) { 3522 /* Need to write out all blocks after computing parity */ 3523 sh->disks = conf->raid_disks; 3524 stripe_set_idx(sh->sector, conf, 0, sh); 3525 schedule_reconstruction(sh, &s, 1, 1); 3526 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3527 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3528 atomic_dec(&conf->reshape_stripes); 3529 wake_up(&conf->wait_for_overlap); 3530 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3531 } 3532 3533 if (s.expanding && s.locked == 0 && 3534 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3535 handle_stripe_expansion(conf, sh); 3536 3537finish: 3538 /* wait for this device to become unblocked */ 3539 if (conf->mddev->external && unlikely(s.blocked_rdev)) 3540 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); 3541 3542 if (s.handle_bad_blocks) 3543 for (i = disks; i--; ) { 3544 struct md_rdev *rdev; 3545 struct r5dev *dev = &sh->dev[i]; 3546 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3547 /* We own a safe reference to the rdev */ 3548 rdev = conf->disks[i].rdev; 3549 if (!rdev_set_badblocks(rdev, sh->sector, 3550 STRIPE_SECTORS, 0)) 3551 md_error(conf->mddev, rdev); 3552 rdev_dec_pending(rdev, conf->mddev); 3553 } 3554 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3555 rdev = conf->disks[i].rdev; 3556 rdev_clear_badblocks(rdev, sh->sector, 3557 STRIPE_SECTORS); 3558 rdev_dec_pending(rdev, conf->mddev); 3559 } 3560 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3561 rdev = conf->disks[i].replacement; 3562 if (!rdev) 3563 /* rdev have been moved down */ 3564 rdev = conf->disks[i].rdev; 3565 rdev_clear_badblocks(rdev, sh->sector, 3566 STRIPE_SECTORS); 3567 rdev_dec_pending(rdev, conf->mddev); 3568 } 3569 } 3570 3571 if (s.ops_request) 3572 raid_run_ops(sh, s.ops_request); 3573 3574 ops_run_io(sh, &s); 3575 3576 if (s.dec_preread_active) { 3577 /* We delay this until after ops_run_io so that if make_request 3578 * is waiting on a flush, it won't continue until the writes 3579 * have actually been submitted. 
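 * Dropping below IO_THRESHOLD also lets raid5_activate_delayed()
 * move delayed stripes onto the hold list again.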
3580 */ 3581 atomic_dec(&conf->preread_active_stripes); 3582 if (atomic_read(&conf->preread_active_stripes) < 3583 IO_THRESHOLD) 3584 md_wakeup_thread(conf->mddev->thread); 3585 } 3586 3587 return_io(s.return_bi); 3588 3589 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3590} 3591 3592static void raid5_activate_delayed(struct r5conf *conf) 3593{ 3594 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3595 while (!list_empty(&conf->delayed_list)) { 3596 struct list_head *l = conf->delayed_list.next; 3597 struct stripe_head *sh; 3598 sh = list_entry(l, struct stripe_head, lru); 3599 list_del_init(l); 3600 clear_bit(STRIPE_DELAYED, &sh->state); 3601 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3602 atomic_inc(&conf->preread_active_stripes); 3603 list_add_tail(&sh->lru, &conf->hold_list); 3604 } 3605 } 3606} 3607 3608static void activate_bit_delay(struct r5conf *conf) 3609{ 3610 /* device_lock is held */ 3611 struct list_head head; 3612 list_add(&head, &conf->bitmap_list); 3613 list_del_init(&conf->bitmap_list); 3614 while (!list_empty(&head)) { 3615 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3616 list_del_init(&sh->lru); 3617 atomic_inc(&sh->count); 3618 __release_stripe(conf, sh); 3619 } 3620} 3621 3622int md_raid5_congested(struct mddev *mddev, int bits) 3623{ 3624 struct r5conf *conf = mddev->private; 3625 3626 /* No difference between reads and writes. Just check 3627 * how busy the stripe_cache is 3628 */ 3629 3630 if (conf->inactive_blocked) 3631 return 1; 3632 if (conf->quiesce) 3633 return 1; 3634 if (list_empty_careful(&conf->inactive_list)) 3635 return 1; 3636 3637 return 0; 3638} 3639EXPORT_SYMBOL_GPL(md_raid5_congested); 3640 3641static int raid5_congested(void *data, int bits) 3642{ 3643 struct mddev *mddev = data; 3644 3645 return mddev_congested(mddev, bits) || 3646 md_raid5_congested(mddev, bits); 3647} 3648 3649/* We want read requests to align with chunks where possible, 3650 * but write requests don't need to. 3651 */ 3652static int raid5_mergeable_bvec(struct request_queue *q, 3653 struct bvec_merge_data *bvm, 3654 struct bio_vec *biovec) 3655{ 3656 struct mddev *mddev = q->queuedata; 3657 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3658 int max; 3659 unsigned int chunk_sectors = mddev->chunk_sectors; 3660 unsigned int bio_sectors = bvm->bi_size >> 9; 3661 3662 if ((bvm->bi_rw & 1) == WRITE) 3663 return biovec->bv_len; /* always allow writes to be mergeable */ 3664 3665 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3666 chunk_sectors = mddev->new_chunk_sectors; 3667 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3668 if (max < 0) max = 0; 3669 if (max <= biovec->bv_len && bio_sectors == 0) 3670 return biovec->bv_len; 3671 else 3672 return max; 3673} 3674 3675 3676static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3677{ 3678 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3679 unsigned int chunk_sectors = mddev->chunk_sectors; 3680 unsigned int bio_sectors = bio->bi_size >> 9; 3681 3682 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3683 chunk_sectors = mddev->new_chunk_sectors; 3684 return chunk_sectors >= 3685 ((sector & (chunk_sectors - 1)) + bio_sectors); 3686} 3687 3688/* 3689 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3690 * later sampled by raid5d. 
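 * raid5d pulls them back off with remove_bio_from_retry() and retries
 * them through the stripe cache.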
3691 */ 3692static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3693{ 3694 unsigned long flags; 3695 3696 spin_lock_irqsave(&conf->device_lock, flags); 3697 3698 bi->bi_next = conf->retry_read_aligned_list; 3699 conf->retry_read_aligned_list = bi; 3700 3701 spin_unlock_irqrestore(&conf->device_lock, flags); 3702 md_wakeup_thread(conf->mddev->thread); 3703} 3704 3705 3706static struct bio *remove_bio_from_retry(struct r5conf *conf) 3707{ 3708 struct bio *bi; 3709 3710 bi = conf->retry_read_aligned; 3711 if (bi) { 3712 conf->retry_read_aligned = NULL; 3713 return bi; 3714 } 3715 bi = conf->retry_read_aligned_list; 3716 if(bi) { 3717 conf->retry_read_aligned_list = bi->bi_next; 3718 bi->bi_next = NULL; 3719 /* 3720 * this sets the active stripe count to 1 and the processed 3721 * stripe count to zero (upper 16 bits) 3722 */ 3723 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3724 } 3725 3726 return bi; 3727} 3728 3729 3730/* 3731 * The "raid5_align_endio" should check if the read succeeded and if it 3732 * did, call bio_endio on the original bio (having bio_put the new bio 3733 * first). 3734 * If the read failed, hand the original bio to add_bio_to_retry() so it can be retried through the stripe cache. 3735 */ 3736static void raid5_align_endio(struct bio *bi, int error) 3737{ 3738 struct bio* raid_bi = bi->bi_private; 3739 struct mddev *mddev; 3740 struct r5conf *conf; 3741 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3742 struct md_rdev *rdev; 3743 3744 bio_put(bi); 3745 3746 rdev = (void*)raid_bi->bi_next; 3747 raid_bi->bi_next = NULL; 3748 mddev = rdev->mddev; 3749 conf = mddev->private; 3750 3751 rdev_dec_pending(rdev, conf->mddev); 3752 3753 if (!error && uptodate) { 3754 bio_endio(raid_bi, 0); 3755 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3756 wake_up(&conf->wait_for_stripe); 3757 return; 3758 } 3759 3760 3761 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3762 3763 add_bio_to_retry(raid_bi, conf); 3764} 3765 3766static int bio_fits_rdev(struct bio *bi) 3767{ 3768 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3769 3770 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3771 return 0; 3772 blk_recount_segments(q, bi); 3773 if (bi->bi_phys_segments > queue_max_segments(q)) 3774 return 0; 3775 3776 if (q->merge_bvec_fn) 3777 /* it's too hard to apply the merge_bvec_fn at this stage, 3778 * just give up 3779 */ 3780 return 0; 3781 3782 return 1; 3783} 3784 3785 3786static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3787{ 3788 struct r5conf *conf = mddev->private; 3789 int dd_idx; 3790 struct bio* align_bi; 3791 struct md_rdev *rdev; 3792 sector_t end_sector; 3793 3794 if (!in_chunk_boundary(mddev, raid_bio)) { 3795 pr_debug("chunk_aligned_read : non aligned\n"); 3796 return 0; 3797 } 3798 /* 3799 * use bio_clone_mddev to make a copy of the bio 3800 */ 3801 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3802 if (!align_bi) 3803 return 0; 3804 /* 3805 * set bi_end_io to a new function, and set bi_private to the 3806 * original bio. 
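 * The clone is what actually gets submitted to the chosen rdev;
 * raid5_align_endio() then completes the original bio, finding the
 * rdev to release through the original bio's bi_next pointer.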
3807 */ 3808 align_bi->bi_end_io = raid5_align_endio; 3809 align_bi->bi_private = raid_bio; 3810 /* 3811 * compute position 3812 */ 3813 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3814 0, 3815 &dd_idx, NULL); 3816 3817 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3818 rcu_read_lock(); 3819 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3820 if (!rdev || test_bit(Faulty, &rdev->flags) || 3821 rdev->recovery_offset < end_sector) { 3822 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3823 if (rdev && 3824 (test_bit(Faulty, &rdev->flags) || 3825 !(test_bit(In_sync, &rdev->flags) || 3826 rdev->recovery_offset >= end_sector))) 3827 rdev = NULL; 3828 } 3829 if (rdev) { 3830 sector_t first_bad; 3831 int bad_sectors; 3832 3833 atomic_inc(&rdev->nr_pending); 3834 rcu_read_unlock(); 3835 raid_bio->bi_next = (void*)rdev; 3836 align_bi->bi_bdev = rdev->bdev; 3837 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3838 align_bi->bi_sector += rdev->data_offset; 3839 3840 if (!bio_fits_rdev(align_bi) || 3841 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3842 &first_bad, &bad_sectors)) { 3843 /* too big in some way, or has a known bad block */ 3844 bio_put(align_bi); 3845 rdev_dec_pending(rdev, mddev); 3846 return 0; 3847 } 3848 3849 spin_lock_irq(&conf->device_lock); 3850 wait_event_lock_irq(conf->wait_for_stripe, 3851 conf->quiesce == 0, 3852 conf->device_lock, /* nothing */); 3853 atomic_inc(&conf->active_aligned_reads); 3854 spin_unlock_irq(&conf->device_lock); 3855 3856 generic_make_request(align_bi); 3857 return 1; 3858 } else { 3859 rcu_read_unlock(); 3860 bio_put(align_bi); 3861 return 0; 3862 } 3863} 3864 3865/* __get_priority_stripe - get the next stripe to process 3866 * 3867 * Full stripe writes are allowed to pass preread active stripes up until 3868 * the bypass_threshold is exceeded. In general the bypass_count 3869 * increments when the handle_list is handled before the hold_list; however, it 3870 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3871 * stripe with in flight i/o. The bypass_count will be reset when the 3872 * head of the hold_list has changed, i.e. the head was promoted to the 3873 * handle_list. 3874 */ 3875static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3876{ 3877 struct stripe_head *sh; 3878 3879 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3880 __func__, 3881 list_empty(&conf->handle_list) ? "empty" : "busy", 3882 list_empty(&conf->hold_list) ? 
"empty" : "busy", 3883 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3884 3885 if (!list_empty(&conf->handle_list)) { 3886 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3887 3888 if (list_empty(&conf->hold_list)) 3889 conf->bypass_count = 0; 3890 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3891 if (conf->hold_list.next == conf->last_hold) 3892 conf->bypass_count++; 3893 else { 3894 conf->last_hold = conf->hold_list.next; 3895 conf->bypass_count -= conf->bypass_threshold; 3896 if (conf->bypass_count < 0) 3897 conf->bypass_count = 0; 3898 } 3899 } 3900 } else if (!list_empty(&conf->hold_list) && 3901 ((conf->bypass_threshold && 3902 conf->bypass_count > conf->bypass_threshold) || 3903 atomic_read(&conf->pending_full_writes) == 0)) { 3904 sh = list_entry(conf->hold_list.next, 3905 typeof(*sh), lru); 3906 conf->bypass_count -= conf->bypass_threshold; 3907 if (conf->bypass_count < 0) 3908 conf->bypass_count = 0; 3909 } else 3910 return NULL; 3911 3912 list_del_init(&sh->lru); 3913 atomic_inc(&sh->count); 3914 BUG_ON(atomic_read(&sh->count) != 1); 3915 return sh; 3916} 3917 3918static void make_request(struct mddev *mddev, struct bio * bi) 3919{ 3920 struct r5conf *conf = mddev->private; 3921 int dd_idx; 3922 sector_t new_sector; 3923 sector_t logical_sector, last_sector; 3924 struct stripe_head *sh; 3925 const int rw = bio_data_dir(bi); 3926 int remaining; 3927 int plugged; 3928 3929 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3930 md_flush_request(mddev, bi); 3931 return; 3932 } 3933 3934 md_write_start(mddev, bi); 3935 3936 if (rw == READ && 3937 mddev->reshape_position == MaxSector && 3938 chunk_aligned_read(mddev,bi)) 3939 return; 3940 3941 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3942 last_sector = bi->bi_sector + (bi->bi_size>>9); 3943 bi->bi_next = NULL; 3944 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3945 3946 plugged = mddev_check_plugged(mddev); 3947 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3948 DEFINE_WAIT(w); 3949 int disks, data_disks; 3950 int previous; 3951 3952 retry: 3953 previous = 0; 3954 disks = conf->raid_disks; 3955 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3956 if (unlikely(conf->reshape_progress != MaxSector)) { 3957 /* spinlock is needed as reshape_progress may be 3958 * 64bit on a 32bit platform, and so it might be 3959 * possible to see a half-updated value 3960 * Of course reshape_progress could change after 3961 * the lock is dropped, so once we get a reference 3962 * to the stripe that we think it is, we will have 3963 * to check again. 3964 */ 3965 spin_lock_irq(&conf->device_lock); 3966 if (mddev->delta_disks < 0 3967 ? logical_sector < conf->reshape_progress 3968 : logical_sector >= conf->reshape_progress) { 3969 disks = conf->previous_raid_disks; 3970 previous = 1; 3971 } else { 3972 if (mddev->delta_disks < 0 3973 ? 
logical_sector < conf->reshape_safe 3974 : logical_sector >= conf->reshape_safe) { 3975 spin_unlock_irq(&conf->device_lock); 3976 schedule(); 3977 goto retry; 3978 } 3979 } 3980 spin_unlock_irq(&conf->device_lock); 3981 } 3982 data_disks = disks - conf->max_degraded; 3983 3984 new_sector = raid5_compute_sector(conf, logical_sector, 3985 previous, 3986 &dd_idx, NULL); 3987 pr_debug("raid456: make_request, sector %llu logical %llu\n", 3988 (unsigned long long)new_sector, 3989 (unsigned long long)logical_sector); 3990 3991 sh = get_active_stripe(conf, new_sector, previous, 3992 (bi->bi_rw&RWA_MASK), 0); 3993 if (sh) { 3994 if (unlikely(previous)) { 3995 /* expansion might have moved on while waiting for a 3996 * stripe, so we must do the range check again. 3997 * Expansion could still move past after this 3998 * test, but as we are holding a reference to 3999 * 'sh', we know that if that happens, 4000 * STRIPE_EXPANDING will get set and the expansion 4001 * won't proceed until we finish with the stripe. 4002 */ 4003 int must_retry = 0; 4004 spin_lock_irq(&conf->device_lock); 4005 if (mddev->delta_disks < 0 4006 ? logical_sector >= conf->reshape_progress 4007 : logical_sector < conf->reshape_progress) 4008 /* mismatch, need to try again */ 4009 must_retry = 1; 4010 spin_unlock_irq(&conf->device_lock); 4011 if (must_retry) { 4012 release_stripe(sh); 4013 schedule(); 4014 goto retry; 4015 } 4016 } 4017 4018 if (rw == WRITE && 4019 logical_sector >= mddev->suspend_lo && 4020 logical_sector < mddev->suspend_hi) { 4021 release_stripe(sh); 4022 /* As the suspend_* range is controlled by 4023 * userspace, we want an interruptible 4024 * wait. 4025 */ 4026 flush_signals(current); 4027 prepare_to_wait(&conf->wait_for_overlap, 4028 &w, TASK_INTERRUPTIBLE); 4029 if (logical_sector >= mddev->suspend_lo && 4030 logical_sector < mddev->suspend_hi) 4031 schedule(); 4032 goto retry; 4033 } 4034 4035 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4036 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4037 /* Stripe is busy expanding or 4038 * add failed due to overlap. Flush everything 4039 * and wait a while 4040 */ 4041 md_wakeup_thread(mddev->thread); 4042 release_stripe(sh); 4043 schedule(); 4044 goto retry; 4045 } 4046 finish_wait(&conf->wait_for_overlap, &w); 4047 set_bit(STRIPE_HANDLE, &sh->state); 4048 clear_bit(STRIPE_DELAYED, &sh->state); 4049 if ((bi->bi_rw & REQ_SYNC) && 4050 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4051 atomic_inc(&conf->preread_active_stripes); 4052 release_stripe(sh); 4053 } else { 4054 /* cannot get stripe for read-ahead, just give-up */ 4055 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4056 finish_wait(&conf->wait_for_overlap, &w); 4057 break; 4058 } 4059 4060 } 4061 if (!plugged) 4062 md_wakeup_thread(mddev->thread); 4063 4064 spin_lock_irq(&conf->device_lock); 4065 remaining = raid5_dec_bi_phys_segments(bi); 4066 spin_unlock_irq(&conf->device_lock); 4067 if (remaining == 0) { 4068 4069 if ( rw == WRITE ) 4070 md_write_end(mddev); 4071 4072 bio_endio(bi, 0); 4073 } 4074} 4075 4076static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4077 4078static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4079{ 4080 /* reshaping is quite different to recovery/resync so it is 4081 * handled quite separately ... here. 4082 * 4083 * On each call to sync_request, we gather one chunk worth of 4084 * destination stripes and flag them as expanding. 4085 * Then we find all the source stripes and request reads. 
4086 * As the reads complete, handle_stripe will copy the data 4087 * into the destination stripe and release that stripe. 4088 */ 4089 struct r5conf *conf = mddev->private; 4090 struct stripe_head *sh; 4091 sector_t first_sector, last_sector; 4092 int raid_disks = conf->previous_raid_disks; 4093 int data_disks = raid_disks - conf->max_degraded; 4094 int new_data_disks = conf->raid_disks - conf->max_degraded; 4095 int i; 4096 int dd_idx; 4097 sector_t writepos, readpos, safepos; 4098 sector_t stripe_addr; 4099 int reshape_sectors; 4100 struct list_head stripes; 4101 4102 if (sector_nr == 0) { 4103 /* If restarting in the middle, skip the initial sectors */ 4104 if (mddev->delta_disks < 0 && 4105 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4106 sector_nr = raid5_size(mddev, 0, 0) 4107 - conf->reshape_progress; 4108 } else if (mddev->delta_disks >= 0 && 4109 conf->reshape_progress > 0) 4110 sector_nr = conf->reshape_progress; 4111 sector_div(sector_nr, new_data_disks); 4112 if (sector_nr) { 4113 mddev->curr_resync_completed = sector_nr; 4114 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4115 *skipped = 1; 4116 return sector_nr; 4117 } 4118 } 4119 4120 /* We need to process a full chunk at a time. 4121 * If old and new chunk sizes differ, we need to process the 4122 * largest of these 4123 */ 4124 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4125 reshape_sectors = mddev->new_chunk_sectors; 4126 else 4127 reshape_sectors = mddev->chunk_sectors; 4128 4129 /* we update the metadata when there is more than 3Meg 4130 * in the block range (that is rather arbitrary, should 4131 * probably be time based) or when the data about to be 4132 * copied would over-write the source of the data at 4133 * the front of the range. 4134 * i.e. one new_stripe along from reshape_progress new_maps 4135 * to after where reshape_safe old_maps to 4136 */ 4137 writepos = conf->reshape_progress; 4138 sector_div(writepos, new_data_disks); 4139 readpos = conf->reshape_progress; 4140 sector_div(readpos, data_disks); 4141 safepos = conf->reshape_safe; 4142 sector_div(safepos, data_disks); 4143 if (mddev->delta_disks < 0) { 4144 writepos -= min_t(sector_t, reshape_sectors, writepos); 4145 readpos += reshape_sectors; 4146 safepos += reshape_sectors; 4147 } else { 4148 writepos += reshape_sectors; 4149 readpos -= min_t(sector_t, reshape_sectors, readpos); 4150 safepos -= min_t(sector_t, reshape_sectors, safepos); 4151 } 4152 4153 /* 'writepos' is the most advanced device address we might write. 4154 * 'readpos' is the least advanced device address we might read. 4155 * 'safepos' is the least address recorded in the metadata as having 4156 * been reshaped. 4157 * If 'readpos' is behind 'writepos', then there is no way that we can 4158 * ensure safety in the face of a crash - that must be done by userspace 4159 * making a backup of the data. So in that case there is no particular 4160 * rush to update metadata. 4161 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4162 * update the metadata to advance 'safepos' to match 'readpos' so that 4163 * we can be safe in the event of a crash. 4164 * So we insist on updating metadata if safepos is behind writepos and 4165 * readpos is beyond writepos. 4166 * In any case, update the metadata every 10 seconds. 4167 * Maybe that number should be configurable, but I'm not sure it is 4168 * worth it.... maybe it could be a multiple of safemode_delay??? 4169 */ 4170 if ((mddev->delta_disks < 0 4171 ? 
(safepos > writepos && readpos < writepos) 4172 : (safepos < writepos && readpos > writepos)) || 4173 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4174 /* Cannot proceed until we've updated the superblock... */ 4175 wait_event(conf->wait_for_overlap, 4176 atomic_read(&conf->reshape_stripes)==0); 4177 mddev->reshape_position = conf->reshape_progress; 4178 mddev->curr_resync_completed = sector_nr; 4179 conf->reshape_checkpoint = jiffies; 4180 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4181 md_wakeup_thread(mddev->thread); 4182 wait_event(mddev->sb_wait, mddev->flags == 0 || 4183 kthread_should_stop()); 4184 spin_lock_irq(&conf->device_lock); 4185 conf->reshape_safe = mddev->reshape_position; 4186 spin_unlock_irq(&conf->device_lock); 4187 wake_up(&conf->wait_for_overlap); 4188 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4189 } 4190 4191 if (mddev->delta_disks < 0) { 4192 BUG_ON(conf->reshape_progress == 0); 4193 stripe_addr = writepos; 4194 BUG_ON((mddev->dev_sectors & 4195 ~((sector_t)reshape_sectors - 1)) 4196 - reshape_sectors - stripe_addr 4197 != sector_nr); 4198 } else { 4199 BUG_ON(writepos != sector_nr + reshape_sectors); 4200 stripe_addr = sector_nr; 4201 } 4202 INIT_LIST_HEAD(&stripes); 4203 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4204 int j; 4205 int skipped_disk = 0; 4206 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4207 set_bit(STRIPE_EXPANDING, &sh->state); 4208 atomic_inc(&conf->reshape_stripes); 4209 /* If any of this stripe is beyond the end of the old 4210 * array, then we need to zero those blocks 4211 */ 4212 for (j=sh->disks; j--;) { 4213 sector_t s; 4214 if (j == sh->pd_idx) 4215 continue; 4216 if (conf->level == 6 && 4217 j == sh->qd_idx) 4218 continue; 4219 s = compute_blocknr(sh, j, 0); 4220 if (s < raid5_size(mddev, 0, 0)) { 4221 skipped_disk = 1; 4222 continue; 4223 } 4224 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4225 set_bit(R5_Expanded, &sh->dev[j].flags); 4226 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4227 } 4228 if (!skipped_disk) { 4229 set_bit(STRIPE_EXPAND_READY, &sh->state); 4230 set_bit(STRIPE_HANDLE, &sh->state); 4231 } 4232 list_add(&sh->lru, &stripes); 4233 } 4234 spin_lock_irq(&conf->device_lock); 4235 if (mddev->delta_disks < 0) 4236 conf->reshape_progress -= reshape_sectors * new_data_disks; 4237 else 4238 conf->reshape_progress += reshape_sectors * new_data_disks; 4239 spin_unlock_irq(&conf->device_lock); 4240 /* Ok, those stripes are ready. We can start scheduling 4241 * reads on the source stripes. 4242 * The source stripes are determined by mapping the first and last 4243 * block on the destination stripes.
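 * As a purely illustrative example: growing to new_data_disks == 5 with reshape_sectors == 128, the destination range at array address stripe_addr covers logical sectors stripe_addr*5 up to (stripe_addr+128)*5 - 1, and raid5_compute_sector() with previous==1 maps those two endpoints back into the old geometry to give the device range that must be read.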
4244 */ 4245 first_sector = 4246 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4247 1, &dd_idx, NULL); 4248 last_sector = 4249 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4250 * new_data_disks - 1), 4251 1, &dd_idx, NULL); 4252 if (last_sector >= mddev->dev_sectors) 4253 last_sector = mddev->dev_sectors - 1; 4254 while (first_sector <= last_sector) { 4255 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4256 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4257 set_bit(STRIPE_HANDLE, &sh->state); 4258 release_stripe(sh); 4259 first_sector += STRIPE_SECTORS; 4260 } 4261 /* Now that the sources are clearly marked, we can release 4262 * the destination stripes 4263 */ 4264 while (!list_empty(&stripes)) { 4265 sh = list_entry(stripes.next, struct stripe_head, lru); 4266 list_del_init(&sh->lru); 4267 release_stripe(sh); 4268 } 4269 /* If this takes us to the resync_max point where we have to pause, 4270 * then we need to write out the superblock. 4271 */ 4272 sector_nr += reshape_sectors; 4273 if ((sector_nr - mddev->curr_resync_completed) * 2 4274 >= mddev->resync_max - mddev->curr_resync_completed) { 4275 /* Cannot proceed until we've updated the superblock... */ 4276 wait_event(conf->wait_for_overlap, 4277 atomic_read(&conf->reshape_stripes) == 0); 4278 mddev->reshape_position = conf->reshape_progress; 4279 mddev->curr_resync_completed = sector_nr; 4280 conf->reshape_checkpoint = jiffies; 4281 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4282 md_wakeup_thread(mddev->thread); 4283 wait_event(mddev->sb_wait, 4284 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4285 || kthread_should_stop()); 4286 spin_lock_irq(&conf->device_lock); 4287 conf->reshape_safe = mddev->reshape_position; 4288 spin_unlock_irq(&conf->device_lock); 4289 wake_up(&conf->wait_for_overlap); 4290 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4291 } 4292 return reshape_sectors; 4293} 4294 4295/* FIXME go_faster isn't used */ 4296static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4297{ 4298 struct r5conf *conf = mddev->private; 4299 struct stripe_head *sh; 4300 sector_t max_sector = mddev->dev_sectors; 4301 sector_t sync_blocks; 4302 int still_degraded = 0; 4303 int i; 4304 4305 if (sector_nr >= max_sector) { 4306 /* just being told to finish up .. nothing much to do */ 4307 4308 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4309 end_reshape(conf); 4310 return 0; 4311 } 4312 4313 if (mddev->curr_resync < max_sector) /* aborted */ 4314 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4315 &sync_blocks, 1); 4316 else /* completed sync */ 4317 conf->fullsync = 0; 4318 bitmap_close_sync(mddev->bitmap); 4319 4320 return 0; 4321 } 4322 4323 /* Allow raid5_quiesce to complete */ 4324 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4325 4326 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4327 return reshape_request(mddev, sector_nr, skipped); 4328 4329 /* No need to check resync_max as we never do more than one 4330 * stripe, and as resync_max will always be on a chunk boundary, 4331 * if the check in md_do_sync didn't fire, there is no chance 4332 * of overstepping resync_max here 4333 */ 4334 4335 /* if there is too many failed drives and we are trying 4336 * to resync, then assert that we are finished, because there is 4337 * nothing we can do. 
4338 */ 4339 if (mddev->degraded >= conf->max_degraded && 4340 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4341 sector_t rv = mddev->dev_sectors - sector_nr; 4342 *skipped = 1; 4343 return rv; 4344 } 4345 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4346 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4347 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4348 /* we can skip this block, and probably more */ 4349 sync_blocks /= STRIPE_SECTORS; 4350 *skipped = 1; 4351 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4352 } 4353 4354 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4355 4356 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4357 if (sh == NULL) { 4358 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4359 /* make sure we don't swamp the stripe cache if someone else 4360 * is trying to get access 4361 */ 4362 schedule_timeout_uninterruptible(1); 4363 } 4364 /* Need to check if array will still be degraded after recovery/resync 4365 * We don't need to check the 'failed' flag as when that gets set, 4366 * recovery aborts. 4367 */ 4368 for (i = 0; i < conf->raid_disks; i++) 4369 if (conf->disks[i].rdev == NULL) 4370 still_degraded = 1; 4371 4372 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4373 4374 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4375 4376 handle_stripe(sh); 4377 release_stripe(sh); 4378 4379 return STRIPE_SECTORS; 4380} 4381 4382static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4383{ 4384 /* We may not be able to submit a whole bio at once as there 4385 * may not be enough stripe_heads available. 4386 * We cannot pre-allocate enough stripe_heads as we may need 4387 * more than exist in the cache (if we allow ever large chunks). 4388 * So we do one stripe head at a time and record in 4389 * ->bi_hw_segments how many have been done. 4390 * 4391 * We *know* that this entire raid_bio is in one chunk, so 4392 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
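 * (The count of completed stripes is kept in the top 16 bits of bi_phys_segments by raid5_set_bi_hw_segments(), so a partly handled bio resumes where it left off.)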
4393 */ 4394 struct stripe_head *sh; 4395 int dd_idx; 4396 sector_t sector, logical_sector, last_sector; 4397 int scnt = 0; 4398 int remaining; 4399 int handled = 0; 4400 4401 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4402 sector = raid5_compute_sector(conf, logical_sector, 4403 0, &dd_idx, NULL); 4404 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4405 4406 for (; logical_sector < last_sector; 4407 logical_sector += STRIPE_SECTORS, 4408 sector += STRIPE_SECTORS, 4409 scnt++) { 4410 4411 if (scnt < raid5_bi_hw_segments(raid_bio)) 4412 /* already done this stripe */ 4413 continue; 4414 4415 sh = get_active_stripe(conf, sector, 0, 1, 0); 4416 4417 if (!sh) { 4418 /* failed to get a stripe - must wait */ 4419 raid5_set_bi_hw_segments(raid_bio, scnt); 4420 conf->retry_read_aligned = raid_bio; 4421 return handled; 4422 } 4423 4424 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4425 release_stripe(sh); 4426 raid5_set_bi_hw_segments(raid_bio, scnt); 4427 conf->retry_read_aligned = raid_bio; 4428 return handled; 4429 } 4430 4431 handle_stripe(sh); 4432 release_stripe(sh); 4433 handled++; 4434 } 4435 spin_lock_irq(&conf->device_lock); 4436 remaining = raid5_dec_bi_phys_segments(raid_bio); 4437 spin_unlock_irq(&conf->device_lock); 4438 if (remaining == 0) 4439 bio_endio(raid_bio, 0); 4440 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4441 wake_up(&conf->wait_for_stripe); 4442 return handled; 4443} 4444 4445 4446/* 4447 * This is our raid5 kernel thread. 4448 * 4449 * We scan the hash table for stripes which can be handled now. 4450 * During the scan, completed stripes are saved for us by the interrupt 4451 * handler, so that they will not have to wait for our next wakeup. 4452 */ 4453static void raid5d(struct mddev *mddev) 4454{ 4455 struct stripe_head *sh; 4456 struct r5conf *conf = mddev->private; 4457 int handled; 4458 struct blk_plug plug; 4459 4460 pr_debug("+++ raid5d active\n"); 4461 4462 md_check_recovery(mddev); 4463 4464 blk_start_plug(&plug); 4465 handled = 0; 4466 spin_lock_irq(&conf->device_lock); 4467 while (1) { 4468 struct bio *bio; 4469 4470 if (atomic_read(&mddev->plug_cnt) == 0 && 4471 !list_empty(&conf->bitmap_list)) { 4472 /* Now is a good time to flush some bitmap updates */ 4473 conf->seq_flush++; 4474 spin_unlock_irq(&conf->device_lock); 4475 bitmap_unplug(mddev->bitmap); 4476 spin_lock_irq(&conf->device_lock); 4477 conf->seq_write = conf->seq_flush; 4478 activate_bit_delay(conf); 4479 } 4480 if (atomic_read(&mddev->plug_cnt) == 0) 4481 raid5_activate_delayed(conf); 4482 4483 while ((bio = remove_bio_from_retry(conf))) { 4484 int ok; 4485 spin_unlock_irq(&conf->device_lock); 4486 ok = retry_aligned_read(conf, bio); 4487 spin_lock_irq(&conf->device_lock); 4488 if (!ok) 4489 break; 4490 handled++; 4491 } 4492 4493 sh = __get_priority_stripe(conf); 4494 4495 if (!sh) 4496 break; 4497 spin_unlock_irq(&conf->device_lock); 4498 4499 handled++; 4500 handle_stripe(sh); 4501 release_stripe(sh); 4502 cond_resched(); 4503 4504 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 4505 md_check_recovery(mddev); 4506 4507 spin_lock_irq(&conf->device_lock); 4508 } 4509 pr_debug("%d stripes handled\n", handled); 4510 4511 spin_unlock_irq(&conf->device_lock); 4512 4513 async_tx_issue_pending_all(); 4514 blk_finish_plug(&plug); 4515 4516 pr_debug("--- raid5d inactive\n"); 4517} 4518 4519static ssize_t 4520raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4521{ 4522 struct r5conf *conf = mddev->private; 4523 if (conf) 4524 return 
sprintf(page, "%d\n", conf->max_nr_stripes); 4525 else 4526 return 0; 4527} 4528 4529int 4530raid5_set_cache_size(struct mddev *mddev, int size) 4531{ 4532 struct r5conf *conf = mddev->private; 4533 int err; 4534 4535 if (size <= 16 || size > 32768) 4536 return -EINVAL; 4537 while (size < conf->max_nr_stripes) { 4538 if (drop_one_stripe(conf)) 4539 conf->max_nr_stripes--; 4540 else 4541 break; 4542 } 4543 err = md_allow_write(mddev); 4544 if (err) 4545 return err; 4546 while (size > conf->max_nr_stripes) { 4547 if (grow_one_stripe(conf)) 4548 conf->max_nr_stripes++; 4549 else break; 4550 } 4551 return 0; 4552} 4553EXPORT_SYMBOL(raid5_set_cache_size); 4554 4555static ssize_t 4556raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4557{ 4558 struct r5conf *conf = mddev->private; 4559 unsigned long new; 4560 int err; 4561 4562 if (len >= PAGE_SIZE) 4563 return -EINVAL; 4564 if (!conf) 4565 return -ENODEV; 4566 4567 if (strict_strtoul(page, 10, &new)) 4568 return -EINVAL; 4569 err = raid5_set_cache_size(mddev, new); 4570 if (err) 4571 return err; 4572 return len; 4573} 4574 4575static struct md_sysfs_entry 4576raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4577 raid5_show_stripe_cache_size, 4578 raid5_store_stripe_cache_size); 4579 4580static ssize_t 4581raid5_show_preread_threshold(struct mddev *mddev, char *page) 4582{ 4583 struct r5conf *conf = mddev->private; 4584 if (conf) 4585 return sprintf(page, "%d\n", conf->bypass_threshold); 4586 else 4587 return 0; 4588} 4589 4590static ssize_t 4591raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4592{ 4593 struct r5conf *conf = mddev->private; 4594 unsigned long new; 4595 if (len >= PAGE_SIZE) 4596 return -EINVAL; 4597 if (!conf) 4598 return -ENODEV; 4599 4600 if (strict_strtoul(page, 10, &new)) 4601 return -EINVAL; 4602 if (new > conf->max_nr_stripes) 4603 return -EINVAL; 4604 conf->bypass_threshold = new; 4605 return len; 4606} 4607 4608static struct md_sysfs_entry 4609raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4610 S_IRUGO | S_IWUSR, 4611 raid5_show_preread_threshold, 4612 raid5_store_preread_threshold); 4613 4614static ssize_t 4615stripe_cache_active_show(struct mddev *mddev, char *page) 4616{ 4617 struct r5conf *conf = mddev->private; 4618 if (conf) 4619 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4620 else 4621 return 0; 4622} 4623 4624static struct md_sysfs_entry 4625raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4626 4627static struct attribute *raid5_attrs[] = { 4628 &raid5_stripecache_size.attr, 4629 &raid5_stripecache_active.attr, 4630 &raid5_preread_bypass_threshold.attr, 4631 NULL, 4632}; 4633static struct attribute_group raid5_attrs_group = { 4634 .name = NULL, 4635 .attrs = raid5_attrs, 4636}; 4637 4638static sector_t 4639raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4640{ 4641 struct r5conf *conf = mddev->private; 4642 4643 if (!sectors) 4644 sectors = mddev->dev_sectors; 4645 if (!raid_disks) 4646 /* size is defined by the smallest of previous and new size */ 4647 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4648 4649 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4650 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4651 return sectors * (raid_disks - conf->max_degraded); 4652} 4653 4654static void raid5_free_percpu(struct r5conf *conf) 4655{ 4656 struct raid5_percpu *percpu; 4657 unsigned long cpu; 4658 4659 if (!conf->percpu) 4660 return; 4661 
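 /* Take the CPU hotplug lock while freeing each CPU's spare page and scribble buffer, then drop our hotplug notifier. */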
4662 get_online_cpus(); 4663 for_each_possible_cpu(cpu) { 4664 percpu = per_cpu_ptr(conf->percpu, cpu); 4665 safe_put_page(percpu->spare_page); 4666 kfree(percpu->scribble); 4667 } 4668#ifdef CONFIG_HOTPLUG_CPU 4669 unregister_cpu_notifier(&conf->cpu_notify); 4670#endif 4671 put_online_cpus(); 4672 4673 free_percpu(conf->percpu); 4674} 4675 4676static void free_conf(struct r5conf *conf) 4677{ 4678 shrink_stripes(conf); 4679 raid5_free_percpu(conf); 4680 kfree(conf->disks); 4681 kfree(conf->stripe_hashtbl); 4682 kfree(conf); 4683} 4684 4685#ifdef CONFIG_HOTPLUG_CPU 4686static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4687 void *hcpu) 4688{ 4689 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4690 long cpu = (long)hcpu; 4691 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4692 4693 switch (action) { 4694 case CPU_UP_PREPARE: 4695 case CPU_UP_PREPARE_FROZEN: 4696 if (conf->level == 6 && !percpu->spare_page) 4697 percpu->spare_page = alloc_page(GFP_KERNEL); 4698 if (!percpu->scribble) 4699 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4700 4701 if (!percpu->scribble || 4702 (conf->level == 6 && !percpu->spare_page)) { 4703 safe_put_page(percpu->spare_page); 4704 kfree(percpu->scribble); 4705 pr_err("%s: failed memory allocation for cpu%ld\n", 4706 __func__, cpu); 4707 return notifier_from_errno(-ENOMEM); 4708 } 4709 break; 4710 case CPU_DEAD: 4711 case CPU_DEAD_FROZEN: 4712 safe_put_page(percpu->spare_page); 4713 kfree(percpu->scribble); 4714 percpu->spare_page = NULL; 4715 percpu->scribble = NULL; 4716 break; 4717 default: 4718 break; 4719 } 4720 return NOTIFY_OK; 4721} 4722#endif 4723 4724static int raid5_alloc_percpu(struct r5conf *conf) 4725{ 4726 unsigned long cpu; 4727 struct page *spare_page; 4728 struct raid5_percpu __percpu *allcpus; 4729 void *scribble; 4730 int err; 4731 4732 allcpus = alloc_percpu(struct raid5_percpu); 4733 if (!allcpus) 4734 return -ENOMEM; 4735 conf->percpu = allcpus; 4736 4737 get_online_cpus(); 4738 err = 0; 4739 for_each_present_cpu(cpu) { 4740 if (conf->level == 6) { 4741 spare_page = alloc_page(GFP_KERNEL); 4742 if (!spare_page) { 4743 err = -ENOMEM; 4744 break; 4745 } 4746 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4747 } 4748 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4749 if (!scribble) { 4750 err = -ENOMEM; 4751 break; 4752 } 4753 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4754 } 4755#ifdef CONFIG_HOTPLUG_CPU 4756 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4757 conf->cpu_notify.priority = 0; 4758 if (err == 0) 4759 err = register_cpu_notifier(&conf->cpu_notify); 4760#endif 4761 put_online_cpus(); 4762 4763 return err; 4764} 4765 4766static struct r5conf *setup_conf(struct mddev *mddev) 4767{ 4768 struct r5conf *conf; 4769 int raid_disk, memory, max_disks; 4770 struct md_rdev *rdev; 4771 struct disk_info *disk; 4772 4773 if (mddev->new_level != 5 4774 && mddev->new_level != 4 4775 && mddev->new_level != 6) { 4776 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4777 mdname(mddev), mddev->new_level); 4778 return ERR_PTR(-EIO); 4779 } 4780 if ((mddev->new_level == 5 4781 && !algorithm_valid_raid5(mddev->new_layout)) || 4782 (mddev->new_level == 6 4783 && !algorithm_valid_raid6(mddev->new_layout))) { 4784 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4785 mdname(mddev), mddev->new_layout); 4786 return ERR_PTR(-EIO); 4787 } 4788 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4789 printk(KERN_ERR 
"md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4790 mdname(mddev), mddev->raid_disks); 4791 return ERR_PTR(-EINVAL); 4792 } 4793 4794 if (!mddev->new_chunk_sectors || 4795 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4796 !is_power_of_2(mddev->new_chunk_sectors)) { 4797 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4798 mdname(mddev), mddev->new_chunk_sectors << 9); 4799 return ERR_PTR(-EINVAL); 4800 } 4801 4802 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4803 if (conf == NULL) 4804 goto abort; 4805 spin_lock_init(&conf->device_lock); 4806 init_waitqueue_head(&conf->wait_for_stripe); 4807 init_waitqueue_head(&conf->wait_for_overlap); 4808 INIT_LIST_HEAD(&conf->handle_list); 4809 INIT_LIST_HEAD(&conf->hold_list); 4810 INIT_LIST_HEAD(&conf->delayed_list); 4811 INIT_LIST_HEAD(&conf->bitmap_list); 4812 INIT_LIST_HEAD(&conf->inactive_list); 4813 atomic_set(&conf->active_stripes, 0); 4814 atomic_set(&conf->preread_active_stripes, 0); 4815 atomic_set(&conf->active_aligned_reads, 0); 4816 conf->bypass_threshold = BYPASS_THRESHOLD; 4817 conf->recovery_disabled = mddev->recovery_disabled - 1; 4818 4819 conf->raid_disks = mddev->raid_disks; 4820 if (mddev->reshape_position == MaxSector) 4821 conf->previous_raid_disks = mddev->raid_disks; 4822 else 4823 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4824 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4825 conf->scribble_len = scribble_len(max_disks); 4826 4827 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4828 GFP_KERNEL); 4829 if (!conf->disks) 4830 goto abort; 4831 4832 conf->mddev = mddev; 4833 4834 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4835 goto abort; 4836 4837 conf->level = mddev->new_level; 4838 if (raid5_alloc_percpu(conf) != 0) 4839 goto abort; 4840 4841 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4842 4843 list_for_each_entry(rdev, &mddev->disks, same_set) { 4844 raid_disk = rdev->raid_disk; 4845 if (raid_disk >= max_disks 4846 || raid_disk < 0) 4847 continue; 4848 disk = conf->disks + raid_disk; 4849 4850 disk->rdev = rdev; 4851 4852 if (test_bit(In_sync, &rdev->flags)) { 4853 char b[BDEVNAME_SIZE]; 4854 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4855 " disk %d\n", 4856 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4857 } else if (rdev->saved_raid_disk != raid_disk) 4858 /* Cannot rely on bitmap to complete recovery */ 4859 conf->fullsync = 1; 4860 } 4861 4862 conf->chunk_sectors = mddev->new_chunk_sectors; 4863 conf->level = mddev->new_level; 4864 if (conf->level == 6) 4865 conf->max_degraded = 2; 4866 else 4867 conf->max_degraded = 1; 4868 conf->algorithm = mddev->new_layout; 4869 conf->max_nr_stripes = NR_STRIPES; 4870 conf->reshape_progress = mddev->reshape_position; 4871 if (conf->reshape_progress != MaxSector) { 4872 conf->prev_chunk_sectors = mddev->chunk_sectors; 4873 conf->prev_algo = mddev->layout; 4874 } 4875 4876 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4877 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4878 if (grow_stripes(conf, conf->max_nr_stripes)) { 4879 printk(KERN_ERR 4880 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4881 mdname(mddev), memory); 4882 goto abort; 4883 } else 4884 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4885 mdname(mddev), memory); 4886 4887 conf->thread = md_register_thread(raid5d, mddev, NULL); 4888 if (!conf->thread) { 4889 printk(KERN_ERR 4890 "md/raid:%s: couldn't allocate thread.\n", 4891 mdname(mddev)); 
4892 goto abort; 4893 } 4894 4895 return conf; 4896 4897 abort: 4898 if (conf) { 4899 free_conf(conf); 4900 return ERR_PTR(-EIO); 4901 } else 4902 return ERR_PTR(-ENOMEM); 4903} 4904 4905 4906static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4907{ 4908 switch (algo) { 4909 case ALGORITHM_PARITY_0: 4910 if (raid_disk < max_degraded) 4911 return 1; 4912 break; 4913 case ALGORITHM_PARITY_N: 4914 if (raid_disk >= raid_disks - max_degraded) 4915 return 1; 4916 break; 4917 case ALGORITHM_PARITY_0_6: 4918 if (raid_disk == 0 || 4919 raid_disk == raid_disks - 1) 4920 return 1; 4921 break; 4922 case ALGORITHM_LEFT_ASYMMETRIC_6: 4923 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4924 case ALGORITHM_LEFT_SYMMETRIC_6: 4925 case ALGORITHM_RIGHT_SYMMETRIC_6: 4926 if (raid_disk == raid_disks - 1) 4927 return 1; 4928 } 4929 return 0; 4930} 4931 4932static int run(struct mddev *mddev) 4933{ 4934 struct r5conf *conf; 4935 int working_disks = 0; 4936 int dirty_parity_disks = 0; 4937 struct md_rdev *rdev; 4938 sector_t reshape_offset = 0; 4939 4940 if (mddev->recovery_cp != MaxSector) 4941 printk(KERN_NOTICE "md/raid:%s: not clean" 4942 " -- starting background reconstruction\n", 4943 mdname(mddev)); 4944 if (mddev->reshape_position != MaxSector) { 4945 /* Check that we can continue the reshape. 4946 * Currently only the number of disks can change; it must 4947 * increase, and we must be past the point where 4948 * a stripe over-writes itself 4949 */ 4950 sector_t here_new, here_old; 4951 int old_disks; 4952 int max_degraded = (mddev->level == 6 ? 2 : 1); 4953 4954 if (mddev->new_level != mddev->level) { 4955 printk(KERN_ERR "md/raid:%s: unsupported reshape " 4956 "required - aborting.\n", 4957 mdname(mddev)); 4958 return -EINVAL; 4959 } 4960 old_disks = mddev->raid_disks - mddev->delta_disks; 4961 /* reshape_position must be on a new-stripe boundary, and one 4962 * further up in new geometry must map after here in old 4963 * geometry. 4964 */ 4965 here_new = mddev->reshape_position; 4966 if (sector_div(here_new, mddev->new_chunk_sectors * 4967 (mddev->raid_disks - max_degraded))) { 4968 printk(KERN_ERR "md/raid:%s: reshape_position not " 4969 "on a stripe boundary\n", mdname(mddev)); 4970 return -EINVAL; 4971 } 4972 reshape_offset = here_new * mddev->new_chunk_sectors; 4973 /* here_new is the stripe we will write to */ 4974 here_old = mddev->reshape_position; 4975 sector_div(here_old, mddev->chunk_sectors * 4976 (old_disks-max_degraded)); 4977 /* here_old is the first stripe that we might need to read 4978 * from */ 4979 if (mddev->delta_disks == 0) { 4980 /* We cannot be sure it is safe to start an in-place 4981 * reshape. It is only safe if user-space is monitoring 4982 * and taking constant backups. 4983 * mdadm always starts a situation like this in 4984 * readonly mode so it can take control before 4985 * allowing any writes. So just check for that. 4986 */ 4987 if ((here_new * mddev->new_chunk_sectors != 4988 here_old * mddev->chunk_sectors) || 4989 mddev->ro == 0) { 4990 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 4991 " in read-only mode - aborting\n", 4992 mdname(mddev)); 4993 return -EINVAL; 4994 } 4995 } else if (mddev->delta_disks < 0 4996 ?
(here_new * mddev->new_chunk_sectors <= 4997 here_old * mddev->chunk_sectors) 4998 : (here_new * mddev->new_chunk_sectors >= 4999 here_old * mddev->chunk_sectors)) { 5000 /* Reading from the same stripe as writing to - bad */ 5001 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5002 "auto-recovery - aborting.\n", 5003 mdname(mddev)); 5004 return -EINVAL; 5005 } 5006 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5007 mdname(mddev)); 5008 /* OK, we should be able to continue; */ 5009 } else { 5010 BUG_ON(mddev->level != mddev->new_level); 5011 BUG_ON(mddev->layout != mddev->new_layout); 5012 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5013 BUG_ON(mddev->delta_disks != 0); 5014 } 5015 5016 if (mddev->private == NULL) 5017 conf = setup_conf(mddev); 5018 else 5019 conf = mddev->private; 5020 5021 if (IS_ERR(conf)) 5022 return PTR_ERR(conf); 5023 5024 mddev->thread = conf->thread; 5025 conf->thread = NULL; 5026 mddev->private = conf; 5027 5028 /* 5029 * 0 for a fully functional array, 1 or 2 for a degraded array. 5030 */ 5031 list_for_each_entry(rdev, &mddev->disks, same_set) { 5032 if (rdev->raid_disk < 0) 5033 continue; 5034 if (test_bit(In_sync, &rdev->flags)) { 5035 working_disks++; 5036 continue; 5037 } 5038 /* This disc is not fully in-sync. However if it 5039 * just stored parity (beyond the recovery_offset), 5040 * when we don't need to be concerned about the 5041 * array being dirty. 5042 * When reshape goes 'backwards', we never have 5043 * partially completed devices, so we only need 5044 * to worry about reshape going forwards. 5045 */ 5046 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5047 if (mddev->major_version == 0 && 5048 mddev->minor_version > 90) 5049 rdev->recovery_offset = reshape_offset; 5050 5051 if (rdev->recovery_offset < reshape_offset) { 5052 /* We need to check old and new layout */ 5053 if (!only_parity(rdev->raid_disk, 5054 conf->algorithm, 5055 conf->raid_disks, 5056 conf->max_degraded)) 5057 continue; 5058 } 5059 if (!only_parity(rdev->raid_disk, 5060 conf->prev_algo, 5061 conf->previous_raid_disks, 5062 conf->max_degraded)) 5063 continue; 5064 dirty_parity_disks++; 5065 } 5066 5067 mddev->degraded = calc_degraded(conf); 5068 5069 if (has_failed(conf)) { 5070 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5071 " (%d/%d failed)\n", 5072 mdname(mddev), mddev->degraded, conf->raid_disks); 5073 goto abort; 5074 } 5075 5076 /* device size must be a multiple of chunk size */ 5077 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5078 mddev->resync_max_sectors = mddev->dev_sectors; 5079 5080 if (mddev->degraded > dirty_parity_disks && 5081 mddev->recovery_cp != MaxSector) { 5082 if (mddev->ok_start_degraded) 5083 printk(KERN_WARNING 5084 "md/raid:%s: starting dirty degraded array" 5085 " - data corruption possible.\n", 5086 mdname(mddev)); 5087 else { 5088 printk(KERN_ERR 5089 "md/raid:%s: cannot start dirty degraded array.\n", 5090 mdname(mddev)); 5091 goto abort; 5092 } 5093 } 5094 5095 if (mddev->degraded == 0) 5096 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5097 " devices, algorithm %d\n", mdname(mddev), conf->level, 5098 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5099 mddev->new_layout); 5100 else 5101 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5102 " out of %d devices, algorithm %d\n", 5103 mdname(mddev), conf->level, 5104 mddev->raid_disks - mddev->degraded, 5105 mddev->raid_disks, mddev->new_layout); 5106 5107 print_raid5_conf(conf); 
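 /* If the array was stopped part-way through a reshape, restart the reshape from the checkpoint recorded in the metadata. */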
5108 5109 if (conf->reshape_progress != MaxSector) { 5110 conf->reshape_safe = conf->reshape_progress; 5111 atomic_set(&conf->reshape_stripes, 0); 5112 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5113 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5114 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5115 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5116 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5117 "reshape"); 5118 } 5119 5120 5121 /* Ok, everything is just fine now */ 5122 if (mddev->to_remove == &raid5_attrs_group) 5123 mddev->to_remove = NULL; 5124 else if (mddev->kobj.sd && 5125 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5126 printk(KERN_WARNING 5127 "raid5: failed to create sysfs attributes for %s\n", 5128 mdname(mddev)); 5129 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5130 5131 if (mddev->queue) { 5132 int chunk_size; 5133 /* read-ahead size must cover two whole stripes, which 5134 * is 2 * (datadisks) * chunksize where 'n' is the 5135 * number of raid devices 5136 */ 5137 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5138 int stripe = data_disks * 5139 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5140 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5141 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5142 5143 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5144 5145 mddev->queue->backing_dev_info.congested_data = mddev; 5146 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5147 5148 chunk_size = mddev->chunk_sectors << 9; 5149 blk_queue_io_min(mddev->queue, chunk_size); 5150 blk_queue_io_opt(mddev->queue, chunk_size * 5151 (conf->raid_disks - conf->max_degraded)); 5152 5153 list_for_each_entry(rdev, &mddev->disks, same_set) 5154 disk_stack_limits(mddev->gendisk, rdev->bdev, 5155 rdev->data_offset << 9); 5156 } 5157 5158 return 0; 5159abort: 5160 md_unregister_thread(&mddev->thread); 5161 print_raid5_conf(conf); 5162 free_conf(conf); 5163 mddev->private = NULL; 5164 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5165 return -EIO; 5166} 5167 5168static int stop(struct mddev *mddev) 5169{ 5170 struct r5conf *conf = mddev->private; 5171 5172 md_unregister_thread(&mddev->thread); 5173 if (mddev->queue) 5174 mddev->queue->backing_dev_info.congested_fn = NULL; 5175 free_conf(conf); 5176 mddev->private = NULL; 5177 mddev->to_remove = &raid5_attrs_group; 5178 return 0; 5179} 5180 5181static void status(struct seq_file *seq, struct mddev *mddev) 5182{ 5183 struct r5conf *conf = mddev->private; 5184 int i; 5185 5186 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5187 mddev->chunk_sectors / 2, mddev->layout); 5188 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5189 for (i = 0; i < conf->raid_disks; i++) 5190 seq_printf (seq, "%s", 5191 conf->disks[i].rdev && 5192 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5193 seq_printf (seq, "]"); 5194} 5195 5196static void print_raid5_conf (struct r5conf *conf) 5197{ 5198 int i; 5199 struct disk_info *tmp; 5200 5201 printk(KERN_DEBUG "RAID conf printout:\n"); 5202 if (!conf) { 5203 printk("(conf==NULL)\n"); 5204 return; 5205 } 5206 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5207 conf->raid_disks, 5208 conf->raid_disks - conf->mddev->degraded); 5209 5210 for (i = 0; i < conf->raid_disks; i++) { 5211 char b[BDEVNAME_SIZE]; 5212 tmp = conf->disks + i; 5213 if (tmp->rdev) 5214 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5215 i, !test_bit(Faulty, &tmp->rdev->flags), 5216 bdevname(tmp->rdev->bdev, b)); 5217 } 5218} 5219 5220static int raid5_spare_active(struct mddev *mddev) 5221{ 5222 int i; 5223 struct r5conf *conf = mddev->private; 5224 struct disk_info *tmp; 5225 int count = 0; 5226 unsigned long flags; 5227 5228 for (i = 0; i < conf->raid_disks; i++) { 5229 tmp = conf->disks + i; 5230 if (tmp->replacement 5231 && tmp->replacement->recovery_offset == MaxSector 5232 && !test_bit(Faulty, &tmp->replacement->flags) 5233 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5234 /* Replacement has just become active. */ 5235 if (!tmp->rdev 5236 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5237 count++; 5238 if (tmp->rdev) { 5239 /* Replaced device not technically faulty, 5240 * but we need to be sure it gets removed 5241 * and never re-added. 5242 */ 5243 set_bit(Faulty, &tmp->rdev->flags); 5244 sysfs_notify_dirent_safe( 5245 tmp->rdev->sysfs_state); 5246 } 5247 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5248 } else if (tmp->rdev 5249 && tmp->rdev->recovery_offset == MaxSector 5250 && !test_bit(Faulty, &tmp->rdev->flags) 5251 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5252 count++; 5253 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5254 } 5255 } 5256 spin_lock_irqsave(&conf->device_lock, flags); 5257 mddev->degraded = calc_degraded(conf); 5258 spin_unlock_irqrestore(&conf->device_lock, flags); 5259 print_raid5_conf(conf); 5260 return count; 5261} 5262 5263static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5264{ 5265 struct r5conf *conf = mddev->private; 5266 int err = 0; 5267 int number = rdev->raid_disk; 5268 struct md_rdev **rdevp; 5269 struct disk_info *p = conf->disks + number; 5270 5271 print_raid5_conf(conf); 5272 if (rdev == p->rdev) 5273 rdevp = &p->rdev; 5274 else if (rdev == p->replacement) 5275 rdevp = &p->replacement; 5276 else 5277 return 0; 5278 5279 if (number >= conf->raid_disks && 5280 conf->reshape_progress == MaxSector) 5281 clear_bit(In_sync, &rdev->flags); 5282 5283 if (test_bit(In_sync, &rdev->flags) || 5284 atomic_read(&rdev->nr_pending)) { 5285 err = -EBUSY; 5286 goto abort; 5287 } 5288 /* Only remove non-faulty devices if recovery 5289 * isn't possible. 
5290 */ 5291 if (!test_bit(Faulty, &rdev->flags) && 5292 mddev->recovery_disabled != conf->recovery_disabled && 5293 !has_failed(conf) && 5294 (!p->replacement || p->replacement == rdev) && 5295 number < conf->raid_disks) { 5296 err = -EBUSY; 5297 goto abort; 5298 } 5299 *rdevp = NULL; 5300 synchronize_rcu(); 5301 if (atomic_read(&rdev->nr_pending)) { 5302 /* lost the race, try later */ 5303 err = -EBUSY; 5304 *rdevp = rdev; 5305 } else if (p->replacement) { 5306 /* We must have just cleared 'rdev' */ 5307 p->rdev = p->replacement; 5308 clear_bit(Replacement, &p->replacement->flags); 5309 smp_mb(); /* Make sure other CPUs may see both as identical 5310 * but will never see neither - if they are careful 5311 */ 5312 p->replacement = NULL; 5313 clear_bit(WantReplacement, &rdev->flags); 5314 } else 5315 /* We might have just removed the Replacement as faulty- 5316 * clear the bit just in case 5317 */ 5318 clear_bit(WantReplacement, &rdev->flags); 5319abort: 5320 5321 print_raid5_conf(conf); 5322 return err; 5323} 5324 5325static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5326{ 5327 struct r5conf *conf = mddev->private; 5328 int err = -EEXIST; 5329 int disk; 5330 struct disk_info *p; 5331 int first = 0; 5332 int last = conf->raid_disks - 1; 5333 5334 if (mddev->recovery_disabled == conf->recovery_disabled) 5335 return -EBUSY; 5336 5337 if (has_failed(conf)) 5338 /* no point adding a device */ 5339 return -EINVAL; 5340 5341 if (rdev->raid_disk >= 0) 5342 first = last = rdev->raid_disk; 5343 5344 /* 5345 * find the disk ... but prefer rdev->saved_raid_disk 5346 * if possible. 5347 */ 5348 if (rdev->saved_raid_disk >= 0 && 5349 rdev->saved_raid_disk >= first && 5350 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5351 disk = rdev->saved_raid_disk; 5352 else 5353 disk = first; 5354 for ( ; disk <= last ; disk++) 5355 if ((p=conf->disks + disk)->rdev == NULL) { 5356 clear_bit(In_sync, &rdev->flags); 5357 rdev->raid_disk = disk; 5358 err = 0; 5359 if (rdev->saved_raid_disk != disk) 5360 conf->fullsync = 1; 5361 rcu_assign_pointer(p->rdev, rdev); 5362 break; 5363 } 5364 print_raid5_conf(conf); 5365 return err; 5366} 5367 5368static int raid5_resize(struct mddev *mddev, sector_t sectors) 5369{ 5370 /* no resync is happening, and there is enough space 5371 * on all devices, so we can resize. 5372 * We need to make sure resync covers any new space. 5373 * If the array is shrinking we should possibly wait until 5374 * any io in the removed space completes, but it hardly seems 5375 * worth it. 5376 */ 5377 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5378 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5379 mddev->raid_disks)); 5380 if (mddev->array_sectors > 5381 raid5_size(mddev, sectors, mddev->raid_disks)) 5382 return -EINVAL; 5383 set_capacity(mddev->gendisk, mddev->array_sectors); 5384 revalidate_disk(mddev->gendisk); 5385 if (sectors > mddev->dev_sectors && 5386 mddev->recovery_cp > mddev->dev_sectors) { 5387 mddev->recovery_cp = mddev->dev_sectors; 5388 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5389 } 5390 mddev->dev_sectors = sectors; 5391 mddev->resync_max_sectors = sectors; 5392 return 0; 5393} 5394 5395static int check_stripe_cache(struct mddev *mddev) 5396{ 5397 /* Can only proceed if there are plenty of stripe_heads. 5398 * We need a minimum of one full stripe,, and for sensible progress 5399 * it is best to have about 4 times that. 
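 * As an illustrative calculation: a 1MiB chunk spans 256 STRIPE_SIZE pages, so the test below would demand at least 256 * 4 = 1024 stripe_heads before allowing a reshape.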
5400 * If we require 4 times, then the default 256 4K stripe_heads will 5401 * allow for chunk sizes up to 256K, which is probably OK. 5402 * If the chunk size is greater, user-space should request more 5403 * stripe_heads first. 5404 */ 5405 struct r5conf *conf = mddev->private; 5406 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5407 > conf->max_nr_stripes || 5408 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5409 > conf->max_nr_stripes) { 5410 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5411 mdname(mddev), 5412 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5413 / STRIPE_SIZE)*4); 5414 return 0; 5415 } 5416 return 1; 5417} 5418 5419static int check_reshape(struct mddev *mddev) 5420{ 5421 struct r5conf *conf = mddev->private; 5422 5423 if (mddev->delta_disks == 0 && 5424 mddev->new_layout == mddev->layout && 5425 mddev->new_chunk_sectors == mddev->chunk_sectors) 5426 return 0; /* nothing to do */ 5427 if (mddev->bitmap) 5428 /* Cannot grow a bitmap yet */ 5429 return -EBUSY; 5430 if (has_failed(conf)) 5431 return -EINVAL; 5432 if (mddev->delta_disks < 0) { 5433 /* We might be able to shrink, but the devices must 5434 * be made bigger first. 5435 * For raid6, 4 is the minimum size. 5436 * Otherwise 2 is the minimum 5437 */ 5438 int min = 2; 5439 if (mddev->level == 6) 5440 min = 4; 5441 if (mddev->raid_disks + mddev->delta_disks < min) 5442 return -EINVAL; 5443 } 5444 5445 if (!check_stripe_cache(mddev)) 5446 return -ENOSPC; 5447 5448 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5449} 5450 5451static int raid5_start_reshape(struct mddev *mddev) 5452{ 5453 struct r5conf *conf = mddev->private; 5454 struct md_rdev *rdev; 5455 int spares = 0; 5456 unsigned long flags; 5457 5458 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5459 return -EBUSY; 5460 5461 if (!check_stripe_cache(mddev)) 5462 return -ENOSPC; 5463 5464 list_for_each_entry(rdev, &mddev->disks, same_set) 5465 if (!test_bit(In_sync, &rdev->flags) 5466 && !test_bit(Faulty, &rdev->flags)) 5467 spares++; 5468 5469 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5470 /* Not enough devices even to make a degraded array 5471 * of that size 5472 */ 5473 return -EINVAL; 5474 5475 /* Refuse to reduce size of the array. Any reductions in 5476 * array size must be through explicit setting of array_size 5477 * attribute. 5478 */ 5479 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5480 < mddev->array_sectors) { 5481 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5482 "before number of disks\n", mdname(mddev)); 5483 return -EINVAL; 5484 } 5485 5486 atomic_set(&conf->reshape_stripes, 0); 5487 spin_lock_irq(&conf->device_lock); 5488 conf->previous_raid_disks = conf->raid_disks; 5489 conf->raid_disks += mddev->delta_disks; 5490 conf->prev_chunk_sectors = conf->chunk_sectors; 5491 conf->chunk_sectors = mddev->new_chunk_sectors; 5492 conf->prev_algo = conf->algorithm; 5493 conf->algorithm = mddev->new_layout; 5494 if (mddev->delta_disks < 0) 5495 conf->reshape_progress = raid5_size(mddev, 0, 0); 5496 else 5497 conf->reshape_progress = 0; 5498 conf->reshape_safe = conf->reshape_progress; 5499 conf->generation++; 5500 spin_unlock_irq(&conf->device_lock); 5501 5502 /* Add some new drives, as many as will fit. 5503 * We know there are enough to make the newly sized array work. 5504 * Don't add devices if we are reducing the number of 5505 * devices in the array. 
This is because it is not possible 5506 * to correctly record the "partially reconstructed" state of 5507 * such devices during the reshape and confusion could result. 5508 */ 5509 if (mddev->delta_disks >= 0) { 5510 int added_devices = 0; 5511 list_for_each_entry(rdev, &mddev->disks, same_set) 5512 if (rdev->raid_disk < 0 && 5513 !test_bit(Faulty, &rdev->flags)) { 5514 if (raid5_add_disk(mddev, rdev) == 0) { 5515 if (rdev->raid_disk 5516 >= conf->previous_raid_disks) { 5517 set_bit(In_sync, &rdev->flags); 5518 added_devices++; 5519 } else 5520 rdev->recovery_offset = 0; 5521 5522 if (sysfs_link_rdev(mddev, rdev)) 5523 /* Failure here is OK */; 5524 } 5525 } else if (rdev->raid_disk >= conf->previous_raid_disks 5526 && !test_bit(Faulty, &rdev->flags)) { 5527 /* This is a spare that was manually added */ 5528 set_bit(In_sync, &rdev->flags); 5529 added_devices++; 5530 } 5531 5532 /* When a reshape changes the number of devices, 5533 * ->degraded is measured against the larger of the 5534 * pre and post number of devices. 5535 */ 5536 spin_lock_irqsave(&conf->device_lock, flags); 5537 mddev->degraded = calc_degraded(conf); 5538 spin_unlock_irqrestore(&conf->device_lock, flags); 5539 } 5540 mddev->raid_disks = conf->raid_disks; 5541 mddev->reshape_position = conf->reshape_progress; 5542 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5543 5544 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5545 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5546 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5547 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5548 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5549 "reshape"); 5550 if (!mddev->sync_thread) { 5551 mddev->recovery = 0; 5552 spin_lock_irq(&conf->device_lock); 5553 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5554 conf->reshape_progress = MaxSector; 5555 spin_unlock_irq(&conf->device_lock); 5556 return -EAGAIN; 5557 } 5558 conf->reshape_checkpoint = jiffies; 5559 md_wakeup_thread(mddev->sync_thread); 5560 md_new_event(mddev); 5561 return 0; 5562} 5563 5564/* This is called from the reshape thread and should make any 5565 * changes needed in 'conf' 5566 */ 5567static void end_reshape(struct r5conf *conf) 5568{ 5569 5570 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5571 5572 spin_lock_irq(&conf->device_lock); 5573 conf->previous_raid_disks = conf->raid_disks; 5574 conf->reshape_progress = MaxSector; 5575 spin_unlock_irq(&conf->device_lock); 5576 wake_up(&conf->wait_for_overlap); 5577 5578 /* read-ahead size must cover two whole stripes, which is 5579 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5580 */ 5581 if (conf->mddev->queue) { 5582 int data_disks = conf->raid_disks - conf->max_degraded; 5583 int stripe = data_disks * ((conf->chunk_sectors << 9) 5584 / PAGE_SIZE); 5585 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5586 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5587 } 5588 } 5589} 5590 5591/* This is called from the raid5d thread with mddev_lock held. 5592 * It makes config changes to the device. 
5593 */ 5594static void raid5_finish_reshape(struct mddev *mddev) 5595{ 5596 struct r5conf *conf = mddev->private; 5597 5598 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5599 5600 if (mddev->delta_disks > 0) { 5601 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5602 set_capacity(mddev->gendisk, mddev->array_sectors); 5603 revalidate_disk(mddev->gendisk); 5604 } else { 5605 int d; 5606 spin_lock_irq(&conf->device_lock); 5607 mddev->degraded = calc_degraded(conf); 5608 spin_unlock_irq(&conf->device_lock); 5609 for (d = conf->raid_disks ; 5610 d < conf->raid_disks - mddev->delta_disks; 5611 d++) { 5612 struct md_rdev *rdev = conf->disks[d].rdev; 5613 if (rdev && 5614 raid5_remove_disk(mddev, rdev) == 0) { 5615 sysfs_unlink_rdev(mddev, rdev); 5616 rdev->raid_disk = -1; 5617 } 5618 } 5619 } 5620 mddev->layout = conf->algorithm; 5621 mddev->chunk_sectors = conf->chunk_sectors; 5622 mddev->reshape_position = MaxSector; 5623 mddev->delta_disks = 0; 5624 } 5625} 5626 5627static void raid5_quiesce(struct mddev *mddev, int state) 5628{ 5629 struct r5conf *conf = mddev->private; 5630 5631 switch(state) { 5632 case 2: /* resume for a suspend */ 5633 wake_up(&conf->wait_for_overlap); 5634 break; 5635 5636 case 1: /* stop all writes */ 5637 spin_lock_irq(&conf->device_lock); 5638 /* '2' tells resync/reshape to pause so that all 5639 * active stripes can drain 5640 */ 5641 conf->quiesce = 2; 5642 wait_event_lock_irq(conf->wait_for_stripe, 5643 atomic_read(&conf->active_stripes) == 0 && 5644 atomic_read(&conf->active_aligned_reads) == 0, 5645 conf->device_lock, /* nothing */); 5646 conf->quiesce = 1; 5647 spin_unlock_irq(&conf->device_lock); 5648 /* allow reshape to continue */ 5649 wake_up(&conf->wait_for_overlap); 5650 break; 5651 5652 case 0: /* re-enable writes */ 5653 spin_lock_irq(&conf->device_lock); 5654 conf->quiesce = 0; 5655 wake_up(&conf->wait_for_stripe); 5656 wake_up(&conf->wait_for_overlap); 5657 spin_unlock_irq(&conf->device_lock); 5658 break; 5659 } 5660} 5661 5662 5663static void *raid45_takeover_raid0(struct mddev *mddev, int level) 5664{ 5665 struct r0conf *raid0_conf = mddev->private; 5666 sector_t sectors; 5667 5668 /* for raid0 takeover only one zone is supported */ 5669 if (raid0_conf->nr_strip_zones > 1) { 5670 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 5671 mdname(mddev)); 5672 return ERR_PTR(-EINVAL); 5673 } 5674 5675 sectors = raid0_conf->strip_zone[0].zone_end; 5676 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 5677 mddev->dev_sectors = sectors; 5678 mddev->new_level = level; 5679 mddev->new_layout = ALGORITHM_PARITY_N; 5680 mddev->new_chunk_sectors = mddev->chunk_sectors; 5681 mddev->raid_disks += 1; 5682 mddev->delta_disks = 1; 5683 /* make sure it will be not marked as dirty */ 5684 mddev->recovery_cp = MaxSector; 5685 5686 return setup_conf(mddev); 5687} 5688 5689 5690static void *raid5_takeover_raid1(struct mddev *mddev) 5691{ 5692 int chunksect; 5693 5694 if (mddev->raid_disks != 2 || 5695 mddev->degraded > 1) 5696 return ERR_PTR(-EINVAL); 5697 5698 /* Should check if there are write-behind devices? 
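 *
 * For example, with mddev->array_sectors == 1000 the loop below tries
 * chunk sizes of 128, 64, 32 and 16 sectors, none of which divides
 * 1000 evenly, and settles on chunksect == 8 (a 4K chunk), which still
 * passes the STRIPE_SIZE check on machines with 4K pages.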
*/ 5699 5700 chunksect = 64*2; /* 64K by default */ 5701 5702 /* The array must be an exact multiple of chunksize */ 5703 while (chunksect && (mddev->array_sectors & (chunksect-1))) 5704 chunksect >>= 1; 5705 5706 if ((chunksect<<9) < STRIPE_SIZE) 5707 /* array size does not allow a suitable chunk size */ 5708 return ERR_PTR(-EINVAL); 5709 5710 mddev->new_level = 5; 5711 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 5712 mddev->new_chunk_sectors = chunksect; 5713 5714 return setup_conf(mddev); 5715} 5716 5717static void *raid5_takeover_raid6(struct mddev *mddev) 5718{ 5719 int new_layout; 5720 5721 switch (mddev->layout) { 5722 case ALGORITHM_LEFT_ASYMMETRIC_6: 5723 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 5724 break; 5725 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5726 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 5727 break; 5728 case ALGORITHM_LEFT_SYMMETRIC_6: 5729 new_layout = ALGORITHM_LEFT_SYMMETRIC; 5730 break; 5731 case ALGORITHM_RIGHT_SYMMETRIC_6: 5732 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 5733 break; 5734 case ALGORITHM_PARITY_0_6: 5735 new_layout = ALGORITHM_PARITY_0; 5736 break; 5737 case ALGORITHM_PARITY_N: 5738 new_layout = ALGORITHM_PARITY_N; 5739 break; 5740 default: 5741 return ERR_PTR(-EINVAL); 5742 } 5743 mddev->new_level = 5; 5744 mddev->new_layout = new_layout; 5745 mddev->delta_disks = -1; 5746 mddev->raid_disks -= 1; 5747 return setup_conf(mddev); 5748} 5749 5750 5751static int raid5_check_reshape(struct mddev *mddev) 5752{ 5753 /* For a 2-drive array, the layout and chunk size can be changed 5754 * immediately as no restriping is needed. 5755 * For larger arrays we record the new value - after validation 5756 * to be used by a reshape pass. 5757 */ 5758 struct r5conf *conf = mddev->private; 5759 int new_chunk = mddev->new_chunk_sectors; 5760 5761 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 5762 return -EINVAL; 5763 if (new_chunk > 0) { 5764 if (!is_power_of_2(new_chunk)) 5765 return -EINVAL; 5766 if (new_chunk < (PAGE_SIZE>>9)) 5767 return -EINVAL; 5768 if (mddev->array_sectors & (new_chunk-1)) 5769 /* not a factor of array size */ 5770 return -EINVAL; 5771 } 5772 5773 /* They look valid */ 5774 5775 if (mddev->raid_disks == 2) { 5776 /* can make the change immediately */ 5777 if (mddev->new_layout >= 0) { 5778 conf->algorithm = mddev->new_layout; 5779 mddev->layout = mddev->new_layout; 5780 } 5781 if (new_chunk > 0) { 5782 conf->chunk_sectors = new_chunk; 5783 mddev->chunk_sectors = new_chunk; 5784 } 5785 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5786 md_wakeup_thread(mddev->thread); 5787 } 5788 return check_reshape(mddev); 5789} 5790 5791static int raid6_check_reshape(struct mddev *mddev) 5792{ 5793 int new_chunk = mddev->new_chunk_sectors; 5794 5795 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 5796 return -EINVAL; 5797 if (new_chunk > 0) { 5798 if (!is_power_of_2(new_chunk)) 5799 return -EINVAL; 5800 if (new_chunk < (PAGE_SIZE >> 9)) 5801 return -EINVAL; 5802 if (mddev->array_sectors & (new_chunk-1)) 5803 /* not a factor of array size */ 5804 return -EINVAL; 5805 } 5806 5807 /* They look valid */ 5808 return check_reshape(mddev); 5809} 5810 5811static void *raid5_takeover(struct mddev *mddev) 5812{ 5813 /* raid5 can take over: 5814 * raid0 - if there is only one strip zone - make it a raid4 layout 5815 * raid1 - if there are two drives. We need to know the chunk size 5816 * raid4 - trivial - just use a raid4 layout.
5817 * raid6 - Providing it is a *_6 layout 5818 */ 5819 if (mddev->level == 0) 5820 return raid45_takeover_raid0(mddev, 5); 5821 if (mddev->level == 1) 5822 return raid5_takeover_raid1(mddev); 5823 if (mddev->level == 4) { 5824 mddev->new_layout = ALGORITHM_PARITY_N; 5825 mddev->new_level = 5; 5826 return setup_conf(mddev); 5827 } 5828 if (mddev->level == 6) 5829 return raid5_takeover_raid6(mddev); 5830 5831 return ERR_PTR(-EINVAL); 5832} 5833 5834static void *raid4_takeover(struct mddev *mddev) 5835{ 5836 /* raid4 can take over: 5837 * raid0 - if there is only one strip zone 5838 * raid5 - if layout is right 5839 */ 5840 if (mddev->level == 0) 5841 return raid45_takeover_raid0(mddev, 4); 5842 if (mddev->level == 5 && 5843 mddev->layout == ALGORITHM_PARITY_N) { 5844 mddev->new_layout = 0; 5845 mddev->new_level = 4; 5846 return setup_conf(mddev); 5847 } 5848 return ERR_PTR(-EINVAL); 5849} 5850 5851static struct md_personality raid5_personality; 5852 5853static void *raid6_takeover(struct mddev *mddev) 5854{ 5855 /* Currently can only take over a raid5. We map the 5856 * personality to an equivalent raid6 personality 5857 * with the Q block at the end. 5858 */ 5859 int new_layout; 5860 5861 if (mddev->pers != &raid5_personality) 5862 return ERR_PTR(-EINVAL); 5863 if (mddev->degraded > 1) 5864 return ERR_PTR(-EINVAL); 5865 if (mddev->raid_disks > 253) 5866 return ERR_PTR(-EINVAL); 5867 if (mddev->raid_disks < 3) 5868 return ERR_PTR(-EINVAL); 5869 5870 switch (mddev->layout) { 5871 case ALGORITHM_LEFT_ASYMMETRIC: 5872 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 5873 break; 5874 case ALGORITHM_RIGHT_ASYMMETRIC: 5875 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 5876 break; 5877 case ALGORITHM_LEFT_SYMMETRIC: 5878 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 5879 break; 5880 case ALGORITHM_RIGHT_SYMMETRIC: 5881 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 5882 break; 5883 case ALGORITHM_PARITY_0: 5884 new_layout = ALGORITHM_PARITY_0_6; 5885 break; 5886 case ALGORITHM_PARITY_N: 5887 new_layout = ALGORITHM_PARITY_N; 5888 break; 5889 default: 5890 return ERR_PTR(-EINVAL); 5891 } 5892 mddev->new_level = 6; 5893 mddev->new_layout = new_layout; 5894 mddev->delta_disks = 1; 5895 mddev->raid_disks += 1; 5896 return setup_conf(mddev); 5897} 5898 5899 5900static struct md_personality raid6_personality = 5901{ 5902 .name = "raid6", 5903 .level = 6, 5904 .owner = THIS_MODULE, 5905 .make_request = make_request, 5906 .run = run, 5907 .stop = stop, 5908 .status = status, 5909 .error_handler = error, 5910 .hot_add_disk = raid5_add_disk, 5911 .hot_remove_disk= raid5_remove_disk, 5912 .spare_active = raid5_spare_active, 5913 .sync_request = sync_request, 5914 .resize = raid5_resize, 5915 .size = raid5_size, 5916 .check_reshape = raid6_check_reshape, 5917 .start_reshape = raid5_start_reshape, 5918 .finish_reshape = raid5_finish_reshape, 5919 .quiesce = raid5_quiesce, 5920 .takeover = raid6_takeover, 5921}; 5922static struct md_personality raid5_personality = 5923{ 5924 .name = "raid5", 5925 .level = 5, 5926 .owner = THIS_MODULE, 5927 .make_request = make_request, 5928 .run = run, 5929 .stop = stop, 5930 .status = status, 5931 .error_handler = error, 5932 .hot_add_disk = raid5_add_disk, 5933 .hot_remove_disk= raid5_remove_disk, 5934 .spare_active = raid5_spare_active, 5935 .sync_request = sync_request, 5936 .resize = raid5_resize, 5937 .size = raid5_size, 5938 .check_reshape = raid5_check_reshape, 5939 .start_reshape = raid5_start_reshape, 5940 .finish_reshape = raid5_finish_reshape, 5941 .quiesce = raid5_quiesce, 5942 
.takeover = raid5_takeover, 5943}; 5944 5945static struct md_personality raid4_personality = 5946{ 5947 .name = "raid4", 5948 .level = 4, 5949 .owner = THIS_MODULE, 5950 .make_request = make_request, 5951 .run = run, 5952 .stop = stop, 5953 .status = status, 5954 .error_handler = error, 5955 .hot_add_disk = raid5_add_disk, 5956 .hot_remove_disk= raid5_remove_disk, 5957 .spare_active = raid5_spare_active, 5958 .sync_request = sync_request, 5959 .resize = raid5_resize, 5960 .size = raid5_size, 5961 .check_reshape = raid5_check_reshape, 5962 .start_reshape = raid5_start_reshape, 5963 .finish_reshape = raid5_finish_reshape, 5964 .quiesce = raid5_quiesce, 5965 .takeover = raid4_takeover, 5966}; 5967 5968static int __init raid5_init(void) 5969{ 5970 register_md_personality(&raid6_personality); 5971 register_md_personality(&raid5_personality); 5972 register_md_personality(&raid4_personality); 5973 return 0; 5974} 5975 5976static void raid5_exit(void) 5977{ 5978 unregister_md_personality(&raid6_personality); 5979 unregister_md_personality(&raid5_personality); 5980 unregister_md_personality(&raid4_personality); 5981} 5982 5983module_init(raid5_init); 5984module_exit(raid5_exit); 5985MODULE_LICENSE("GPL"); 5986MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 5987MODULE_ALIAS("md-personality-4"); /* RAID5 */ 5988MODULE_ALIAS("md-raid5"); 5989MODULE_ALIAS("md-raid4"); 5990MODULE_ALIAS("md-level-5"); 5991MODULE_ALIAS("md-level-4"); 5992MODULE_ALIAS("md-personality-8"); /* RAID6 */ 5993MODULE_ALIAS("md-raid6"); 5994MODULE_ALIAS("md-level-6"); 5995 5996/* This used to be two separate modules, they were: */ 5997MODULE_ALIAS("raid5"); 5998MODULE_ALIAS("raid6"); 5999
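/*
 * Illustrative sketch, not part of the driver: a tiny user-space helper
 * that mirrors the check_stripe_cache() arithmetic above, assuming a 4K
 * STRIPE_SIZE (i.e. 4K pages).  It prints the minimum stripe_cache_size
 * an administrator would have to set (typically via
 * /sys/block/mdX/md/stripe_cache_size) before reshaping to a given chunk
 * size; the default cache of 256 stripe_heads only covers chunks up to 256K.
 */
#include <stdio.h>
#include <stdlib.h>

#define EXAMPLE_STRIPE_SIZE 4096	/* assumes PAGE_SIZE == 4K */

int main(int argc, char **argv)
{
	unsigned long chunk_bytes, needed;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <chunk-size-in-bytes>\n", argv[0]);
		return 1;
	}
	chunk_bytes = strtoul(argv[1], NULL, 0);
	/* same rule as check_stripe_cache(): 4 stripe_heads per page of chunk */
	needed = (chunk_bytes / EXAMPLE_STRIPE_SIZE) * 4;
	printf("stripe_cache_size must be at least %lu (default is 256)\n",
	       needed);
	return 0;
}
/* e.g. running it with 0x80000 (512K chunks) prints 512. */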